package pl.edu.icm.yadda.analysis.articlecontent;

import java.util.Iterator;
import java.util.List;
import pl.edu.icm.yadda.analysis.AnalysisException;
import pl.edu.icm.yadda.analysis.classification.features.FeatureVector;
import pl.edu.icm.yadda.analysis.classification.features.FeatureVectorBuilder;
import pl.edu.icm.yadda.analysis.classification.knn.classifier.KnnClassifier;
import pl.edu.icm.yadda.analysis.classification.knn.model.KnnModel;
import pl.edu.icm.yadda.analysis.classification.knn.model.KnnTrainingSample;
import pl.edu.icm.yadda.analysis.classification.metrics.FeatureVectorEuclideanMetric;
import pl.edu.icm.yadda.analysis.textr.model.BxDocument;
import pl.edu.icm.yadda.analysis.textr.model.BxPage;
import pl.edu.icm.yadda.analysis.textr.model.BxZone;
import pl.edu.icm.yadda.analysis.textr.model.BxZoneLabel;
import pl.edu.icm.yadda.analysis.textr.model.BxZoneLabelCategory;

/* loaded from: input_file:WEB-INF/lib/yadda-analysis-impl-1.11.6.jar:pl/edu/icm/yadda/analysis/articlecontent/ContentJunkFilter.class */
/**
 * Relabels zones of a {@link BxDocument} with a k-nearest-neighbours classifier and
 * builds the training model used for that classification.
 *
 * <p>Only zones whose current label satisfies
 * {@code isOfCategoryOrGeneral(BxZoneLabelCategory.CAT_BODY)} are considered, both when
 * classifying and when collecting training samples.
 */
public class ContentJunkFilter {

    /**
     * Labels that {@link #buildModel} turns into {@link BxZoneLabel#BODY_JUNK} training
     * samples. Every other considered label becomes {@link BxZoneLabel#BODY_CONTENT}.
     * The order matches the order in which the labels are compared.
     */
    private static final BxZoneLabel[] JUNK_TRAINING_LABELS = {
        BxZoneLabel.BODY_JUNK,
        BxZoneLabel.BODY_EQUATION,
        BxZoneLabel.BODY_EQUATION_LABEL,
        BxZoneLabel.BODY_FIGURE,
        BxZoneLabel.BODY_FIGURE_CAPTION,
        BxZoneLabel.BODY_TABLE,
        BxZoneLabel.BODY_TABLE_CAPTION
    };

    /** Passed as the last argument of every classify call (presumably the "k" of k-NN). */
    private int knnVoters = 3;

    /**
     * Replaces the label of every body-category zone with the label returned by the
     * classifier for that zone's feature vector. The document is modified in place.
     *
     * @param knnModel             model handed to the classifier
     * @param featureVectorBuilder builds the feature vector of a zone within its page
     * @param bxDocument           document whose zones are relabelled
     * @return the same {@code bxDocument} instance that was passed in
     * @throws AnalysisException declared by this method; propagated from the calls made here
     */
    public BxDocument filterJunk(KnnModel<BxZoneLabel> knnModel, FeatureVectorBuilder<BxZone, BxPage> featureVectorBuilder, BxDocument bxDocument) throws AnalysisException {
        KnnClassifier classifier = new KnnClassifier();
        for (BxPage page : bxDocument.getPages()) {
            for (BxZone zone : page.getZones()) {
                if (!zone.getLabel().isOfCategoryOrGeneral(BxZoneLabelCategory.CAT_BODY)) {
                    continue;
                }
                // A fresh metric is created for each classified zone, before its features are built.
                FeatureVectorEuclideanMetric metric = new FeatureVectorEuclideanMetric();
                FeatureVector features = featureVectorBuilder.getFeatureVector(zone, page);
                Object predicted = classifier.classify(knnModel, metric, features, this.knnVoters);
                zone.setLabel((BxZoneLabel) predicted);
            }
        }
        return bxDocument;
    }

    /**
     * Builds a two-class model from the given documents: each body-category zone
     * contributes one training sample labelled either {@link BxZoneLabel#BODY_JUNK} or
     * {@link BxZoneLabel#BODY_CONTENT}.
     *
     * @param featureVectorBuilder builds the feature vector of a zone within its page
     * @param list                 documents whose zones supply the training samples
     * @return a new model holding one sample per considered zone
     */
    public KnnModel<BxZoneLabel> buildModel(FeatureVectorBuilder<BxZone, BxPage> featureVectorBuilder, List<BxDocument> list) {
        KnnModel<BxZoneLabel> model = new KnnModel<>();
        for (BxDocument document : list) {
            for (BxPage page : document.getPages()) {
                for (BxZone zone : page.getZones()) {
                    if (!zone.getLabel().isOfCategoryOrGeneral(BxZoneLabelCategory.CAT_BODY)) {
                        continue;
                    }
                    FeatureVector features = featureVectorBuilder.getFeatureVector(zone, page);
                    BxZoneLabel target = isJunkTrainingLabel(zone.getLabel())
                            ? BxZoneLabel.BODY_JUNK
                            : BxZoneLabel.BODY_CONTENT;
                    model.addTrainingSample(new KnnTrainingSample<BxZoneLabel>(features, target));
                }
            }
        }
        return model;
    }

    /** Tells whether {@code label} equals one of {@link #JUNK_TRAINING_LABELS}, stopping at the first match. */
    private static boolean isJunkTrainingLabel(BxZoneLabel label) {
        for (BxZoneLabel junkLabel : JUNK_TRAINING_LABELS) {
            if (label.equals(junkLabel)) {
                return true;
            }
        }
        return false;
    }
}
