package pl.edu.icm.cermine.content.headers;

import java.util.List;
import pl.edu.icm.cermine.content.model.BxDocContentStructure;
import pl.edu.icm.cermine.content.model.DocumentContentStructure;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.model.BxLine;
import pl.edu.icm.cermine.structure.model.BxPage;
import pl.edu.icm.cermine.structure.model.BxZone;
import pl.edu.icm.cermine.structure.model.BxZoneLabel;
import pl.edu.icm.cermine.structure.model.BxZoneLabelCategory;
import pl.edu.icm.cermine.tools.classification.features.FeatureVector;
import pl.edu.icm.cermine.tools.classification.features.FeatureVectorBuilder;
import pl.edu.icm.cermine.tools.classification.general.TrainingSample;
import pl.edu.icm.cermine.tools.classification.knn.KnnClassifier;
import pl.edu.icm.cermine.tools.classification.knn.KnnModel;
import pl.edu.icm.cermine.tools.classification.metrics.FeatureVectorEuclideanMetric;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.2-SNAPSHOT.jar:pl/edu/icm/cermine/content/headers/KnnContentHeadersExtractor.class */
/**
 * {@link ContentHeadersExtractor} that decides, line by line, whether a body
 * line starts a section header by running a k-nearest-neighbours classifier
 * over a feature vector built for that line.
 *
 * <p>Lines classified as {@link BxZoneLabel#BODY_HEADING} become first header
 * lines of the resulting {@link BxDocContentStructure}; other lines from
 * {@code BODY_CONTENT} / {@code GEN_BODY} zones are attached as content to the
 * most recently seen header line. The structure is then post-processed by a
 * {@link HeadersClusterizer} and a {@link HeaderLinesCompletener}.
 */
public class KnnContentHeadersExtractor implements ContentHeadersExtractor {
    /** Default number of neighbours passed to the classifier as voters. */
    public static final int DEFAULT_KNN_VOTERS = 3;

    /** Training data the classifier consults for every line. */
    private final KnnModel<BxZoneLabel> model;

    /** Classifier used to label individual lines. */
    private final KnnClassifier<BxZoneLabel> classifier;

    /** Number of voters handed to the classifier; tied to the public default. */
    private final int knnVoters = DEFAULT_KNN_VOTERS;

    /** Builds the feature vector of a line in the context of its page. */
    private final FeatureVectorBuilder<BxLine, BxPage> classVectorBuilder = HeaderExtractingTools.EXTRACT_VB;

    /** First post-processing step applied to the extracted structure. */
    private final HeadersClusterizer headersClusterizer = new HeadersClusterizer();

    /** Second post-processing step applied to the extracted structure. */
    private final HeaderLinesCompletener headerLinesCompletener = new HeaderLinesCompletener();

    /**
     * Creates an extractor backed by the given model and classifier.
     *
     * @param knnModel      training samples used for classification
     * @param knnClassifier classifier that performs the k-NN vote
     */
    public KnnContentHeadersExtractor(KnnModel<BxZoneLabel> knnModel, KnnClassifier<BxZoneLabel> knnClassifier) {
        this.model = knnModel;
        this.classifier = knnClassifier;
    }

    /**
     * Classifies a single line and reports whether it was labelled as a body heading.
     *
     * @param bxLine line to classify
     * @param bxPage page the line belongs to; passed to the feature vector builder
     * @return {@code true} if the classifier's label equals {@link BxZoneLabel#BODY_HEADING}
     */
    private boolean isHeader(BxLine bxLine, BxPage bxPage) {
        FeatureVectorEuclideanMetric metric = new FeatureVectorEuclideanMetric();
        FeatureVector lineFeatures = this.classVectorBuilder.getFeatureVector(bxLine, bxPage);
        BxZoneLabel predicted = this.classifier.classify(this.model, metric, lineFeatures, this.knnVoters);
        return predicted.equals(BxZoneLabel.BODY_HEADING);
    }

    /**
     * Walks all body zones of the document, registers detected header lines and
     * the content lines that follow them, then clusters headers and completes
     * header lines.
     *
     * @param bxDocument document whose pages and zones are scanned
     * @return the content structure after both post-processing steps
     * @throws AnalysisException declared by the extractor contract; nothing in this
     *         method throws it directly, so it presumably comes from the
     *         post-processing steps (to be confirmed against those classes)
     */
    @Override
    public BxDocContentStructure extractHeaders(BxDocument bxDocument) throws AnalysisException {
        BxDocContentStructure structure = new BxDocContentStructure();
        // Most recently detected header line; stays null until the first header is found.
        BxLine lastHeaderLine = null;
        for (BxPage page : bxDocument.getPages()) {
            for (BxZone zone : page.getZones()) {
                BxZoneLabel zoneLabel = zone.getLabel();
                if (!zoneLabel.isOfCategoryOrGeneral(BxZoneLabelCategory.CAT_BODY)) {
                    continue;
                }
                // The label is a property of the zone, so decide once per zone
                // whether its non-header lines count as content.
                boolean contentZone = zoneLabel.equals(BxZoneLabel.BODY_CONTENT)
                        || zoneLabel.equals(BxZoneLabel.GEN_BODY);
                for (BxLine line : zone.getLines()) {
                    if (isHeader(line, page)) {
                        structure.addFirstHeaderLine(page, line);
                        lastHeaderLine = line;
                    } else if (contentZone) {
                        // NOTE(review): lastHeaderLine is null for content lines that
                        // precede the first header; confirm addContentLine tolerates that.
                        structure.addContentLine(lastHeaderLine, line);
                    }
                }
            }
        }
        this.headersClusterizer.clusterHeaders(structure);
        this.headerLinesCompletener.completeLines(structure);
        return structure;
    }

    /**
     * Builds a k-NN model from documents paired by index with their known
     * content structures. Every line of a {@code GEN_BODY}, {@code BODY_CONTENT}
     * or {@code BODY_HEADING} zone becomes one training sample, labelled
     * {@code BODY_HEADING} when the structure reports the line's text as a
     * header first line and {@code BODY_CONTENT} otherwise.
     *
     * <p>If the two lists differ in length, only the first
     * {@code min(list.size(), list2.size())} pairs are used.
     *
     * @param featureVectorBuilder builder producing the feature vector of each line
     * @param list                 training documents
     * @param list2                content structures; element {@code i} describes {@code list.get(i)}
     * @return a new model holding one sample per visited line
     */
    public static KnnModel<BxZoneLabel> buildModel(FeatureVectorBuilder<BxLine, BxPage> featureVectorBuilder, List<BxDocument> list, List<DocumentContentStructure> list2) {
        KnnModel<BxZoneLabel> knnModel = new KnnModel<>();
        // Loop bound is invariant; compute it once instead of on every iteration.
        int pairCount = Math.min(list.size(), list2.size());
        for (int i = 0; i < pairCount; i++) {
            BxDocument document = list.get(i);
            DocumentContentStructure knownStructure = list2.get(i);
            for (BxPage page : document.getPages()) {
                for (BxZone zone : page.getZones()) {
                    if (!isTrainingZone(zone.getLabel())) {
                        continue;
                    }
                    for (BxLine line : zone.getLines()) {
                        FeatureVector lineFeatures = featureVectorBuilder.getFeatureVector(line, page);
                        BxZoneLabel sampleLabel = knownStructure.containsHeaderFirstLineText(line.toText())
                                ? BxZoneLabel.BODY_HEADING
                                : BxZoneLabel.BODY_CONTENT;
                        knnModel.addTrainingSample(new TrainingSample<BxZoneLabel>(lineFeatures, sampleLabel));
                    }
                }
            }
        }
        return knnModel;
    }

    /**
     * Builds a k-NN model using {@link HeaderExtractingTools#EXTRACT_VB} as the
     * feature vector builder.
     *
     * @param list  training documents
     * @param list2 content structures; element {@code i} describes {@code list.get(i)}
     * @return a new model holding one sample per visited line
     */
    public static KnnModel<BxZoneLabel> buildModel(List<BxDocument> list, List<DocumentContentStructure> list2) {
        return buildModel(HeaderExtractingTools.EXTRACT_VB, list, list2);
    }

    /** Tells whether lines of a zone with the given label are used as training samples. */
    private static boolean isTrainingZone(BxZoneLabel zoneLabel) {
        return zoneLabel.isOfCategoryOrGeneral(BxZoneLabelCategory.CAT_BODY)
                && (zoneLabel.equals(BxZoneLabel.GEN_BODY)
                        || zoneLabel.equals(BxZoneLabel.BODY_CONTENT)
                        || zoneLabel.equals(BxZoneLabel.BODY_HEADING));
    }
}
