package pl.edu.icm.yadda.analysis.articlecontent;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import pl.edu.icm.yadda.analysis.AnalysisException;
import pl.edu.icm.yadda.analysis.articlecontent.model.BxDocContentStructure;
import pl.edu.icm.yadda.analysis.articlecontent.model.DocumentContentStructure;
import pl.edu.icm.yadda.analysis.classification.clustering.FeatureVectorClusterizer;
import pl.edu.icm.yadda.analysis.classification.clustering.SingleLinkageClusterizer;
import pl.edu.icm.yadda.analysis.classification.features.FeatureVector;
import pl.edu.icm.yadda.analysis.classification.features.FeatureVectorBuilder;
import pl.edu.icm.yadda.analysis.classification.knn.classifier.KnnClassifier;
import pl.edu.icm.yadda.analysis.classification.knn.model.KnnModel;
import pl.edu.icm.yadda.analysis.classification.knn.model.KnnTrainingSample;
import pl.edu.icm.yadda.analysis.classification.metrics.FeatureVectorEuclideanMetric;
import pl.edu.icm.yadda.analysis.textr.model.BxDocument;
import pl.edu.icm.yadda.analysis.textr.model.BxLine;
import pl.edu.icm.yadda.analysis.textr.model.BxPage;
import pl.edu.icm.yadda.analysis.textr.model.BxZone;
import pl.edu.icm.yadda.analysis.textr.model.BxZoneLabel;
import pl.edu.icm.yadda.analysis.textr.model.BxZoneLabelCategory;

/* loaded from: input_file:WEB-INF/lib/yadda-analysis-impl-1.11.2.jar:pl/edu/icm/yadda/analysis/articlecontent/ContentHeaderExtractor.class */
/**
 * Finds header lines inside the body-content zones of a {@link BxDocument} and
 * assembles them into a {@link BxDocContentStructure}.
 *
 * <p>The work is done in three steps: every line of each {@code BODY_CONTENT}
 * zone is classified with a k-NN model as header or content, detected headers
 * are then extended with directly following lines that look like their
 * continuation, and finally the headers are clustered to assign level ids.
 */
public class ContentHeaderExtractor {
    /** Last argument handed to the k-NN classifier (presumably the number of voting neighbours). */
    private final int knnVoters = 3;
    /**
     * Bound used by the continuation-line loop in {@link #completeLines}.
     * NOTE(review): the loop condition is {@code collected <= maxAddedHeaderLines}, so up to
     * {@code maxAddedHeaderLines + 1} lines can be collected; confirm whether that is intended.
     */
    private final int maxAddedHeaderLines = 2;
    /** A following line scores a point when its height differs from the first header line by less than this. */
    private final double headerHeightTolerance = 0.01d;
    /** Minimum score a following line needs in order to be collected as a continuation candidate. */
    private final int minHeaderCandidateScore = 1;
    /** The candidate group scores a point when its last line is narrower than the first header line times this factor. */
    private final double headerLineWidthMultiplier = 0.7d;
    /**
     * The candidate group scores a point when the vertical distance between the first header line and
     * the first candidate is smaller than the distance between the last candidate and its successor
     * times this factor.
     */
    private final double headerLineSpacingMultiplier = 0.7d;
    /** Minimum score the candidate group needs for its lines to be attached to the header. */
    private final int minHeaderLineScore = 1;
    /** Distance threshold handed to the clusterizer when header levels are assigned. */
    private final double maxHeaderLevelDistance = 1.0d;

    /**
     * Classifies the lines of all {@code BODY_CONTENT} zones and builds the content structure.
     *
     * <p>A line classified as {@code BODY_HEADER} is registered as a first header line; any other
     * line is registered as content of the most recently detected header. The "most recent header"
     * is carried across zones and pages, and is {@code null} for lines that precede the first
     * detected header. Zones without a label are skipped.
     *
     * @param knnModel              trained model used to classify lines
     * @param featureVectorBuilder  builds the feature vectors used for line classification
     * @param featureVectorBuilder2 builds the feature vectors used for header-level clustering
     * @param bxDocument            document whose pages and zones are scanned
     * @return the content structure with header lines, content lines and header level ids set
     * @throws AnalysisException declared for callers; propagated from the analysis components
     */
    public BxDocContentStructure extractHeaders(KnnModel<BxZoneLabel> knnModel, FeatureVectorBuilder<BxLine, BxPage> featureVectorBuilder, FeatureVectorBuilder<BxLine, BxPage> featureVectorBuilder2, BxDocument bxDocument) throws AnalysisException {
        KnnClassifier<BxZoneLabel> classifier = new KnnClassifier<>();
        // One metric instance is reused for every line; assumes the metric keeps no per-call state.
        FeatureVectorEuclideanMetric metric = new FeatureVectorEuclideanMetric();
        BxDocContentStructure contentStructure = new BxDocContentStructure();
        BxLine currentHeader = null;
        for (BxPage page : bxDocument.getPages()) {
            for (BxZone zone : page.getZones()) {
                // Constant-first comparison: a zone without a label is skipped instead of raising an NPE.
                if (!BxZoneLabel.BODY_CONTENT.equals(zone.getLabel())) {
                    continue;
                }
                for (BxLine line : zone.getLines()) {
                    FeatureVector features = featureVectorBuilder.getFeatureVector(line, page);
                    BxZoneLabel predicted = classifier.classify(knnModel, metric, features, this.knnVoters);
                    // Constant-first comparison: a null prediction is treated as "not a header".
                    if (BxZoneLabel.BODY_HEADER.equals(predicted)) {
                        contentStructure.addFirstHeaderLine(page, line);
                        currentHeader = line;
                    } else {
                        contentStructure.addContentLine(currentHeader, line);
                    }
                }
            }
        }
        completeLines(contentStructure);
        setLevelIds(contentStructure, featureVectorBuilder2);
        return contentStructure;
    }

    /**
     * Builds a k-NN training model from documents paired by index with their known content structures.
     *
     * <p>Only the first {@code min(list.size(), list2.size())} pairs are used. Within each document,
     * every line of an eligible body zone (see {@link #isTrainingZoneLabel}) yields one training
     * sample, labelled {@code BODY_HEADER} when the structure reports the line's text as the first
     * line of a header and {@code BODY_CONTENT} otherwise.
     *
     * @param featureVectorBuilder builds the feature vector of each training line
     * @param list                 training documents
     * @param list2                content structures, matched to {@code list} by position
     * @return the populated model
     */
    public KnnModel<BxZoneLabel> buildModel(FeatureVectorBuilder<BxLine, BxPage> featureVectorBuilder, List<BxDocument> list, List<DocumentContentStructure> list2) {
        KnnModel<BxZoneLabel> model = new KnnModel<>();
        int pairCount = Math.min(list.size(), list2.size());
        for (int index = 0; index < pairCount; index++) {
            BxDocument document = list.get(index);
            DocumentContentStructure structure = list2.get(index);
            for (BxPage page : document.getPages()) {
                for (BxZone zone : page.getZones()) {
                    if (!isTrainingZoneLabel(zone.getLabel())) {
                        continue;
                    }
                    for (BxLine line : zone.getLines()) {
                        FeatureVector features = featureVectorBuilder.getFeatureVector(line, page);
                        BxZoneLabel sampleLabel = structure.containsHeaderFirstLineText(line.toText())
                                ? BxZoneLabel.BODY_HEADER
                                : BxZoneLabel.BODY_CONTENT;
                        model.addTrainingSample(new KnnTrainingSample<>(features, sampleLabel));
                    }
                }
            }
        }
        return model;
    }

    /**
     * Tells whether lines of a zone with the given label are used as training samples: the label
     * must be non-null, belong to (or be the general label of) the body category, and must not be
     * junk, an equation or its label, a figure or its caption, or a table or its caption.
     */
    private static boolean isTrainingZoneLabel(BxZoneLabel label) {
        return label != null
                && label.isOfCategoryOrGeneral(BxZoneLabelCategory.CAT_BODY)
                && !label.equals(BxZoneLabel.BODY_JUNK)
                && !label.equals(BxZoneLabel.BODY_EQUATION)
                && !label.equals(BxZoneLabel.BODY_EQUATION_LABEL)
                && !label.equals(BxZoneLabel.BODY_FIGURE)
                && !label.equals(BxZoneLabel.BODY_FIGURE_CAPTION)
                && !label.equals(BxZoneLabel.BODY_TABLE)
                && !label.equals(BxZoneLabel.BODY_TABLE_CAPTION);
    }

    /**
     * Clusters the first header lines with a single-linkage clusterizer over Euclidean distance and
     * stores the resulting ids as header level ids on the structure.
     */
    private void setLevelIds(BxDocContentStructure bxDocContentStructure, FeatureVectorBuilder<BxLine, BxPage> featureVectorBuilder) {
        FeatureVectorClusterizer clusterizer = new FeatureVectorClusterizer();
        clusterizer.setClusterizer(new SingleLinkageClusterizer());
        // NOTE(review): the meaning of the trailing boolean flag is not visible here; kept as in the original.
        bxDocContentStructure.setHeaderLevelIds(
                clusterizer.clusterize(
                        bxDocContentStructure.getFirstHeaderFeatureVectors(featureVectorBuilder),
                        featureVectorBuilder,
                        new FeatureVectorEuclideanMetric(),
                        this.maxHeaderLevelDistance,
                        true));
    }

    /**
     * Extends each detected header with the lines that directly follow it when they look like a
     * continuation of the header.
     *
     * <p>The candidate lines of a header are scored as a group (one point if the last candidate is
     * clearly narrower than the first header line, one point if the candidates sit closer to the
     * header than to the text that follows) and are attached only when the score reaches
     * {@link #minHeaderLineScore}.
     */
    private void completeLines(BxDocContentStructure bxDocContentStructure) {
        for (BxLine headerLine : bxDocContentStructure.getFirstHeaderLines()) {
            List<BxLine> candidates = collectContinuationCandidates(bxDocContentStructure, headerLine);
            if (candidates.isEmpty()) {
                continue;
            }
            BxLine firstCandidate = candidates.get(0);
            BxLine lastCandidate = candidates.get(candidates.size() - 1);

            int groupScore = 0;
            if (lastCandidate.getWidth() < headerLine.getWidth() * this.headerLineWidthMultiplier) {
                groupScore++;
            }
            if (lastCandidate.hasNext()
                    && Math.abs(headerLine.getY() - firstCandidate.getY())
                            < Math.abs(lastCandidate.getY() - lastCandidate.getNext().getY()) * this.headerLineSpacingMultiplier) {
                groupScore++;
            }

            if (groupScore >= this.minHeaderLineScore) {
                for (BxLine candidate : candidates) {
                    bxDocContentStructure.addAdditionalHeaderLine(headerLine, candidate);
                }
            }
        }
    }

    /**
     * Walks the lines following {@code headerLine} and collects those whose height matches the
     * header's within {@link #headerHeightTolerance}. Collection stops at the first line that is
     * itself a first header line, at the first line that fails the score threshold, when there is no
     * next line, or when the loop bound is exceeded.
     */
    private List<BxLine> collectContinuationCandidates(BxDocContentStructure contentStructure, BxLine headerLine) {
        List<BxLine> candidates = new ArrayList<>();
        BxLine current = headerLine;
        // NOTE(review): "<=" lets maxAddedHeaderLines + 1 lines through; preserved from the original.
        while (current.hasNext() && candidates.size() <= this.maxAddedHeaderLines) {
            current = current.getNext();
            if (contentStructure.containsFirstHeaderLine(current)) {
                break;
            }
            int candidateScore = 0;
            if (Math.abs(current.getHeight() - headerLine.getHeight()) < this.headerHeightTolerance) {
                candidateScore++;
            }
            if (candidateScore < this.minHeaderCandidateScore) {
                break;
            }
            candidates.add(current);
        }
        return candidates;
    }
}
