package pl.edu.icm.cermine.bibref;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import pl.edu.icm.cermine.bibref.extraction.features.PrevEndsWithDotFeature;
import pl.edu.icm.cermine.bibref.extraction.features.PrevRelativeLengthFeature;
import pl.edu.icm.cermine.bibref.extraction.features.RelativeStartTresholdFeature;
import pl.edu.icm.cermine.bibref.extraction.features.SpaceBetweenLinesFeature;
import pl.edu.icm.cermine.bibref.extraction.features.StartsWithNumberFeature;
import pl.edu.icm.cermine.bibref.extraction.model.BxDocumentBibReferences;
import pl.edu.icm.cermine.bibref.extraction.tools.BibRefExtractionUtils;
import pl.edu.icm.cermine.content.cleaning.ContentCleaner;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.model.BxLine;
import pl.edu.icm.cermine.tools.classification.clustering.KMeansWithInitialCentroids;
import pl.edu.icm.cermine.tools.classification.features.FeatureVector;
import pl.edu.icm.cermine.tools.classification.features.FeatureVectorBuilder;
import pl.edu.icm.cermine.tools.classification.metrics.FeatureVectorEuclideanMetric;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.5.jar:pl/edu/icm/cermine/bibref/KMeansBibReferenceExtractor.class */
/**
 * Extracts bibliographic reference strings from a document by splitting its
 * reference lines into two clusters with 2-means.
 *
 * <p>The cluster that contains the very first reference line is treated as the
 * set of lines that start a new reference; every other line is appended to the
 * reference currently being assembled.
 */
public class KMeansBibReferenceExtractor implements BibReferenceExtractor {
    /** Upper bound on the number of reference lines taken into account. */
    public static final int MAX_REF_LINES_COUNT = 10000;
    /** If more references than this are produced, an empty result is returned instead. */
    public static final int MAX_REFS_COUNT = 1000;
    /** Assembled references of this length or longer are discarded. */
    public static final int MAX_REF_LENGTH = 1500;

    /** Below this distance between the first line and the farthest line, no clustering is attempted. */
    private static final double MIN_CENTROID_DISTANCE = 0.001d;

    // Patterns are used with Matcher.matches(), i.e. they must cover the whole string,
    // exactly as the equivalent String.matches(...) calls would.
    private static final Pattern CONTAINS_DIGIT = Pattern.compile(".*[0-9].*");
    private static final Pattern CONTAINS_LETTER = Pattern.compile(".*[a-zA-Z].*");
    /** Text ending with a letter directly followed by one hyphen-like character. */
    private static final Pattern ENDS_WITH_LETTER_AND_HYPHEN =
            Pattern.compile(".*[a-zA-Z][" + "-\u00ad‐‑‒–—―⁻₋−-" + "]");

    private static final FeatureVectorBuilder<BxLine, BxDocumentBibReferences> VECTOR_BUILDER = new FeatureVectorBuilder<>();

    static {
        VECTOR_BUILDER.setFeatureCalculators(Arrays.asList(new PrevEndsWithDotFeature(), new PrevRelativeLengthFeature(), new RelativeStartTresholdFeature(), new SpaceBetweenLinesFeature(), new StartsWithNumberFeature()));
    }

    /**
     * Extracts the reference strings found in the given document.
     *
     * @param bxDocument document to extract the references from
     * @return the extracted references; empty when more than
     *         {@link #MAX_REFS_COUNT} entries would be returned
     * @throws AnalysisException declared by the implemented interface and
     *         propagated from the helpers called here
     */
    @Override
    public String[] extractBibReferences(BxDocument bxDocument) throws AnalysisException {
        BxDocumentBibReferences refLines = BibRefExtractionUtils.extractBibRefLines(bxDocument);
        refLines.limit(MAX_REF_LINES_COUNT);

        List<String> lines = new ArrayList<>();
        List<FeatureVector> vectors = new ArrayList<>();
        FeatureVectorEuclideanMetric metric = new FeatureVectorEuclideanMetric();

        // Track the feature vector farthest from the first line's vector; it
        // becomes the second initial centroid.
        FeatureVector farthest = null;
        double farthestDistance = 0.0d;
        for (BxLine line : refLines.getLines()) {
            lines.add(ContentCleaner.clean(line.toText()));
            FeatureVector vector = VECTOR_BUILDER.getFeatureVector(line, refLines);
            vectors.add(vector);

            FeatureVector first = vectors.get(0);
            if (farthest == null) {
                farthest = first;
            }
            double distance = metric.getDistance(first, vector);
            if (distance > farthestDistance) {
                farthest = vector;
                farthestDistance = distance;
            }
        }

        // Nothing to cluster: zero or one line, or all lines are (nearly)
        // indistinguishable from the first one. Each line is returned as-is.
        if (lines.size() <= 1 || farthestDistance < MIN_CENTROID_DISTANCE) {
            return toLimitedArray(lines);
        }

        KMeansWithInitialCentroids clusterer = new KMeansWithInitialCentroids(2);
        clusterer.setCentroids(new FeatureVector[]{vectors.get(0), farthest});
        List<FeatureVector>[] clusters = clusterer.cluster(vectors);

        // The first line necessarily starts a reference, so whichever cluster
        // holds it is the "starts a new reference" cluster.
        int startClusterIndex = clusters[1].contains(vectors.get(0)) ? 1 : 0;
        List<FeatureVector> startCluster = clusters[startClusterIndex];

        List<String> references = new ArrayList<>();
        String current = "";
        for (int i = 0; i < lines.size(); i++) {
            String line = lines.get(i);
            if (startCluster.contains(vectors.get(i))) {
                // A new reference begins: flush the one assembled so far.
                if (isAcceptableReference(current)) {
                    references.add(current);
                }
                current = line;
            } else if (ENDS_WITH_LETTER_AND_HYPHEN.matcher(current).matches()) {
                // Word broken across lines: drop the trailing hyphen and glue the parts.
                current = current.substring(0, current.length() - 1) + line;
            } else {
                current = current + " " + line;
            }
        }
        if (isAcceptableReference(current)) {
            references.add(current);
        }

        return toLimitedArray(references);
    }

    /**
     * Tells whether an assembled reference should be kept: it must be
     * non-empty, contain at least one digit and one ASCII letter, and be
     * shorter than {@link #MAX_REF_LENGTH}.
     */
    private static boolean isAcceptableReference(String reference) {
        return !reference.isEmpty()
                && CONTAINS_DIGIT.matcher(reference).matches()
                && CONTAINS_LETTER.matcher(reference).matches()
                && reference.length() < MAX_REF_LENGTH;
    }

    /** Converts the list to an array, or to an empty array when it exceeds {@link #MAX_REFS_COUNT}. */
    private static String[] toLimitedArray(List<String> items) {
        if (items.size() > MAX_REFS_COUNT) {
            return new String[0];
        }
        return items.toArray(new String[0]);
    }
}
