package pl.edu.icm.cermine.bibref.extraction.tools;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import pl.edu.icm.cermine.bibref.extraction.model.BibReferenceLineLabel;
import pl.edu.icm.cermine.bibref.extraction.model.BxDocumentBibReferences;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.model.BxLine;
import pl.edu.icm.cermine.structure.model.BxPage;
import pl.edu.icm.cermine.structure.model.BxZone;
import pl.edu.icm.cermine.structure.model.BxZoneLabelCategory;
import pl.edu.icm.cermine.tools.classification.features.FeatureVectorBuilder;
import pl.edu.icm.cermine.tools.classification.hmm.training.HMMTrainingSample;

/* loaded from: input_file:pl/edu/icm/cermine/bibref/extraction/tools/BibRefExtractionUtils.class */
/**
 * Static helpers for working with the bibliographic-reference lines of a document:
 * collecting reference zones, labelling their lines against known reference strings,
 * regrouping labelled lines into reference strings, and building HMM training samples.
 */
public final class BibRefExtractionUtils {

    /** Not instantiable: this class only hosts static helpers. */
    private BibRefExtractionUtils() {
    }

    /**
     * Collects every zone of the document whose label belongs to the references category.
     *
     * @param bxDocument the document whose pages and zones are scanned
     * @return a container holding all zones accepted by
     *         {@code isOfCategoryOrGeneral(BxZoneLabelCategory.CAT_REFERENCES)}, in page/zone order
     * @throws AnalysisException declared for callers; not thrown directly by this method
     */
    public static BxDocumentBibReferences extractBibRefLines(BxDocument bxDocument) throws AnalysisException {
        BxDocumentBibReferences references = new BxDocumentBibReferences();
        for (BxPage page : bxDocument.getPages()) {
            for (BxZone zone : page.getZones()) {
                if (zone.getLabel().isOfCategoryOrGeneral(BxZoneLabelCategory.CAT_REFERENCES)) {
                    references.addZone(zone);
                }
            }
        }
        return references;
    }

    /**
     * Collects the reference zones of the document and labels each of their lines by aligning
     * the line texts, in order, with the supplied reference strings.
     *
     * <p>For the reference currently being matched, a line whose text equals the remaining
     * reference text closes it ({@code BIBREF_START} if it is also the first line of that
     * reference, otherwise {@code BIBREF_END}). A non-empty line whose text is a proper prefix
     * of the remaining text is consumed as {@code BIBREF_START} or {@code BIBREF_INNER}; the
     * prefix plus one following character (assumed to be the separator between consecutive
     * lines) is then removed from the remaining text. Any other line, and every line left after
     * the reference strings run out, is labelled {@code BLOCK_LABEL}.
     *
     * @param bxDocument the document to extract reference lines from
     * @param list       expected reference strings, in document order
     * @return the reference lines container with a label set on every line
     * @throws AnalysisException propagated from {@link #extractBibRefLines(BxDocument)}
     */
    public static BxDocumentBibReferences extractBibRefLines(BxDocument bxDocument, List<String> list) throws AnalysisException {
        BxDocumentBibReferences references = extractBibRefLines(bxDocument);
        List<BxLine> lines = references.getLines();
        int lineIndex = 0;
        int refIndex = 0;
        // Unmatched remainder of the current reference; null means "fetch the next one".
        String pending = null;
        // True while no line of the current reference has been consumed yet.
        boolean atReferenceStart = true;

        while (lineIndex < lines.size() && refIndex < list.size()) {
            BxLine line = lines.get(lineIndex);
            String lineText = line.toText();
            if (pending == null) {
                pending = list.get(refIndex);
            }

            if (pending.equals(lineText)) {
                // The line completes the current reference.
                references.setLabel(line, atReferenceStart
                        ? BibReferenceLineLabel.BIBREF_START
                        : BibReferenceLineLabel.BIBREF_END);
                atReferenceStart = true;
                pending = null;
                refIndex++;
            } else if (!lineText.isEmpty() && pending.startsWith(lineText)) {
                // The emptiness guard matters: every string starts with "", so an empty line
                // would otherwise be labelled as reference content and the substring below
                // would silently drop the first real character of the pending reference.
                references.setLabel(line, atReferenceStart
                        ? BibReferenceLineLabel.BIBREF_START
                        : BibReferenceLineLabel.BIBREF_INNER);
                atReferenceStart = false;
                // Safe: pending is strictly longer than lineText here (equality handled above).
                pending = pending.substring(lineText.length() + 1);
            } else {
                references.setLabel(line, BibReferenceLineLabel.BLOCK_LABEL);
            }
            lineIndex++;
        }

        // Lines remaining once all reference strings have been consumed.
        while (lineIndex < lines.size()) {
            references.setLabel(lines.get(lineIndex), BibReferenceLineLabel.BLOCK_LABEL);
            lineIndex++;
        }
        return references;
    }

    /**
     * Joins line texts into reference strings according to a parallel list of labels.
     *
     * <p>Lines labelled {@code BLOCK_LABEL} are skipped. A {@code BIBREF_START} line flushes the
     * reference accumulated so far (if non-empty) and begins a new one; a line with any other
     * label is appended to the current reference after a single space.
     *
     * @param bxDocumentBibReferences source of the lines, read via {@code getLines()}
     * @param list                    label for each line, index-aligned with {@code getLines()};
     *                                must contain at least as many entries as there are lines
     * @return the assembled reference strings, in line order
     */
    public static String[] groupLinesIntoBibRefs(BxDocumentBibReferences bxDocumentBibReferences, List<BibReferenceLineLabel> list) {
        List<String> grouped = new ArrayList<String>();
        // Fetched once instead of on every iteration.
        List<BxLine> lines = bxDocumentBibReferences.getLines();
        StringBuilder current = new StringBuilder();

        for (int i = 0; i < lines.size(); i++) {
            String text = lines.get(i).toText();
            BibReferenceLineLabel label = list.get(i);
            if (label.equals(BibReferenceLineLabel.BLOCK_LABEL)) {
                continue;
            }
            if (label.equals(BibReferenceLineLabel.BIBREF_START)) {
                if (current.length() > 0) {
                    grouped.add(current.toString());
                }
                current.setLength(0);
            } else {
                current.append(' ');
            }
            current.append(text);
        }

        if (current.length() > 0) {
            grouped.add(current.toString());
        }
        return grouped.toArray(new String[grouped.size()]);
    }

    /**
     * Builds one HMM training sample per reference line across all given documents.
     *
     * <p>Each sample carries the line's feature vector, its label, and a flag that is
     * {@code true} only for the first line of its document. Every sample except the last one of
     * a document also receives the label of the following line via {@code setNextLabel}.
     *
     * @param bxDocumentBibReferencesArr labelled reference lines, one element per document
     * @param featureVectorBuilder       builder used to compute the feature vector of each line
     * @return all samples, in document order and then line order
     */
    public static HMMTrainingSample<BibReferenceLineLabel>[] convertToHMM(BxDocumentBibReferences[] bxDocumentBibReferencesArr, FeatureVectorBuilder<BxLine, BxDocumentBibReferences> featureVectorBuilder) {
        List<HMMTrainingSample<BibReferenceLineLabel>> samples =
                new ArrayList<HMMTrainingSample<BibReferenceLineLabel>>();

        for (BxDocumentBibReferences references : bxDocumentBibReferencesArr) {
            // Reset per document so the sample chain never links across documents.
            HMMTrainingSample<BibReferenceLineLabel> previous = null;
            for (BxLine line : references.getLines()) {
                BibReferenceLineLabel label = references.getLabel(line);
                HMMTrainingSample<BibReferenceLineLabel> sample =
                        new HMMTrainingSample<BibReferenceLineLabel>(
                                featureVectorBuilder.getFeatureVector(line, references),
                                label,
                                previous == null);
                samples.add(sample);
                if (previous != null) {
                    previous.setNextLabel(label);
                }
                previous = sample;
            }
        }

        // A generic array cannot be created directly; the raw array only ever receives
        // elements taken from the typed list above, so the conversion is safe.
        @SuppressWarnings("unchecked")
        HMMTrainingSample<BibReferenceLineLabel>[] result =
                samples.toArray(new HMMTrainingSample[samples.size()]);
        return result;
    }
}
