package pl.edu.icm.cermine.content.headers;

import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;
import pl.edu.icm.cermine.content.headers.features.DigitDotSchemaFeature;
import pl.edu.icm.cermine.content.headers.features.DigitParSchemaFeature;
import pl.edu.icm.cermine.content.headers.features.DoubleDigitSchemaFeature;
import pl.edu.icm.cermine.content.headers.features.HeightFeature;
import pl.edu.icm.cermine.content.headers.features.IndentationFeature;
import pl.edu.icm.cermine.content.headers.features.IsHigherThanNeighborsFeature;
import pl.edu.icm.cermine.content.headers.features.LengthFeature;
import pl.edu.icm.cermine.content.headers.features.LowercaseSchemaFeature;
import pl.edu.icm.cermine.content.headers.features.NextLineIndentationFeature;
import pl.edu.icm.cermine.content.headers.features.PrevSpaceFeature;
import pl.edu.icm.cermine.content.headers.features.RomanDigitsSchemaFeature;
import pl.edu.icm.cermine.content.headers.features.TripleDigitSchemaFeature;
import pl.edu.icm.cermine.content.headers.features.UppercaseSchemaFeature;
import pl.edu.icm.cermine.content.headers.features.WordsAllUppercaseFeature;
import pl.edu.icm.cermine.content.headers.features.WordsUppercaseFeature;
import pl.edu.icm.cermine.evaluation.tools.EvaluationUtils;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.model.BxLine;
import pl.edu.icm.cermine.structure.model.BxPage;
import pl.edu.icm.cermine.structure.model.BxZoneLabel;
import pl.edu.icm.cermine.structure.model.BxZoneLabelCategory;
import pl.edu.icm.cermine.tools.classification.features.FeatureVectorBuilder;
import pl.edu.icm.cermine.tools.classification.general.BxDocsToTrainingSamplesConverter;
import pl.edu.icm.cermine.tools.classification.general.ClassificationUtils;
import pl.edu.icm.cermine.tools.classification.general.TrainingSample;
import pl.edu.icm.cermine.tools.classification.sampleselection.OversamplingSelector;

/* loaded from: input_file:pl/edu/icm/cermine/content/headers/HeaderExtractingTools.class */
/**
 * Static helpers and shared {@link FeatureVectorBuilder} instances used when
 * working with header lines of a document.
 *
 * <p>The class is not instantiable; both builders are configured once, in the
 * static initializer, and exposed as public constants.
 */
public final class HeaderExtractingTools {

    /**
     * Feature vector builder used by {@link #toTrainingSamples(List)}; configured
     * in the static initializer with fifteen line-level feature calculators.
     */
    public static final FeatureVectorBuilder<BxLine, BxPage> EXTRACT_VB = new FeatureVectorBuilder<>();

    /**
     * Feature vector builder configured in the static initializer with the seven
     * {@code *SchemaFeature} calculators only.
     */
    public static final FeatureVectorBuilder<BxLine, BxPage> CLUSTERING_VB = new FeatureVectorBuilder<>();

    static {
        // NOTE: the order of the calculators is kept exactly as in the original
        // code - presumably it fixes the feature positions in the produced
        // vectors, so it must not be reordered without confirming that.
        EXTRACT_VB.setFeatureCalculators(Arrays.asList(
                new WordsUppercaseFeature(),
                new RomanDigitsSchemaFeature(),
                new TripleDigitSchemaFeature(),
                new PrevSpaceFeature(),
                new WordsAllUppercaseFeature(),
                new HeightFeature(),
                new IsHigherThanNeighborsFeature(),
                new NextLineIndentationFeature(),
                new IndentationFeature(),
                new DigitParSchemaFeature(),
                new DoubleDigitSchemaFeature(),
                new LowercaseSchemaFeature(),
                new UppercaseSchemaFeature(),
                new LengthFeature(),
                new DigitDotSchemaFeature()));

        CLUSTERING_VB.setFeatureCalculators(Arrays.asList(
                new DigitDotSchemaFeature(),
                new DigitParSchemaFeature(),
                new DoubleDigitSchemaFeature(),
                new LowercaseSchemaFeature(),
                new RomanDigitsSchemaFeature(),
                new TripleDigitSchemaFeature(),
                new UppercaseSchemaFeature()));
    }

    private HeaderExtractingTools() {
        // static utility holder - no instances
    }

    /**
     * Loads documents from the given path with
     * {@link EvaluationUtils#getDocumentsFromPath} and converts them to training
     * samples via {@link #toTrainingSamples(List)}.
     *
     * @param str path handed to {@code EvaluationUtils.getDocumentsFromPath}
     * @return the selected training samples
     * @throws AnalysisException propagated from loading or conversion
     * @throws TransformationException propagated from loading the documents
     */
    public static List<TrainingSample<BxZoneLabel>> toTrainingSamples(String str) throws AnalysisException, TransformationException {
        return toTrainingSamples(EvaluationUtils.getDocumentsFromPath(str));
    }

    /**
     * Converts documents into line training samples built with
     * {@link #EXTRACT_VB}, keeps the elements selected by
     * {@code ClassificationUtils.filterElements} for
     * {@link BxZoneLabelCategory#CAT_BODY}, and passes the result through an
     * {@link OversamplingSelector} created with the argument {@code 1.0}.
     *
     * <p>A label map containing the single entry
     * {@code BODY_JUNK -> BODY_CONTENT} is handed to the converter.
     *
     * @param list documents to convert
     * @return the samples returned by the selector
     * @throws AnalysisException propagated from the sample conversion
     */
    public static List<TrainingSample<BxZoneLabel>> toTrainingSamples(List<BxDocument> list) throws AnalysisException {
        // Typed map: the previous raw EnumMap plus an (EnumMap) cast of the enum
        // key was invalid - an enum constant can never be an EnumMap.
        EnumMap<BxZoneLabel, BxZoneLabel> labelMap = new EnumMap<>(BxZoneLabel.class);
        labelMap.put(BxZoneLabel.BODY_JUNK, BxZoneLabel.BODY_CONTENT);

        List<TrainingSample<BxZoneLabel>> samples =
                BxDocsToTrainingSamplesConverter.getLineTrainingSamples(list, EXTRACT_VB, labelMap);
        samples = ClassificationUtils.filterElements(samples, BxZoneLabelCategory.CAT_BODY);

        // NOTE(review): the selector is left as a raw type because its generic
        // signature is not visible here - confirm and parameterize if possible.
        OversamplingSelector oversamplingSelector = new OversamplingSelector(1.0d);
        return oversamplingSelector.pickElements(samples);
    }
}
