package pl.edu.icm.cermine;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import pl.edu.icm.cermine.bibref.model.BibEntry;
import pl.edu.icm.cermine.configuration.ExtractionConfigProperty;
import pl.edu.icm.cermine.configuration.ExtractionConfigRegister;
import pl.edu.icm.cermine.content.citations.ContentStructureCitationPositions;
import pl.edu.icm.cermine.content.model.BxContentStructure;
import pl.edu.icm.cermine.content.model.ContentStructure;
import pl.edu.icm.cermine.content.transformers.BxContentToDocContentConverter;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.metadata.model.DocumentAffiliation;
import pl.edu.icm.cermine.metadata.model.DocumentMetadata;
import pl.edu.icm.cermine.parsing.tools.ParsableStringParser;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.tools.timeout.TimeoutRegister;

/* loaded from: input_file:pl/edu/icm/cermine/ExtractionUtils.class */
/**
 * Static helpers that run the individual steps of the extraction pipeline.
 *
 * <p>Every helper delegates to a component obtained from the supplied
 * {@link ComponentConfiguration}. When the
 * {@link ExtractionConfigProperty#DEBUG_PRINT_TIME} property is enabled, the
 * helper also prints the time the step took (in seconds) to standard output.
 */
public class ExtractionUtils {

    /**
     * Extraction steps, each paired with the steps it directly depends on.
     *
     * <p>The prerequisites are first collected as plain sets while the constants
     * are being constructed and are converted to {@link EnumSet}s in the static
     * initializer, which runs once all constants exist.
     * NOTE(review): presumably an {@code EnumSet} cannot be built inside the
     * constant initializers because the enum's value table is not available
     * yet - confirm before simplifying this two-phase set-up.
     */
    public enum Step {
        CHARACTER_EXTRACTION(null),
        PAGE_SEGMENTATION(setOf(CHARACTER_EXTRACTION)),
        READING_ORDER(setOf(PAGE_SEGMENTATION)),
        INITIAL_CLASSIFICATION(setOf(READING_ORDER)),
        METADATA_CLASSIFICATION(setOf(INITIAL_CLASSIFICATION)),
        METADATA_CLEANING(setOf(METADATA_CLASSIFICATION)),
        AFFILIATION_PARSING(setOf(METADATA_CLEANING)),
        REFERENCE_EXTRACTION(setOf(INITIAL_CLASSIFICATION)),
        REFERENCE_PARSING(setOf(REFERENCE_EXTRACTION)),
        CONTENT_FILTERING(setOf(INITIAL_CLASSIFICATION)),
        HEADER_DETECTION(setOf(CONTENT_FILTERING)),
        TOC_EXTRACTION(setOf(HEADER_DETECTION)),
        CONTENT_CLEANING(setOf(TOC_EXTRACTION)),
        CITPOS_DETECTION(setOf(CONTENT_CLEANING, REFERENCE_PARSING));

        /** Direct prerequisites; replaced by an {@link EnumSet} in the static initializer. */
        private Set<Step> prerequisites;

        /**
         * @param prerequisites steps that must precede this one, or {@code null} for none
         */
        Step(Set<Step> prerequisites) {
            this.prerequisites = prerequisites;
        }

        /**
         * Returns the steps this step directly depends on.
         *
         * <p>The returned set is the enum's own internal set, not a copy, so
         * callers should treat it as read-only.
         *
         * @return the direct prerequisites; empty when the step has none
         */
        public Set<Step> getPrerequisites() {
            return this.prerequisites;
        }

        /** Collects the given steps into a set for use in the constant declarations. */
        private static Set<Step> setOf(Step... stepArr) {
            return Sets.newHashSet(stepArr);
        }

        static {
            for (Step step : values()) {
                Set<Step> declared = step.prerequisites;
                // EnumSet.copyOf rejects an empty non-EnumSet collection, so an
                // empty declaration is handled the same way as a missing one.
                if (declared == null || declared.isEmpty()) {
                    step.prerequisites = EnumSet.noneOf(Step.class);
                } else {
                    step.prerequisites = EnumSet.copyOf(declared);
                }
            }
        }
    }

    /**
     * Prints the time elapsed since {@code startMillis}, in seconds, prefixed
     * with {@code label}. Does nothing unless
     * {@link ExtractionConfigProperty#DEBUG_PRINT_TIME} is enabled.
     *
     * @param startMillis start time as returned by {@link System#currentTimeMillis()}
     * @param label text printed in front of the elapsed time
     */
    private static void debug(long startMillis, String label) {
        if (!ExtractionConfigRegister.get().getBooleanProperty(ExtractionConfigProperty.DEBUG_PRINT_TIME)) {
            return;
        }
        double elapsedSeconds = (System.currentTimeMillis() - startMillis) / 1000.0d;
        System.out.println(label + ": " + elapsedSeconds);
    }

    /**
     * Step 1.1: extracts characters from the input stream using the configured
     * character extractor.
     *
     * @param componentConfiguration source of the character extractor
     * @param inputStream stream handed to the character extractor
     * @return the document produced by the character extractor
     * @throws AnalysisException if the character extractor fails
     */
    public static BxDocument extractCharacters(ComponentConfiguration componentConfiguration, InputStream inputStream)
            throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument document = componentConfiguration.getCharacterExtractor().extractCharacters(inputStream);
        debug(start, "1.1 Character extraction");
        return document;
    }

    /**
     * Step 1.2: segments the document using the configured document segmenter.
     *
     * <p>Calls {@code TimeoutRegister.get().check()} before segmenting
     * (presumably to abort early when a registered timeout has passed - verify
     * against {@link TimeoutRegister}).
     *
     * @param componentConfiguration source of the document segmenter
     * @param bxDocument document to segment
     * @return the document returned by the segmenter
     * @throws AnalysisException if the segmenter fails
     */
    public static BxDocument segmentPages(ComponentConfiguration componentConfiguration, BxDocument bxDocument)
            throws AnalysisException {
        long start = System.currentTimeMillis();
        TimeoutRegister.get().check();
        BxDocument segmented = componentConfiguration.getDocumentSegmenter().segmentDocument(bxDocument);
        debug(start, "1.2 Page segmentation");
        return segmented;
    }

    /**
     * Step 1.3: resolves the reading order using the configured resolver.
     *
     * @param componentConfiguration source of the reading order resolver
     * @param bxDocument document to process
     * @return the document returned by the resolver
     * @throws AnalysisException if the resolver fails
     */
    public static BxDocument resolveReadingOrder(ComponentConfiguration componentConfiguration, BxDocument bxDocument)
            throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument ordered = componentConfiguration.getReadingOrderResolver().resolve(bxDocument);
        debug(start, "1.3 Reading order resolving");
        return ordered;
    }

    /**
     * Step 1.4: classifies the document's zones with the configured initial classifier.
     *
     * @param componentConfiguration source of the initial classifier
     * @param bxDocument document to classify
     * @return the document returned by the classifier
     * @throws AnalysisException if the classifier fails
     */
    public static BxDocument classifyInitially(ComponentConfiguration componentConfiguration, BxDocument bxDocument)
            throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument classified = componentConfiguration.getInitialClassifier().classifyZones(bxDocument);
        debug(start, "1.4 Initial classification");
        return classified;
    }

    /**
     * Step 2.1: classifies the document's zones with the configured metadata classifier.
     *
     * @param componentConfiguration source of the metadata classifier
     * @param bxDocument document to classify
     * @return the document returned by the classifier
     * @throws AnalysisException if the classifier fails
     */
    public static BxDocument classifyMetadata(ComponentConfiguration componentConfiguration, BxDocument bxDocument)
            throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument classified = componentConfiguration.getMetadataClassifier().classifyZones(bxDocument);
        debug(start, "2.1 Metadata classification");
        return classified;
    }

    /**
     * Step 2.2: extracts the document metadata with the configured metadata extractor.
     *
     * @param componentConfiguration source of the metadata extractor
     * @param bxDocument document to extract metadata from
     * @return the metadata returned by the extractor
     * @throws AnalysisException if the extractor fails
     */
    public static DocumentMetadata cleanMetadata(ComponentConfiguration componentConfiguration, BxDocument bxDocument)
            throws AnalysisException {
        long start = System.currentTimeMillis();
        DocumentMetadata metadata = componentConfiguration.getMetadataExtractor().extractMetadata(bxDocument);
        debug(start, "2.2 Metadata cleaning");
        return metadata;
    }

    /**
     * Step 2.3: passes every affiliation of the metadata to the configured
     * affiliation parser.
     *
     * @param componentConfiguration source of the affiliation parser
     * @param documentMetadata metadata whose affiliations are parsed
     * @return the same {@code documentMetadata} instance that was passed in
     * @throws AnalysisException if the affiliation parser fails
     */
    public static DocumentMetadata parseAffiliations(ComponentConfiguration componentConfiguration,
            DocumentMetadata documentMetadata) throws AnalysisException {
        long start = System.currentTimeMillis();
        for (DocumentAffiliation affiliation : documentMetadata.getAffiliations()) {
            // The affiliation itself is the parser's input; it must not be cast
            // to the parser type.
            componentConfiguration.getAffiliationParser().parse(affiliation);
        }
        debug(start, "2.3 Affiliation parsing");
        return documentMetadata;
    }

    /**
     * Step 3.1: extracts the reference strings with the configured
     * bibliographic reference extractor.
     *
     * @param componentConfiguration source of the reference extractor
     * @param bxDocument document to extract references from
     * @return a new list holding the extracted reference strings
     * @throws AnalysisException if the extractor fails
     */
    public static List<String> extractRefStrings(ComponentConfiguration componentConfiguration, BxDocument bxDocument)
            throws AnalysisException {
        long start = System.currentTimeMillis();
        List<String> refStrings =
                Lists.newArrayList(componentConfiguration.getBibRefExtractor().extractBibReferences(bxDocument));
        debug(start, "3.1 Reference extraction");
        return refStrings;
    }

    /**
     * Step 3.2: parses every reference string with the configured
     * bibliographic reference parser.
     *
     * @param componentConfiguration source of the reference parser
     * @param list reference strings to parse
     * @return a new list with one parsed entry per input string, in input order
     * @throws AnalysisException if the parser fails
     */
    public static List<BibEntry> parseReferences(ComponentConfiguration componentConfiguration, List<String> list)
            throws AnalysisException {
        long start = System.currentTimeMillis();
        List<BibEntry> entries = new ArrayList<BibEntry>(list.size());
        for (String refString : list) {
            entries.add(componentConfiguration.getBibRefParser().parseBibReference(refString));
        }
        debug(start, "3.2 Reference parsing");
        return entries;
    }

    /**
     * Step 4.1: filters the document with the configured content filter.
     *
     * @param componentConfiguration source of the content filter
     * @param bxDocument document to filter
     * @return the document returned by the filter
     * @throws AnalysisException if the filter fails
     */
    public static BxDocument filterContent(ComponentConfiguration componentConfiguration, BxDocument bxDocument)
            throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument filtered = componentConfiguration.getContentFilter().filter(bxDocument);
        debug(start, "4.1 Content filtering");
        return filtered;
    }

    /**
     * Step 4.2: extracts headers with the configured content header extractor.
     *
     * @param componentConfiguration source of the header extractor
     * @param bxDocument document to extract headers from
     * @return the content structure returned by the extractor
     * @throws AnalysisException if the extractor fails
     */
    public static BxContentStructure extractHeaders(ComponentConfiguration componentConfiguration,
            BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        BxContentStructure structure = componentConfiguration.getContentHeaderExtractor().extractHeaders(bxDocument);
        debug(start, "4.2 Headers extraction");
        return structure;
    }

    /**
     * Step 4.3: runs the configured header clusterizer on the content structure.
     *
     * @param componentConfiguration source of the header clusterizer
     * @param bxContentStructure structure handed to the clusterizer
     * @return the same {@code bxContentStructure} instance that was passed in
     * @throws AnalysisException if the clusterizer fails
     */
    public static BxContentStructure clusterHeaders(ComponentConfiguration componentConfiguration,
            BxContentStructure bxContentStructure) throws AnalysisException {
        long start = System.currentTimeMillis();
        componentConfiguration.getContentHeaderClusterizer().clusterHeaders(bxContentStructure);
        debug(start, "4.3 Headers clustering");
        return bxContentStructure;
    }

    /**
     * Step 4.4: runs the configured content cleaner on the structure and then
     * converts it with a {@link BxContentToDocContentConverter}.
     *
     * @param componentConfiguration source of the content cleaner
     * @param bxContentStructure structure to clean and convert
     * @return the converted content structure
     * @throws AnalysisException if cleaning fails, or wrapping the
     *         {@link TransformationException} raised during conversion
     */
    public static ContentStructure cleanStructure(ComponentConfiguration componentConfiguration,
            BxContentStructure bxContentStructure) throws AnalysisException {
        long start = System.currentTimeMillis();
        try {
            componentConfiguration.getContentCleaner().cleanupContent(bxContentStructure);
            BxContentToDocContentConverter converter = new BxContentToDocContentConverter();
            // No extra arguments are supplied to the converter.
            ContentStructure converted = converter.convert(bxContentStructure, new Object[0]);
            debug(start, "4.4 Content cleaning");
            return converted;
        } catch (TransformationException e) {
            // Keep the original exception as the cause.
            throw new AnalysisException(e);
        }
    }

    /**
     * Step 4.5: finds citation positions with the configured citation position finder.
     *
     * @param componentConfiguration source of the citation position finder
     * @param contentStructure content structure handed to the finder
     * @param list bibliographic entries handed to the finder
     * @return the citation positions returned by the finder
     */
    public static ContentStructureCitationPositions findCitationPositions(ComponentConfiguration componentConfiguration,
            ContentStructure contentStructure, List<BibEntry> list) {
        long start = System.currentTimeMillis();
        ContentStructureCitationPositions positions =
                componentConfiguration.getCitationPositionFinder().findReferences(contentStructure, list);
        debug(start, "4.5 Citation positions finding");
        return positions;
    }
}
