package pl.edu.icm.cermine;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.jdom.Element;
import pl.edu.icm.cermine.bibref.model.BibEntry;
import pl.edu.icm.cermine.bibref.sentiment.model.CitationPosition;
import pl.edu.icm.cermine.bibref.sentiment.model.CitationSentiment;
import pl.edu.icm.cermine.bibref.transformers.BibEntryToNLMElementConverter;
import pl.edu.icm.cermine.content.RawTextWithLabelsExtractor;
import pl.edu.icm.cermine.content.citations.ContentCitationPositionFinder;
import pl.edu.icm.cermine.content.cleaning.ContentCleaner;
import pl.edu.icm.cermine.content.model.BxContentStructure;
import pl.edu.icm.cermine.content.model.ContentStructure;
import pl.edu.icm.cermine.content.transformers.BxContentStructToDocContentStructConverter;
import pl.edu.icm.cermine.content.transformers.DocContentStructToNLMElementConverter;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.metadata.model.DocumentAffiliation;
import pl.edu.icm.cermine.metadata.model.DocumentMetadata;
import pl.edu.icm.cermine.metadata.transformers.DocumentMetadataToNLMElementConverter;
import pl.edu.icm.cermine.parsing.tools.ParsableStringParser;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.tools.timeout.TimeoutRegister;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.9-SNAPSHOT.jar:pl/edu/icm/cermine/ExtractionUtils.class */
/**
 * Static entry points for the individual steps of the document extraction
 * pipeline (structure, metadata, references, body text, citation sentiment).
 *
 * <p>Every step delegates the actual work to a component held by the supplied
 * {@link ComponentConfiguration}. When {@code componentConfiguration.timeDebug}
 * is set, each step prints its wall-clock duration in seconds to standard output.
 */
public class ExtractionUtils {

    /**
     * Prints {@code label} followed by the number of seconds elapsed since
     * {@code startMillis}, but only when time debugging is enabled.
     *
     * @param componentConfiguration configuration whose {@code timeDebug} flag is consulted
     * @param label text printed in front of the elapsed time (used verbatim)
     * @param startMillis value of {@link System#currentTimeMillis()} taken when the step started
     */
    private static void printElapsed(ComponentConfiguration componentConfiguration, String label, long startMillis) {
        if (componentConfiguration.timeDebug) {
            System.out.println(label + ((System.currentTimeMillis() - startMillis) / 1000.0d));
        }
    }

    /**
     * Builds the classified geometric structure of a document read from a stream:
     * character extraction, page segmentation, reading order resolving and
     * initial zone classification, in that order.
     *
     * @param componentConfiguration components used for the individual steps
     * @param inputStream stream handed to the character extractor
     * @return the document returned by the initial classifier
     * @throws AnalysisException propagated from any of the delegated steps
     */
    public static BxDocument extractStructure(ComponentConfiguration componentConfiguration, InputStream inputStream) throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument characters = extractCharacters(componentConfiguration, inputStream);
        // Timeout checkpoint between character extraction and the remaining steps.
        TimeoutRegister.get().check();
        BxDocument segmented = segmentPages(componentConfiguration, characters);
        BxDocument ordered = resolveReadingOrder(componentConfiguration, segmented);
        BxDocument classified = classifyInitially(componentConfiguration, ordered);
        printElapsed(componentConfiguration, "1. Structure extraction: ", start);
        return classified;
    }

    /**
     * Extracts document metadata from a stream by first building the document
     * structure and then running metadata extraction on it.
     *
     * @param componentConfiguration components used for the individual steps
     * @param inputStream stream handed to the character extractor
     * @return the extracted metadata
     * @throws AnalysisException propagated from any of the delegated steps
     */
    public static DocumentMetadata extractMetadata(ComponentConfiguration componentConfiguration, InputStream inputStream) throws AnalysisException {
        return extractMetadata(componentConfiguration, extractStructure(componentConfiguration, inputStream));
    }

    /**
     * Extracts document metadata from a stream and converts it with
     * {@link DocumentMetadataToNLMElementConverter}.
     *
     * @param componentConfiguration components used for the individual steps
     * @param inputStream stream handed to the character extractor
     * @return the element produced by the converter
     * @throws AnalysisException if extraction fails, or wrapping a
     *         {@link TransformationException} thrown during conversion
     */
    public static Element extractMetadataAsNLM(ComponentConfiguration componentConfiguration, InputStream inputStream) throws AnalysisException {
        try {
            DocumentMetadataToNLMElementConverter converter = new DocumentMetadataToNLMElementConverter();
            DocumentMetadata metadata = extractMetadata(componentConfiguration, inputStream);
            return converter.convert(metadata, new Object[0]);
        } catch (TransformationException e) {
            throw new AnalysisException("Cannot extract metadata from the document!", e);
        }
    }

    /**
     * Extracts document metadata from an already built document structure:
     * metadata zone classification, metadata cleaning and affiliation parsing.
     *
     * @param componentConfiguration components used for the individual steps
     * @param bxDocument document structure to analyse
     * @return the metadata object after affiliation parsing
     * @throws AnalysisException propagated from any of the delegated steps
     */
    public static DocumentMetadata extractMetadata(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument classified = classifyMetadata(componentConfiguration, bxDocument);
        DocumentMetadata cleaned = cleanMetadata(componentConfiguration, classified);
        DocumentMetadata metadata = parseAffiliations(componentConfiguration, cleaned);
        printElapsed(componentConfiguration, "2. Metadata extraction: ", start);
        return metadata;
    }

    /**
     * Extracts document metadata from an already built document structure and
     * converts it with {@link DocumentMetadataToNLMElementConverter}.
     *
     * @param componentConfiguration components used for the individual steps
     * @param bxDocument document structure to analyse
     * @return the element produced by the converter
     * @throws AnalysisException if extraction fails, or wrapping a
     *         {@link TransformationException} thrown during conversion
     */
    public static Element extractMetadataAsNLM(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        try {
            DocumentMetadataToNLMElementConverter converter = new DocumentMetadataToNLMElementConverter();
            DocumentMetadata metadata = extractMetadata(componentConfiguration, bxDocument);
            return converter.convert(metadata, new Object[0]);
        } catch (TransformationException e) {
            throw new AnalysisException("Cannot extract metadata from the document!", e);
        }
    }

    /**
     * Extracts the cleaned raw text of a document read from a stream. Only
     * character extraction, page segmentation and reading order resolving are
     * run; no zone classification is performed on this path.
     *
     * @param componentConfiguration components used for the individual steps
     * @param inputStream stream handed to the character extractor
     * @return the cleaned text of the document
     * @throws AnalysisException propagated from any of the delegated steps
     */
    public static String extractRawText(ComponentConfiguration componentConfiguration, InputStream inputStream) throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument characters = extractCharacters(componentConfiguration, inputStream);
        BxDocument segmented = segmentPages(componentConfiguration, characters);
        BxDocument ordered = resolveReadingOrder(componentConfiguration, segmented);
        String rawText = extractRawText(componentConfiguration, ordered);
        printElapsed(componentConfiguration, "Raw text extraction: ", start);
        return rawText;
    }

    /**
     * Returns the text of the given document structure after passing it through
     * {@link ContentCleaner#cleanAll}.
     *
     * @param componentConfiguration not used by this overload
     * @param bxDocument document whose text is taken
     * @return the cleaned text
     * @throws AnalysisException declared for signature consistency with the other steps
     */
    public static String extractRawText(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        return ContentCleaner.cleanAll(bxDocument.toText());
    }

    /**
     * Extracts and parses bibliographic references from a stream by first
     * building the document structure.
     *
     * @param componentConfiguration components used for the individual steps
     * @param inputStream stream handed to the character extractor
     * @return one parsed entry per extracted reference string
     * @throws AnalysisException propagated from any of the delegated steps
     */
    public static BibEntry[] extractReferences(ComponentConfiguration componentConfiguration, InputStream inputStream) throws AnalysisException {
        return extractReferences(componentConfiguration, extractStructure(componentConfiguration, inputStream));
    }

    /**
     * Extracts reference strings from a document structure and parses each of them.
     *
     * @param componentConfiguration components used for the individual steps
     * @param bxDocument document structure to analyse
     * @return one parsed entry per extracted reference string
     * @throws AnalysisException propagated from any of the delegated steps
     */
    public static BibEntry[] extractReferences(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        String[] refStrings = extractRefStrings(componentConfiguration, bxDocument);
        BibEntry[] parsed = parseReferences(componentConfiguration, refStrings);
        printElapsed(componentConfiguration, "3. References extraction: ", start);
        return parsed;
    }

    /**
     * Extracts references from a stream and converts each of them with
     * {@link BibEntryToNLMElementConverter}.
     *
     * @param componentConfiguration components used for the individual steps
     * @param inputStream stream handed to the character extractor
     * @return one element per extracted reference
     * @throws AnalysisException if extraction or conversion fails
     */
    public static Element[] extractReferencesAsNLM(ComponentConfiguration componentConfiguration, InputStream inputStream) throws AnalysisException {
        return convertReferences(extractReferences(componentConfiguration, inputStream));
    }

    /**
     * Extracts references from a document structure and converts each of them
     * with {@link BibEntryToNLMElementConverter}.
     *
     * @param componentConfiguration components used for the individual steps
     * @param bxDocument document structure to analyse
     * @return one element per extracted reference
     * @throws AnalysisException if extraction or conversion fails
     */
    public static Element[] extractReferencesAsNLM(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        return convertReferences(extractReferences(componentConfiguration, bxDocument));
    }

    /**
     * Converts every entry of the given array with a single
     * {@link BibEntryToNLMElementConverter} instance, preserving order.
     *
     * @param bibEntryArr entries to convert
     * @return an array with one converted element per input entry
     * @throws AnalysisException wrapping the first {@link TransformationException}
     *         thrown by the converter
     */
    public static Element[] convertReferences(BibEntry[] bibEntryArr) throws AnalysisException {
        List<Element> converted = new ArrayList<Element>(bibEntryArr.length);
        BibEntryToNLMElementConverter converter = new BibEntryToNLMElementConverter();
        for (BibEntry entry : bibEntryArr) {
            try {
                converted.add(converter.convert(entry, new Object[0]));
            } catch (TransformationException e) {
                throw new AnalysisException("Cannot convert references!", e);
            }
        }
        return converted.toArray(new Element[bibEntryArr.length]);
    }

    /**
     * Extracts the body text of a document read from a stream as an element.
     * The references used for locating citations are extracted from the same
     * document.
     *
     * @param componentConfiguration components used for the individual steps
     * @param inputStream stream handed to the character extractor
     * @return the element produced by {@link DocContentStructToNLMElementConverter}
     * @throws AnalysisException if extraction or conversion fails
     */
    public static Element extractTextAsNLM(ComponentConfiguration componentConfiguration, InputStream inputStream) throws AnalysisException {
        return extractTextAsNLM(componentConfiguration, extractStructure(componentConfiguration, inputStream), null);
    }

    /**
     * Extracts the body text of a document structure and converts it with
     * {@link DocContentStructToNLMElementConverter}, passing the citation
     * positions found by {@link ContentCitationPositionFinder} to the converter.
     *
     * @param componentConfiguration components used for the individual steps
     * @param bxDocument document structure to analyse
     * @param list references to look for in the text; when {@code null} they are
     *        extracted from {@code bxDocument}
     * @return the element produced by the converter
     * @throws AnalysisException if extraction fails, or wrapping a
     *         {@link TransformationException} thrown inside this method
     */
    public static Element extractTextAsNLM(ComponentConfiguration componentConfiguration, BxDocument bxDocument, List<BibEntry> list) throws AnalysisException {
        try {
            DocContentStructToNLMElementConverter converter = new DocContentStructToNLMElementConverter();
            ContentStructure content = extractText(componentConfiguration, bxDocument);
            List<BibEntry> references = list;
            if (references == null) {
                references = Arrays.asList(extractReferences(componentConfiguration, bxDocument));
            }
            // The content structure itself is what gets converted; it must not be
            // cast to the converter type.
            return converter.convert(content, new ContentCitationPositionFinder().findReferences(content, references));
        } catch (TransformationException e) {
            throw new AnalysisException("Cannot extract text from document!", e);
        }
    }

    /**
     * Extracts the body content structure of a document read from a stream.
     *
     * @param componentConfiguration components used for the individual steps
     * @param inputStream stream handed to the character extractor
     * @return the extracted content structure
     * @throws AnalysisException if extraction or conversion fails
     */
    public static ContentStructure extractText(ComponentConfiguration componentConfiguration, InputStream inputStream) throws AnalysisException {
        return extractText(componentConfiguration, extractStructure(componentConfiguration, inputStream));
    }

    /**
     * Extracts the body content structure of a document: content filtering,
     * header extraction, header clustering, content cleanup and conversion with
     * {@link BxContentStructToDocContentStructConverter}.
     *
     * @param componentConfiguration components used for the individual steps
     * @param bxDocument document structure to analyse
     * @return the converted content structure
     * @throws AnalysisException if a step fails, or wrapping a
     *         {@link TransformationException} thrown inside this method
     */
    public static ContentStructure extractText(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        try {
            long start = System.currentTimeMillis();
            BxDocument filtered = filterContent(componentConfiguration, bxDocument);
            BxContentStructure headers = extractHeaders(componentConfiguration, filtered);
            BxContentStructure clustered = clusterHeaders(componentConfiguration, headers);
            componentConfiguration.contentCleaner.cleanupContent(clustered);
            ContentStructure content = new BxContentStructToDocContentStructConverter().convert(clustered, new Object[0]);
            printElapsed(componentConfiguration, "4. Body extraction: ", start);
            return content;
        } catch (TransformationException e) {
            throw new AnalysisException("Cannot extract content from the document!", e);
        }
    }

    /**
     * Extracts labelled raw text from a document read from a stream.
     *
     * @param componentConfiguration components used for the individual steps
     * @param inputStream stream handed to the character extractor
     * @return the element produced by {@link RawTextWithLabelsExtractor}
     * @throws AnalysisException propagated from any of the delegated steps
     */
    public static Element extractRawTextWithLabels(ComponentConfiguration componentConfiguration, InputStream inputStream) throws AnalysisException {
        return extractRawTextWithLabels(componentConfiguration, extractStructure(componentConfiguration, inputStream));
    }

    /**
     * Extracts labelled raw text from a document structure. Metadata
     * classification, content filtering, header extraction and header clustering
     * are run first; their result is passed to {@link RawTextWithLabelsExtractor}
     * together with the original {@code bxDocument} reference.
     *
     * @param componentConfiguration components used for the individual steps
     * @param bxDocument document structure to analyse
     * @return the element produced by the extractor
     * @throws AnalysisException propagated from any of the delegated steps
     */
    public static Element extractRawTextWithLabels(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        RawTextWithLabelsExtractor extractor = new RawTextWithLabelsExtractor();
        BxDocument classified = classifyMetadata(componentConfiguration, bxDocument);
        BxDocument filtered = filterContent(componentConfiguration, classified);
        BxContentStructure headers = extractHeaders(componentConfiguration, filtered);
        BxContentStructure clustered = clusterHeaders(componentConfiguration, headers);
        Element labelled = extractor.extractRawTextWithLabels(bxDocument, clustered);
        printElapsed(componentConfiguration, "Raw text with labels extraction: ", start);
        return labelled;
    }

    /**
     * Step 1.1: runs the configured character extractor on the stream.
     *
     * @param componentConfiguration supplies {@code characterExtractor}
     * @param inputStream stream to read the document from
     * @return the document returned by the character extractor
     * @throws AnalysisException propagated from the character extractor
     */
    public static BxDocument extractCharacters(ComponentConfiguration componentConfiguration, InputStream inputStream) throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument characters = componentConfiguration.characterExtractor.extractCharacters(inputStream);
        printElapsed(componentConfiguration, "1.1 Character extraction: ", start);
        return characters;
    }

    /**
     * Step 1.2: runs the configured document segmenter, after a timeout checkpoint.
     *
     * @param componentConfiguration supplies {@code documentSegmenter}
     * @param bxDocument document to segment
     * @return the document returned by the segmenter
     * @throws AnalysisException propagated from the segmenter
     */
    public static BxDocument segmentPages(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        TimeoutRegister.get().check();
        BxDocument segmented = componentConfiguration.documentSegmenter.segmentDocument(bxDocument);
        printElapsed(componentConfiguration, "1.2 Page segmentation: ", start);
        return segmented;
    }

    /**
     * Step 1.3: runs the configured reading order resolver.
     *
     * @param componentConfiguration supplies {@code readingOrderResolver}
     * @param bxDocument document to process
     * @return the document returned by the resolver
     * @throws AnalysisException propagated from the resolver
     */
    public static BxDocument resolveReadingOrder(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument ordered = componentConfiguration.readingOrderResolver.resolve(bxDocument);
        printElapsed(componentConfiguration, "1.3 Reading order resolving: ", start);
        return ordered;
    }

    /**
     * Step 1.4: runs the configured initial zone classifier.
     *
     * @param componentConfiguration supplies {@code initialClassifier}
     * @param bxDocument document to classify
     * @return the document returned by the classifier
     * @throws AnalysisException propagated from the classifier
     */
    public static BxDocument classifyInitially(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument classified = componentConfiguration.initialClassifier.classifyZones(bxDocument);
        printElapsed(componentConfiguration, "1.4 Initial classification: ", start);
        return classified;
    }

    /**
     * Step 2.1: runs the configured metadata zone classifier.
     *
     * @param componentConfiguration supplies {@code metadataClassifier}
     * @param bxDocument document to classify
     * @return the document returned by the classifier
     * @throws AnalysisException propagated from the classifier
     */
    public static BxDocument classifyMetadata(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument classified = componentConfiguration.metadataClassifier.classifyZones(bxDocument);
        printElapsed(componentConfiguration, "2.1 Metadata classification: ", start);
        return classified;
    }

    /**
     * Step 2.2: runs the configured metadata extractor on a classified document.
     *
     * @param componentConfiguration supplies {@code metadataExtractor}
     * @param bxDocument document to extract metadata from
     * @return the metadata returned by the extractor
     * @throws AnalysisException propagated from the extractor
     */
    public static DocumentMetadata cleanMetadata(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        DocumentMetadata metadata = componentConfiguration.metadataExtractor.extractMetadata(bxDocument);
        printElapsed(componentConfiguration, "2.2 Metadata cleaning: ", start);
        return metadata;
    }

    /**
     * Step 2.3: passes every affiliation of the metadata to the configured
     * affiliation parser. The parser's return value is not used here, so any
     * effect is whatever the parser does to the affiliation object it receives.
     *
     * @param componentConfiguration supplies {@code affiliationParser}
     * @param documentMetadata metadata whose affiliations are parsed
     * @return the same {@code documentMetadata} instance
     * @throws AnalysisException propagated from the affiliation parser
     */
    public static DocumentMetadata parseAffiliations(ComponentConfiguration componentConfiguration, DocumentMetadata documentMetadata) throws AnalysisException {
        long start = System.currentTimeMillis();
        for (DocumentAffiliation affiliation : documentMetadata.getAffiliations()) {
            // The affiliation is the parser's input; it must not be cast to the parser type.
            componentConfiguration.affiliationParser.parse(affiliation);
        }
        printElapsed(componentConfiguration, "2.3 Affiliation parsing: ", start);
        return documentMetadata;
    }

    /**
     * Step 3.1: runs the configured bibliographic reference extractor.
     *
     * @param componentConfiguration supplies {@code bibReferenceExtractor}
     * @param bxDocument document to extract reference strings from
     * @return the reference strings returned by the extractor
     * @throws AnalysisException propagated from the extractor
     */
    public static String[] extractRefStrings(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        String[] refStrings = componentConfiguration.bibReferenceExtractor.extractBibReferences(bxDocument);
        printElapsed(componentConfiguration, "3.1 Reference extraction: ", start);
        return refStrings;
    }

    /**
     * Finds the positions of the given references in the text and analyses the
     * sentiment of each reference's citation contexts.
     *
     * @param componentConfiguration components used for the individual steps
     * @param str text in which citations are searched
     * @param list references to look for
     * @return the sentiment results, in the order produced by the context finder
     */
    public static List<CitationSentiment> analyzeSentiment(ComponentConfiguration componentConfiguration, String str, List<BibEntry> list) {
        return analyzeSentimentFromPositions(componentConfiguration, str, findCitationPositions(componentConfiguration, str, list));
    }

    /**
     * Asks the configured citation context finder for the contexts of the given
     * positions and runs the configured sentiment analyser on each context list.
     *
     * @param componentConfiguration supplies {@code citationContextFinder} and
     *        {@code citationSentimentAnalyser}
     * @param str text the positions refer to
     * @param list citation positions as returned by {@link #findCitationPositions}
     * @return one sentiment result per context list returned by the context finder
     */
    public static List<CitationSentiment> analyzeSentimentFromPositions(ComponentConfiguration componentConfiguration, String str, List<List<CitationPosition>> list) {
        List<CitationSentiment> sentiments = new ArrayList<CitationSentiment>(list.size());
        for (List<String> context : componentConfiguration.citationContextFinder.findContext(str, list)) {
            sentiments.add(componentConfiguration.citationSentimentAnalyser.analyzeSentiment(context));
        }
        return sentiments;
    }

    /**
     * Runs the configured citation position finder.
     *
     * @param componentConfiguration supplies {@code citationPositionFinder}
     * @param str text in which citations are searched
     * @param list references to look for
     * @return the positions returned by the finder
     */
    public static List<List<CitationPosition>> findCitationPositions(ComponentConfiguration componentConfiguration, String str, List<BibEntry> list) {
        return componentConfiguration.citationPositionFinder.findReferences(str, list);
    }

    /**
     * Step 3.2: parses each reference string with the configured reference parser.
     *
     * @param componentConfiguration supplies {@code bibReferenceParser}
     * @param strArr reference strings to parse
     * @return an array of the same length holding the parser's result for each string
     * @throws AnalysisException propagated from the parser
     */
    public static BibEntry[] parseReferences(ComponentConfiguration componentConfiguration, String[] strArr) throws AnalysisException {
        long start = System.currentTimeMillis();
        BibEntry[] parsed = new BibEntry[strArr.length];
        for (int i = 0; i < strArr.length; i++) {
            parsed[i] = componentConfiguration.bibReferenceParser.parseBibReference(strArr[i]);
        }
        printElapsed(componentConfiguration, "3.2 Reference parsing: ", start);
        return parsed;
    }

    /**
     * Step 4.1: runs the configured content filter.
     *
     * @param componentConfiguration supplies {@code contentFilter}
     * @param bxDocument document to filter
     * @return the document returned by the filter
     * @throws AnalysisException propagated from the filter
     */
    public static BxDocument filterContent(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        BxDocument filtered = componentConfiguration.contentFilter.filter(bxDocument);
        printElapsed(componentConfiguration, "4.1 Content filtering: ", start);
        return filtered;
    }

    /**
     * Step 4.2: runs the configured content header extractor.
     *
     * @param componentConfiguration supplies {@code contentHeaderExtractor}
     * @param bxDocument document to extract headers from
     * @return the content structure returned by the extractor
     * @throws AnalysisException propagated from the extractor
     */
    public static BxContentStructure extractHeaders(ComponentConfiguration componentConfiguration, BxDocument bxDocument) throws AnalysisException {
        long start = System.currentTimeMillis();
        BxContentStructure headers = componentConfiguration.contentHeaderExtractor.extractHeaders(bxDocument);
        printElapsed(componentConfiguration, "4.2 Headers extraction: ", start);
        return headers;
    }

    /**
     * Step 4.3: runs the configured header clusterizer on the content structure.
     * The clusterizer's return value, if any, is not used.
     *
     * @param componentConfiguration supplies {@code contentHeaderClusterizer}
     * @param bxContentStructure structure passed to the clusterizer
     * @return the same {@code bxContentStructure} instance
     * @throws AnalysisException propagated from the clusterizer
     */
    public static BxContentStructure clusterHeaders(ComponentConfiguration componentConfiguration, BxContentStructure bxContentStructure) throws AnalysisException {
        long start = System.currentTimeMillis();
        componentConfiguration.contentHeaderClusterizer.clusterHeaders(bxContentStructure);
        printElapsed(componentConfiguration, "4.3 Headers clustering: ", start);
        return bxContentStructure;
    }
}
