package pl.edu.icm.cermine;

import com.google.common.collect.Lists;
import com.itextpdf.text.html.HtmlTags;
import com.itextpdf.text.xml.xmp.PdfSchema;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import org.apache.commons.cli.ParseException;
import org.apache.commons.configuration.tree.DefaultExpressionEngine;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.jdom.Element;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import pl.edu.icm.cermine.bibref.model.BibEntry;
import pl.edu.icm.cermine.configuration.ContentExtractorConfig;
import pl.edu.icm.cermine.configuration.ContentExtractorConfigLoader;
import pl.edu.icm.cermine.content.model.ContentStructure;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.metadata.model.DocumentMetadata;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.transformers.BxDocumentToTrueVizWriter;
import pl.edu.icm.cermine.tools.timeout.Timeout;
import pl.edu.icm.cermine.tools.timeout.TimeoutException;
import pl.edu.icm.cermine.tools.timeout.TimeoutRegister;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.11-SNAPSHOT.jar:pl/edu/icm/cermine/ContentExtractor.class */
/**
 * Timeout-aware facade over {@link InternalContentExtractor}.
 *
 * <p>Every extraction method installs a {@link Timeout} in {@link TimeoutRegister} for the
 * duration of the call, delegates to the internal extractor, and always removes the timeout
 * again afterwards. Each extraction method comes in two public flavours: one that uses the
 * extractor's main timeout, and one that additionally takes a per-call timeout in seconds and
 * combines it with the main timeout through {@link Timeout#min}.
 *
 * <p>The class also provides a command-line entry point, see {@link #main(String[])}.
 */
public class ContentExtractor {
    /** Factor used to convert timeouts given in seconds into milliseconds. */
    private static final long SECONDS_TO_MILLIS = 1000L;

    /** Usage text printed when the command-line arguments cannot be parsed. */
    private static final String USAGE =
            "Usage: ContentExtractor -path <path> [optional parameters]\n\n"
            + "Tool for extracting metadata and content from PDF files.\n\n"
            + "Arguments:\n"
            + "  -path <path>           path to a directory containing PDF files\n"
            + "  -outputs <list>        (optional) comma-separated list of extraction\n"
            + "                         output(s); possible values: \"jats\" (document\n"
            + "                         metadata and content in NLM JATS format), \"text\"\n"
            + "                         (raw document text), \"zones\" (text zones with\n"
            + "                         their labels), \"trueviz\" (geometric structure in\n"
            + "                         TrueViz format); default: \"jats\"\n"
            + "  -exts <list>           (optional) comma-separated list of extensions of the\n"
            + "                         resulting files; the list has to have the same\n"
            + "                         length as output list; default: \"cermxml\"\n"
            + "  -override              override already existing files\n"
            + "  -timeout <seconds>     (optional) approximate maximum allowed processing\n"
            + "                         time for a PDF file in seconds; by default, no\n"
            + "                         timeout is used; the value is approximate because in\n"
            + "                         some cases, the program might be allowed to slightly\n"
            + "                         exceeded this time, say by a second or two\n"
            + "  -configuration <path>\t(optional) path to configuration properties file\n"
            + "                         see https://github.com/CeON/CERMINE\n"
            + "                         for description of available configuration properties\n"
            + "  -threads <num>         (optional) number of threads for parallel processing;\n"
            + "                         default: 3";

    private final InternalContentExtractor extractor;

    /** Timeout applied to every extraction call; replaced by setTimeout/removeTimeout. */
    private Timeout mainTimeout;

    /**
     * Creates an extractor with the configuration returned by
     * {@link ContentExtractorConfigLoader#loadConfiguration()} and a default {@link Timeout}.
     *
     * @throws AnalysisException if the internal extractor cannot be created
     */
    public ContentExtractor() throws AnalysisException {
        this(new ContentExtractorConfigLoader().loadConfiguration());
    }

    /**
     * Creates an extractor with the loader's default configuration and the given main timeout.
     *
     * @param timeoutSeconds main timeout in seconds
     * @throws AnalysisException if the internal extractor cannot be created
     * @throws TimeoutException if the timeout check fails during construction
     */
    public ContentExtractor(long timeoutSeconds) throws AnalysisException, TimeoutException {
        this(new ContentExtractorConfigLoader().loadConfiguration(), timeoutSeconds);
    }

    /**
     * Creates an extractor with the given configuration and a default {@link Timeout}.
     *
     * @param contentExtractorConfig configuration handed to the internal extractor
     * @throws AnalysisException if the internal extractor cannot be created
     */
    public ContentExtractor(ContentExtractorConfig contentExtractorConfig) throws AnalysisException {
        this.mainTimeout = new Timeout();
        this.extractor = new InternalContentExtractor(contentExtractorConfig);
    }

    /**
     * Creates an extractor with the given configuration and main timeout. The timeout is
     * already registered while the internal extractor is being constructed.
     *
     * @param contentExtractorConfig configuration handed to the internal extractor
     * @param timeoutSeconds main timeout in seconds
     * @throws AnalysisException if the internal extractor cannot be created
     * @throws TimeoutException if the timeout check fails during construction
     */
    public ContentExtractor(ContentExtractorConfig contentExtractorConfig, long timeoutSeconds)
            throws AnalysisException, TimeoutException {
        // Assigned directly rather than through the public, overridable setTimeout().
        this.mainTimeout = new Timeout(timeoutSeconds * SECONDS_TO_MILLIS);
        try {
            TimeoutRegister.set(this.mainTimeout);
            TimeoutRegister.get().check();
            this.extractor = new InternalContentExtractor(contentExtractorConfig);
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * Replaces the main timeout with a new one of the given length.
     *
     * @param timeoutSeconds timeout length in seconds
     */
    public void setTimeout(long timeoutSeconds) {
        this.mainTimeout = new Timeout(timeoutSeconds * SECONDS_TO_MILLIS);
    }

    /** Replaces the main timeout with a default-constructed {@link Timeout}. */
    public void removeTimeout() {
        this.mainTimeout = new Timeout();
    }

    /**
     * Passes the PDF input stream to the internal extractor.
     *
     * @param inputStream stream with the PDF content; this class does not close it
     * @throws IOException if the internal extractor reports an I/O problem
     */
    public void setPDF(InputStream inputStream) throws IOException {
        this.extractor.setPDF(inputStream);
    }

    /**
     * Passes an already built document structure to the internal extractor.
     *
     * @param bxDocument document structure to work on
     * @throws IOException if the internal extractor reports an I/O problem
     */
    public void setBxDocument(BxDocument bxDocument) throws IOException {
        this.extractor.setBxDocument(bxDocument);
    }

    /**
     * Resets the internal extractor.
     *
     * @throws IOException if the internal extractor reports an I/O problem
     */
    public void reset() throws IOException {
        this.extractor.reset();
    }

    /**
     * @return the component configuration held by the internal extractor
     */
    public ComponentConfiguration getConf() {
        return this.extractor.getConf();
    }

    /**
     * @param componentConfiguration component configuration to hand to the internal extractor
     */
    public void setConf(ComponentConfiguration componentConfiguration) {
        this.extractor.setConf(componentConfiguration);
    }

    /** Runs {@code extractor.getBxDocument()} with {@code timeout} registered for the call. */
    private BxDocument getBxDocument(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            // Checked before the real work starts; presumably fails fast on an expired timeout.
            TimeoutRegister.get().check();
            return this.extractor.getBxDocument();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the document structure from the internal extractor, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public BxDocument getBxDocument() throws AnalysisException, TimeoutException {
        return getBxDocument(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the document structure from the internal extractor
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public BxDocument getBxDocument(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getBxDocument(combineWithMainTimeout(timeoutSeconds));
    }

    /** Runs {@code extractor.getBxDocumentWithGeneralLabels()} with {@code timeout} registered. */
    private BxDocument getBxDocumentWithGeneralLabels(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            TimeoutRegister.get().check();
            return this.extractor.getBxDocumentWithGeneralLabels();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the document structure with general labels, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public BxDocument getBxDocumentWithGeneralLabels() throws AnalysisException, TimeoutException {
        return getBxDocumentWithGeneralLabels(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the document structure with general labels
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public BxDocument getBxDocumentWithGeneralLabels(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getBxDocumentWithGeneralLabels(combineWithMainTimeout(timeoutSeconds));
    }

    /** Runs {@code extractor.getBxDocumentWithSpecificLabels()} with {@code timeout} registered. */
    private BxDocument getBxDocumentWithSpecificLabels(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            TimeoutRegister.get().check();
            return this.extractor.getBxDocumentWithSpecificLabels();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the document structure with specific labels, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public BxDocument getBxDocumentWithSpecificLabels() throws AnalysisException, TimeoutException {
        return getBxDocumentWithSpecificLabels(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the document structure with specific labels
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public BxDocument getBxDocumentWithSpecificLabels(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getBxDocumentWithSpecificLabels(combineWithMainTimeout(timeoutSeconds));
    }

    /** Runs {@code extractor.getMetadata()} with {@code timeout} registered for the call. */
    private DocumentMetadata getMetadata(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            TimeoutRegister.get().check();
            return this.extractor.getMetadata();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the document metadata from the internal extractor, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public DocumentMetadata getMetadata() throws AnalysisException, TimeoutException {
        return getMetadata(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the document metadata from the internal extractor
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public DocumentMetadata getMetadata(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getMetadata(combineWithMainTimeout(timeoutSeconds));
    }

    /** Runs {@code extractor.getMetadataAsNLM()} with {@code timeout} registered for the call. */
    private Element getMetadataAsNLM(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            TimeoutRegister.get().check();
            return this.extractor.getMetadataAsNLM();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the metadata as an NLM element, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public Element getMetadataAsNLM() throws AnalysisException, TimeoutException {
        return getMetadataAsNLM(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the metadata as an NLM element
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public Element getMetadataAsNLM(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getMetadataAsNLM(combineWithMainTimeout(timeoutSeconds));
    }

    /** Runs {@code extractor.getReferences()} with {@code timeout} registered for the call. */
    private List<BibEntry> getReferences(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            TimeoutRegister.get().check();
            return this.extractor.getReferences();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the bibliographic references from the internal extractor, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public List<BibEntry> getReferences() throws AnalysisException, TimeoutException {
        return getReferences(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the bibliographic references from the internal extractor
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public List<BibEntry> getReferences(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getReferences(combineWithMainTimeout(timeoutSeconds));
    }

    /** Runs {@code extractor.getReferencesAsNLM()} with {@code timeout} registered for the call. */
    private List<Element> getReferencesAsNLM(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            TimeoutRegister.get().check();
            return this.extractor.getReferencesAsNLM();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the references as NLM elements, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public List<Element> getReferencesAsNLM() throws AnalysisException, TimeoutException {
        return getReferencesAsNLM(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the references as NLM elements
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public List<Element> getReferencesAsNLM(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getReferencesAsNLM(combineWithMainTimeout(timeoutSeconds));
    }

    /** Runs {@code extractor.getRawFullText()} with {@code timeout} registered for the call. */
    private String getRawFullText(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            TimeoutRegister.get().check();
            return this.extractor.getRawFullText();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the raw full text from the internal extractor, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public String getRawFullText() throws AnalysisException, TimeoutException {
        return getRawFullText(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the raw full text from the internal extractor
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public String getRawFullText(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getRawFullText(combineWithMainTimeout(timeoutSeconds));
    }

    /** Runs {@code extractor.getLabelledFullText()} with {@code timeout} registered for the call. */
    private Element getLabelledFullText(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            TimeoutRegister.get().check();
            return this.extractor.getLabelledFullText();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the labelled full text element, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public Element getLabelledFullText() throws AnalysisException, TimeoutException {
        return getLabelledFullText(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the labelled full text element
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public Element getLabelledFullText(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getLabelledFullText(combineWithMainTimeout(timeoutSeconds));
    }

    /** Runs {@code extractor.getBody()} with {@code timeout} registered for the call. */
    private ContentStructure getBody(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            TimeoutRegister.get().check();
            return this.extractor.getBody();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the body content structure, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public ContentStructure getBody() throws AnalysisException, TimeoutException {
        return getBody(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the body content structure
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public ContentStructure getBody(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getBody(combineWithMainTimeout(timeoutSeconds));
    }

    /** Runs {@code extractor.getBodyAsNLM()} with {@code timeout} registered for the call. */
    private Element getBodyAsNLM(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            TimeoutRegister.get().check();
            return this.extractor.getBodyAsNLM();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the body as an NLM element, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public Element getBodyAsNLM() throws AnalysisException, TimeoutException {
        return getBodyAsNLM(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the body as an NLM element
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public Element getBodyAsNLM(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getBodyAsNLM(combineWithMainTimeout(timeoutSeconds));
    }

    /** Runs {@code extractor.getContentAsNLM()} with {@code timeout} registered for the call. */
    private Element getContentAsNLM(Timeout timeout) throws AnalysisException, TimeoutException {
        try {
            TimeoutRegister.set(timeout);
            TimeoutRegister.get().check();
            return this.extractor.getContentAsNLM();
        } finally {
            TimeoutRegister.remove();
        }
    }

    /**
     * @return the whole content as an NLM element, under the main timeout
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public Element getContentAsNLM() throws AnalysisException, TimeoutException {
        return getContentAsNLM(this.mainTimeout);
    }

    /**
     * @param timeoutSeconds per-call timeout in seconds, combined with the main timeout
     * @return the whole content as an NLM element
     * @throws AnalysisException if extraction fails
     * @throws TimeoutException if the timeout check fails
     */
    public Element getContentAsNLM(long timeoutSeconds) throws AnalysisException, TimeoutException {
        return getContentAsNLM(combineWithMainTimeout(timeoutSeconds));
    }

    /** Combines the main timeout with a per-call timeout (in seconds) via {@link Timeout#min}. */
    private Timeout combineWithMainTimeout(long timeoutSeconds) {
        return Timeout.min(this.mainTimeout, new Timeout(timeoutSeconds * SECONDS_TO_MILLIS));
    }

    /**
     * Command-line entry point: processes the PDF files found recursively under the given
     * directory and writes the requested outputs next to each of them.
     *
     * <p>A file is skipped when all of its output files already exist and overriding was not
     * requested. Analysis, transformation and timeout failures are printed and processing
     * continues with the next file; an {@link IOException} aborts the run.
     *
     * @param strArr command-line arguments; on a parse error the usage text is printed to
     *               standard error and the JVM exits with status 1
     * @throws ParseException if the option parser rejects the arguments
     * @throws AnalysisException if the configuration-driven setup fails outside per-file handling
     * @throws IOException if reading a PDF or writing an output file fails
     * @throws TransformationException declared for compatibility; per-file occurrences are caught
     */
    public static void main(String[] strArr) throws ParseException, AnalysisException, IOException, TransformationException {
        CommandLineOptionsParser optionsParser = new CommandLineOptionsParser();
        String parseError = optionsParser.parse(strArr);
        if (parseError != null) {
            System.err.println(parseError + "\n");
            System.err.println(USAGE);
            System.exit(1);
        }
        InternalContentExtractor.THREADS_NUMBER = optionsParser.getThreadsNumber();
        boolean override = optionsParser.override();
        Long timeoutSeconds = optionsParser.getTimeout();
        String path = optionsParser.getPath();
        Map<String, String> typesAndExtensions = optionsParser.getTypesAndExtensions();
        // NOTE(review): PdfSchema.DEFAULT_XPATH_ID stands in for the file extension here,
        // presumably a decompiler substitution for the literal "pdf" — confirm.
        Collection<File> pdfFiles = FileUtils.listFiles(new File(path), new String[]{PdfSchema.DEFAULT_XPATH_ID}, true);
        ContentExtractorConfigLoader configLoader = new ContentExtractorConfigLoader();
        ContentExtractorConfig config = optionsParser.getConfigurationPath() == null
                ? configLoader.loadConfiguration()
                : configLoader.loadConfiguration(optionsParser.getConfigurationPath());

        int processed = 0;
        for (File pdf : pdfFiles) {
            // Output type -> target file, restricted to outputs that still have to be produced.
            Map<String, File> outputs = new HashMap<>();
            for (Map.Entry<String, String> entry : typesAndExtensions.entrySet()) {
                File outputFile = getOutputFile(pdf, entry.getValue());
                if (override || !outputFile.exists()) {
                    outputs.put(entry.getKey(), outputFile);
                }
            }
            if (outputs.isEmpty()) {
                // Nothing to do for this file, but it still counts towards the progress.
                processed++;
                continue;
            }

            long start = System.currentTimeMillis();
            long end;
            System.out.println("File processed: " + pdf.getPath());
            ContentExtractor contentExtractor = null;
            try {
                contentExtractor = createContentExtractor(config, timeoutSeconds);
                writeOutputs(contentExtractor, pdf, outputs);
            } catch (AnalysisException e) {
                printException(e);
            } catch (TransformationException e) {
                printException(e);
            } catch (TimeoutException e) {
                printException(e);
            } finally {
                if (contentExtractor != null) {
                    contentExtractor.removeTimeout();
                }
                end = System.currentTimeMillis();
            }

            float elapsedSeconds = ((float) (end - start)) / 1000.0f;
            processed++;
            int percentage = (processed * 100) / pdfFiles.size();
            // NOTE(review): HtmlTags.S and DefaultExpressionEngine.DEFAULT_INDEX_END look like
            // decompiler substitutions for the literals "s" and ")" — confirm before inlining.
            System.out.println("Extraction time: " + Math.round(elapsedSeconds) + HtmlTags.S);
            System.out.println("Progress: " + percentage + "% done (" + processed + " out of " + pdfFiles.size() + DefaultExpressionEngine.DEFAULT_INDEX_END);
            System.out.println("");
        }
    }

    /**
     * Feeds one PDF file to the extractor and writes every requested output.
     *
     * <p>The input stream stays open until all outputs are written and is then closed, because
     * this class cannot tell at which point the internal extractor consumes it.
     */
    private static void writeOutputs(ContentExtractor contentExtractor, File pdf, Map<String, File> outputs)
            throws AnalysisException, IOException, TransformationException, TimeoutException {
        try (InputStream pdfStream = new FileInputStream(pdf)) {
            contentExtractor.setPDF(pdfStream);
            if (outputs.containsKey("jats")) {
                Element content = contentExtractor.getContentAsNLM();
                FileUtils.writeStringToFile(outputs.get("jats"), new XMLOutputter(Format.getPrettyFormat()).outputString(content), "UTF-8");
            }
            if (outputs.containsKey("trueviz")) {
                // Extract first so that a failed extraction does not leave an empty file behind.
                BxDocument document = contentExtractor.getBxDocumentWithSpecificLabels();
                try (FileWriter trueVizWriter = new FileWriter(outputs.get("trueviz"))) {
                    new BxDocumentToTrueVizWriter().write(trueVizWriter, Lists.newArrayList(document), new Object[0]);
                }
            }
            if (outputs.containsKey("zones")) {
                Element zones = contentExtractor.getLabelledFullText();
                FileUtils.writeStringToFile(outputs.get("zones"), new XMLOutputter(Format.getPrettyFormat()).outputString(zones), "UTF-8");
            }
            if (outputs.containsKey("text")) {
                FileUtils.writeStringToFile(outputs.get("text"), contentExtractor.getRawFullText(), "UTF-8");
            }
        }
    }

    /**
     * Creates an extractor for one file, with a main timeout when {@code l} is not null, and
     * then runs a check on whatever timeout {@link TimeoutRegister#get()} returns.
     */
    private static ContentExtractor createContentExtractor(ContentExtractorConfig contentExtractorConfig, Long l) throws TimeoutException, AnalysisException {
        ContentExtractor contentExtractor;
        if (l != null) {
            contentExtractor = new ContentExtractor(contentExtractorConfig, l.longValue());
        } else {
            contentExtractor = new ContentExtractor(contentExtractorConfig);
        }
        TimeoutRegister.get().check();
        return contentExtractor;
    }

    /**
     * Derives the output file by replacing a trailing "pdf" in the input path with {@code str}.
     * The extension is quoted so that '$' or '\' in it is taken literally.
     */
    private static File getOutputFile(File file, String str) {
        return new File(file.getPath().replaceFirst("pdf$", Matcher.quoteReplacement(str)));
    }

    /** Prints the stack trace of a handled per-file failure to standard output. */
    private static void printException(Exception exc) {
        System.out.print("Exception occured: " + ExceptionUtils.getStackTrace(exc));
    }
}
