package pl.edu.icm.cermine;

import com.itextpdf.text.html.HtmlTags;
import com.itextpdf.text.pdf.ColumnText;
import com.itextpdf.text.xml.xmp.PdfSchema;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.httpclient.cookie.Cookie2;
import org.apache.commons.io.FileUtils;
import org.jdom.Content;
import org.jdom.Element;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import pl.edu.icm.cermine.bibref.model.BibEntry;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.structure.SVMAlternativeMetadataZoneClassifier;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.transformers.BxDocumentToTrueVizWriter;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.4-SNAPSHOT.jar:pl/edu/icm/cermine/PdfNLMContentExtractor.class */
/**
 * Extracts metadata, body text and bibliographic references from a PDF and
 * assembles them into a single XML element with {@code front}, {@code body}
 * and {@code back/ref-list} children.
 *
 * <p>Each of the three parts can be switched off individually; a disabled part
 * is emitted as an empty element so the overall document shape stays the same.
 * The class also provides a command-line entry point, see {@link #main(String[])}.
 */
public class PdfNLMContentExtractor {
    /**
     * Value of the {@code -threads} command-line option (default 3). It is only
     * assigned in this class, never read here; it stays public and mutable
     * because code outside this file may read it.
     */
    public static int THREADS_NUMBER = 3;

    // Plain literals for values that previously came from constants of unrelated
    // libraries (commons-httpclient Cookie2.PATH, iText PdfSchema.DEFAULT_XPATH_ID,
    // iText HtmlTags.BODY). Those constants hold exactly these strings; the usage
    // text ("-path") and the "pdf$" pattern of the original code agree with them.
    private static final String OPT_PATH = "path";
    private static final String PDF_EXTENSION = "pdf";
    private static final String BODY_TAG = "body";

    private boolean extractMetadata = true;
    private boolean extractReferences = true;
    private boolean extractText = true;
    private ComponentConfiguration conf = new ComponentConfiguration();

    /**
     * Extracts the content of a PDF read from the given stream.
     *
     * <p>The stream is not closed by this method; the caller owns it.
     *
     * @param inputStream PDF input
     * @return the assembled content element
     * @throws AnalysisException if structure or content extraction fails
     */
    public Element extractContent(InputStream inputStream) throws AnalysisException {
        return extractContent(ExtractionUtils.extractStructure(this.conf, inputStream));
    }

    /**
     * Builds the content element from an already extracted document structure.
     *
     * @param bxDocument the document structure to process
     * @return root element holding {@code front}, {@code body} and {@code back}
     * @throws AnalysisException if any of the enabled extraction steps fails
     */
    public Element extractContent(BxDocument bxDocument) throws AnalysisException {
        Element article = new Element(BibEntry.TYPE_ARTICLE);

        Content front = new Element("front");
        if (this.extractMetadata) {
            Element extractedFront = ExtractionUtils.extractMetadataAsNLM(this.conf, bxDocument).getChild("front");
            // Keep the empty placeholder when the metadata has no "front" child
            // instead of failing with a NullPointerException.
            if (extractedFront != null) {
                // Clone so the element can be attached to a new parent.
                front = (Element) extractedFront.clone();
            }
        }
        article.addContent(front);

        Content body = new Element(BODY_TAG);
        if (this.extractText) {
            body = ExtractionUtils.extractTextAsNLM(this.conf, bxDocument);
        }
        article.addContent(body);

        Element back = new Element("back");
        Element refList = new Element("ref-list");
        if (this.extractReferences) {
            for (Element reference : ExtractionUtils.extractReferencesAsNLM(this.conf, bxDocument)) {
                Element ref = new Element("ref");
                ref.addContent(reference);
                refList.addContent(ref);
            }
        }
        back.addContent(refList);
        article.addContent(back);
        return article;
    }

    /** @return the component configuration used by the extraction steps */
    public ComponentConfiguration getConf() {
        return this.conf;
    }

    /** @param componentConfiguration the component configuration to use from now on */
    public void setConf(ComponentConfiguration componentConfiguration) {
        this.conf = componentConfiguration;
    }

    /** @return whether the {@code front} element is filled with extracted metadata */
    public boolean isExtractMetadata() {
        return this.extractMetadata;
    }

    /** @param z {@code true} to extract metadata, {@code false} to emit an empty {@code front} */
    public void setExtractMetadata(boolean z) {
        this.extractMetadata = z;
    }

    /** @return whether the {@code ref-list} element is filled with extracted references */
    public boolean isExtractReferences() {
        return this.extractReferences;
    }

    /** @param z {@code true} to extract references, {@code false} to emit an empty {@code ref-list} */
    public void setExtractReferences(boolean z) {
        this.extractReferences = z;
    }

    /** @return whether the {@code body} element is filled with extracted text */
    public boolean isExtractText() {
        return this.extractText;
    }

    /** @param z {@code true} to extract body text, {@code false} to emit an empty {@code body} */
    public void setExtractText(boolean z) {
        this.extractText = z;
    }

    /**
     * Command-line entry point.
     *
     * <p>If {@code -path} points to a file, its content is extracted and printed
     * to standard output. If it points to a directory, every {@code .pdf} file
     * below it (recursively) that does not yet have an output file is processed
     * and the result is written next to the PDF. Extraction failures are printed
     * and do not stop the processing of the remaining files.
     *
     * @param strArr command-line arguments
     * @throws ParseException if the arguments cannot be parsed
     * @throws IOException if a model, input or output file cannot be read or written
     */
    public static void main(String[] strArr) throws ParseException, IOException {
        Options options = new Options();
        options.addOption(OPT_PATH, true, "file or directory path");
        options.addOption("ext", true, "metadata file extension");
        options.addOption("str", false, "store structure (TrueViz) files as well");
        options.addOption("strext", true, "structure file extension");
        options.addOption("modelmeta", true, "path to metadata classifier model");
        options.addOption("modelinit", true, "path to initial classifier model");
        options.addOption("threads", true, "number of threads used");
        CommandLine line = new GnuParser().parse(options, strArr);

        String path = line.getOptionValue(OPT_PATH);
        if (path == null) {
            System.err.println("Usage: PdfNLMContentExtractor -path <path> [optional parameters]\n\n"
                    + "Tool for extracting metadata and content from PDF files.\n\n"
                    + "Arguments:\n"
                    + "  -path <path>              path to a PDF file or directory containing PDF files\n"
                    + "  -ext <extension>          (optional) the extension of the resulting metadata file;\n"
                    + "                            default: \"cermxml\"; used only if passed path is a directory\n"
                    + "  -modelmeta <path>         (optional) the path to the metadata classifier model file\n"
                    + "  -modelinit <path>         (optional) the path to the initial classifier model file\n"
                    + "  -str                      whether to store structure (TrueViz) files as well;\n"
                    + "                            used only if passed path is a directory\n"
                    + "  -strext <extension>       (optional) the extension of the structure (TrueViz) file;\n"
                    + "                            default: \"cxml\"; used only if passed path is a directory\n"
                    + "  -threads <num>            number of threads for parallel processing\n");
            System.exit(1);
            return;
        }

        String extension = line.hasOption("ext") ? line.getOptionValue("ext") : "cermxml";
        boolean storeStructure = line.hasOption("str");
        String structureExtension = line.hasOption("strext") ? line.getOptionValue("strext") : "cxml";

        // Each model file is accompanied by a "<model>.range" file.
        String metaModel = null;
        String metaRange = null;
        if (line.hasOption("modelmeta")) {
            metaModel = line.getOptionValue("modelmeta");
            metaRange = metaModel + ".range";
        }
        String initModel = null;
        String initRange = null;
        if (line.hasOption("modelinit")) {
            initModel = line.getOptionValue("modelinit");
            initRange = initModel + ".range";
        }

        if (line.hasOption("threads")) {
            String threads = line.getOptionValue("threads");
            try {
                THREADS_NUMBER = Integer.parseInt(threads);
            } catch (NumberFormatException e) {
                // Report a bad value as a usage error rather than an uncaught exception.
                System.err.println("Invalid value for -threads (expected an integer): " + threads);
                System.exit(1);
                return;
            }
        }

        File file = new File(path);
        if (file.isFile()) {
            List<Closeable> opened = new ArrayList<Closeable>();
            try {
                PdfNLMContentExtractor extractor = createExtractor(metaModel, metaRange, initModel, initRange, opened);
                Element content = extractor.extractContent(open(file, opened));
                System.out.println(new XMLOutputter(Format.getPrettyFormat()).outputString(content));
            } catch (AnalysisException e) {
                e.printStackTrace();
            } finally {
                closeAll(opened);
            }
            return;
        }
        if (!file.isDirectory()) {
            // FileUtils.listFiles would reject this path with an IllegalArgumentException.
            System.err.println("Path is neither a file nor a directory: " + path);
            System.exit(1);
            return;
        }

        Collection<File> pdfFiles = FileUtils.listFiles(file, new String[]{PDF_EXTENSION}, true);
        int processed = 0;
        for (File pdf : pdfFiles) {
            File output = new File(replacePdfExtension(pdf.getPath(), extension));
            if (output.exists()) {
                // Already extracted earlier: count it, but stay silent.
                processed++;
                continue;
            }

            long start = System.currentTimeMillis();
            // Stays negative until extraction has finished successfully.
            long extractionEnd = -1L;
            System.out.println(pdf.getPath());

            List<Closeable> opened = new ArrayList<Closeable>();
            try {
                PdfNLMContentExtractor extractor = createExtractor(metaModel, metaRange, initModel, initRange, opened);
                BxDocument structure = ExtractionUtils.extractStructure(extractor.getConf(), open(pdf, opened));
                Element content = extractor.extractContent(structure);
                // The reported time covers extraction only, not writing the results.
                extractionEnd = System.currentTimeMillis();

                XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
                if (!output.createNewFile()) {
                    System.out.println("Cannot create new file!");
                }
                // The XML is written without an encoding declaration, so it must
                // be UTF-8 on disk regardless of the platform default charset.
                FileUtils.writeStringToFile(output, outputter.outputString(content), "UTF-8");

                if (storeStructure) {
                    // NOTE(review): FileWriter uses the platform default charset;
                    // confirm which encoding the TrueViz output is expected to have.
                    FileWriter writer = new FileWriter(
                            new File(replacePdfExtension(pdf.getPath(), structureExtension)));
                    try {
                        new BxDocumentToTrueVizWriter().write(writer, structure.getPages(), new Object[0]);
                        // Flush explicitly so write errors surface here; close() below is quiet.
                        writer.flush();
                    } finally {
                        closeQuietly(writer);
                    }
                }
            } catch (AnalysisException e2) {
                e2.printStackTrace();
            } catch (TransformationException e3) {
                e3.printStackTrace();
            } finally {
                closeAll(opened);
            }

            processed++;
            int percentage = (processed * 100) / pdfFiles.size();
            if (extractionEnd < 0) {
                // Extraction failed: report the time spent until the failure.
                extractionEnd = System.currentTimeMillis();
            }
            float seconds = ((float) (extractionEnd - start)) / 1000.0f;
            System.out.println("Extraction time: " + Math.round(seconds) + "s");
            System.out.println(percentage + "% done (" + processed + " out of " + pdfFiles.size() + ")");
            System.out.println("");
        }
    }

    /**
     * Creates an extractor and installs the requested zone classifiers.
     *
     * <p>Every stream opened here is added to {@code opened}. The streams are
     * deliberately left open: the caller closes them once it is done with the
     * extractor, so this code does not depend on when the configuration reads them.
     */
    private static PdfNLMContentExtractor createExtractor(String metaModel, String metaRange,
            String initModel, String initRange, List<Closeable> opened)
            throws AnalysisException, IOException {
        PdfNLMContentExtractor extractor = new PdfNLMContentExtractor();
        if ("alt-humanities".equals(metaModel)) {
            // Special model name selecting the alternative classifier's default
            // instance instead of loading a model file.
            extractor.getConf().setMetadataZoneClassifier(SVMAlternativeMetadataZoneClassifier.getDefaultInstance());
        } else if (metaModel != null) {
            extractor.getConf().setMetadataZoneClassifier(
                    open(new File(metaModel), opened), open(new File(metaRange), opened));
        }
        if (initModel != null) {
            extractor.getConf().setInitialZoneClassifier(
                    open(new File(initModel), opened), open(new File(initRange), opened));
        }
        return extractor;
    }

    /** Opens a file for reading and registers the stream in {@code opened} for later closing. */
    private static FileInputStream open(File file, List<Closeable> opened) throws IOException {
        FileInputStream stream = new FileInputStream(file);
        opened.add(stream);
        return stream;
    }

    /** Closes all given resources, ignoring failures of individual close calls. */
    private static void closeAll(List<Closeable> resources) {
        for (Closeable resource : resources) {
            closeQuietly(resource);
        }
    }

    /** Closes a resource, ignoring {@code null} and any {@link IOException} from {@code close()}. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done here: inputs were only read, and the
            // writer is flushed before this is called.
        }
    }

    /**
     * Replaces a trailing "pdf" of the path with the given extension, taking the
     * extension literally (characters such as '$' or '\' have no special meaning).
     * Paths not ending in "pdf" are returned unchanged.
     */
    private static String replacePdfExtension(String path, String extension) {
        if (!path.endsWith(PDF_EXTENSION)) {
            return path;
        }
        return path.substring(0, path.length() - PDF_EXTENSION.length()) + extension;
    }
}
