package pl.edu.icm.cermine.evaluation;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathExpressionException;
import org.apache.commons.lang.StringUtils;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.DOMOutputter;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import pl.edu.icm.cermine.PdfNLMMetadataExtractor;
import pl.edu.icm.cermine.bibref.model.BibEntry;
import pl.edu.icm.cermine.evaluation.tools.CosineDistance;
import pl.edu.icm.cermine.evaluation.tools.DateComparator;
import pl.edu.icm.cermine.evaluation.tools.PdfNlmIterator;
import pl.edu.icm.cermine.evaluation.tools.PdfNlmPair;
import pl.edu.icm.cermine.evaluation.tools.SmithWatermanDistance;
import pl.edu.icm.cermine.evaluation.tools.StringTools;
import pl.edu.icm.cermine.evaluation.tools.XMLTools;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.1.jar:pl/edu/icm/cermine/evaluation/FinalMetadataExtractionEvaluation.class */
public final class FinalMetadataExtractionEvaluation {
    private Boolean verbose;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:WEB-INF/lib/cermine-impl-1.1.jar:pl/edu/icm/cermine/evaluation/FinalMetadataExtractionEvaluation$CorrectAllPair.class */
    /**
     * Mutable tally of how many compared items were correct out of all
     * items that were compared.
     */
    public static class CorrectAllPair {
        /** Number of comparisons counted as correct. */
        public Integer correct = 0;
        /** Total number of comparisons counted. */
        public Integer all = 0;

        @Override
        public String toString() {
            return "[Correct: " + this.correct + ", all: " + this.all + "]";
        }

        /**
         * Computes the fraction of correct comparisons.
         *
         * @return {@code correct / all} as a floating-point fraction, or
         *         {@code null} when nothing has been counted yet
         */
        public Double calculateAccuracy() {
            if (this.all.intValue() == 0) {
                return null;
            }
            // Promote to double before dividing; int / int would truncate
            // every partial accuracy down to 0.
            return Double.valueOf((double) this.correct.intValue() / this.all.intValue());
        }
    }

    /**
     * Creates an evaluator.
     *
     * @param bool whether per-document diagnostics should be printed;
     *             {@code null} is treated as {@code false}
     */
    public FinalMetadataExtractionEvaluation(Boolean bool) {
        // Normalise null to FALSE so that later unboxing of the flag cannot
        // throw a NullPointerException.
        this.verbose = Boolean.valueOf(Boolean.TRUE.equals(bool));
    }

    /**
     * Writes the given line to standard output, but only when this
     * evaluator was created in verbose mode.
     *
     * @param str the line to print
     */
    private void printVerbose(String str) {
        boolean enabled = this.verbose.booleanValue();
        if (!enabled) {
            return;
        }
        System.out.println(str);
    }

    /**
     * Extracts metadata from every PDF supplied by the iterator, compares it
     * field by field with the paired NLM ground-truth file, and prints a
     * summary of the aggregated scores to standard output.
     *
     * <p>Free-text fields (abstract, titles, publisher) are scored with
     * {@link #compareStringsSW}; author and keyword lists are scored with
     * {@link #calculatePrecision} / {@link #calculateRecall}; identifier-like
     * fields are tallied as exact matches. A field only contributes to its
     * score when the ground-truth document provides a value for it.
     *
     * @param pdfNlmIterator source of PDF / NLM file pairs to evaluate
     * @throws ParserConfigurationException if the XML parser cannot be configured
     * @throws IOException if a PDF or NLM file cannot be read
     * @throws SAXException if an NLM file cannot be parsed
     */
    public void evaluate(PdfNlmIterator pdfNlmIterator) throws AnalysisException, IOException, TransformationException, ParserConfigurationException, SAXException, JDOMException, XPathExpressionException, TransformerException {
        PdfNLMMetadataExtractor metadataExtractor = new PdfNLMMetadataExtractor();

        DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
        builderFactory.setValidating(false);
        builderFactory.setFeature("http://xml.org/sax/features/namespaces", false);
        builderFactory.setFeature("http://xml.org/sax/features/validation", false);
        builderFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        builderFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        // A configuration failure propagates to the caller (it is part of the
        // declared signature) instead of terminating the whole JVM.
        DocumentBuilder documentBuilder = builderFactory.newDocumentBuilder();

        // Exact-match tallies. NOTE: issn, volume and issue are tallied but are
        // not part of the summary printed at the end of this method.
        CorrectAllPair issn = new CorrectAllPair();
        CorrectAllPair doi = new CorrectAllPair();
        CorrectAllPair urn = new CorrectAllPair();
        CorrectAllPair volume = new CorrectAllPair();
        CorrectAllPair issue = new CorrectAllPair();
        CorrectAllPair pages = new CorrectAllPair();
        CorrectAllPair dateYears = new CorrectAllPair();
        CorrectAllPair dateFull = new CorrectAllPair();

        // Per-document scores; a null entry means "no ground truth for this document".
        int docCount = pdfNlmIterator.size().intValue();
        List<Double> abstractRates = new ArrayList<Double>(docCount);
        List<Double> titleRates = new ArrayList<Double>(docCount);
        List<Double> journalTitleRates = new ArrayList<Double>(docCount);
        List<Double> publisherRates = new ArrayList<Double>(docCount);
        List<Double> keywordPrecisions = new ArrayList<Double>(docCount);
        List<Double> keywordRecalls = new ArrayList<Double>(docCount);
        List<Double> authorPrecisions = new ArrayList<Double>(docCount);
        List<Double> authorRecalls = new ArrayList<Double>(docCount);

        Iterator<PdfNlmPair> pairs = pdfNlmIterator.iterator();
        while (pairs.hasNext()) {
            PdfNlmPair pair = pairs.next();
            printVerbose(">>>>>>>>> " + pair.getPdf().getName());

            // Ground truth document; the stream is closed even if parsing fails.
            Document expectedDoc;
            InputStream nlmStream = new FileInputStream(pair.getNlm());
            try {
                expectedDoc = documentBuilder.parse(nlmStream);
            } finally {
                nlmStream.close();
            }

            // Document produced by the extractor under evaluation.
            Document extractedDoc;
            InputStream pdfStream = new FileInputStream(pair.getPdf());
            try {
                extractedDoc = ElementToW3CDocument(metadataExtractor.extractMetadata(pdfStream));
            } finally {
                pdfStream.close();
            }

            String expectedTitle = XMLTools.extractTextFromNode(expectedDoc, "/article/front/article-meta/title-group/article-title");
            String extractedTitle = XMLTools.extractTextFromNode(extractedDoc, "/article/front/article-meta/title-group/article-title");
            List<String> expectedAuthors = XMLTools.extractTextAsList(expectedDoc, "/article/front/article-meta/contrib-group/contrib//name");
            List<String> extractedAuthors = XMLTools.extractTextAsList(extractedDoc, "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']/string-name");
            List<String> expectedKeywords = XMLTools.extractTextAsList(expectedDoc, "/article/front/article-meta/kwd-group/kwd");
            List<String> extractedKeywords = XMLTools.extractTextAsList(extractedDoc, "/article/front/article-meta/kwd-group/kwd");
            String expectedJournalTitle = XMLTools.extractTextFromNode(expectedDoc, "/article/front/journal-meta/journal-title-group/journal-title");
            String extractedJournalTitle = XMLTools.extractTextFromNode(extractedDoc, "/article/front/journal-meta/journal-title-group/journal-title");
            String expectedPublisher = XMLTools.extractTextFromNode(expectedDoc, "/article/front/journal-meta/publisher/publisher-name");
            String extractedPublisher = XMLTools.extractTextFromNode(extractedDoc, "/article/front/journal-meta/publisher/publisher-name");
            String expectedAbstract = XMLTools.extractTextFromNode(expectedDoc, "/article/front/article-meta/abstract");
            String extractedAbstract = XMLTools.extractTextFromNode(extractedDoc, "/article/front/article-meta/abstract");
            String expectedDoi = XMLTools.extractTextFromNode(expectedDoc, "/article/front/article-meta/article-id[@pub-id-type='doi']");
            String extractedDoi = XMLTools.extractTextFromNode(extractedDoc, "/article/front/article-meta/article-id[@pub-id-type='doi']");
            String expectedIssn = XMLTools.extractTextFromNode(expectedDoc, "/article/front/journal-meta/issn[@pub-type='ppub']");
            String extractedIssn = XMLTools.extractTextFromNode(extractedDoc, "/article/front/journal-meta/issn[@pub-type='ppub']");
            String expectedUrn = XMLTools.extractTextFromNode(expectedDoc, "/article/front/article-meta/article-id[@pub-id-type='urn']");
            String extractedUrn = XMLTools.extractTextFromNode(extractedDoc, "/article/front/article-meta/article-id[@pub-id-type='urn']");
            String expectedVolume = XMLTools.extractTextFromNode(expectedDoc, "/article/front/article-meta/volume");
            String extractedVolume = XMLTools.extractTextFromNode(extractedDoc, "/article/front/article-meta/volume");
            String expectedIssue = XMLTools.extractTextFromNode(expectedDoc, "/article/front/article-meta/issue");
            String extractedIssue = XMLTools.extractTextFromNode(extractedDoc, "/article/front/article-meta/issue");
            String expectedFirstPage = XMLTools.extractTextFromNode(expectedDoc, "/article/front/article-meta/fpage");
            String extractedFirstPage = XMLTools.extractTextFromNode(extractedDoc, "/article/front/article-meta/fpage");
            String expectedLastPage = XMLTools.extractTextFromNode(expectedDoc, "/article/front/article-meta/lpage");
            String extractedLastPage = XMLTools.extractTextFromNode(extractedDoc, "/article/front/article-meta/lpage");
            List<String> expectedDates = removeLeadingZerosFromDate(XMLTools.extractTextAsList(expectedDoc, "/article/front/article-meta/pub-date"));
            List<String> extractedDates = removeLeadingZerosFromDate(XMLTools.extractTextAsList(extractedDoc, "/article/front/article-meta/pub-date"));

            // Exact-match fields.
            countExactMatch(volume, expectedVolume, extractedVolume);
            countExactMatch(issue, expectedIssue, extractedIssue);
            countExactMatch(issn, expectedIssn, extractedIssn);
            countExactMatch(doi, expectedDoi, extractedDoi);
            countExactMatch(urn, expectedUrn, extractedUrn);
            // A page range counts only when both ends are known, and is
            // correct only when both ends match.
            if (!expectedFirstPage.isEmpty() && !expectedLastPage.isEmpty()) {
                recordOutcome(pages, expectedFirstPage.equals(extractedFirstPage) && expectedLastPage.equals(extractedLastPage));
            }
            if (!expectedDates.isEmpty()) {
                // The comparator may answer null; such documents are not counted.
                Boolean yearsMatch = DateComparator.yearsMatch(expectedDates, extractedDates);
                if (yearsMatch != null) {
                    recordOutcome(dateYears, yearsMatch.booleanValue());
                }
                Boolean datesMatch = DateComparator.datesMatch(expectedDates, extractedDates);
                if (datesMatch != null) {
                    recordOutcome(dateFull, datesMatch.booleanValue());
                }
            }

            // Free-text fields.
            publisherRates.add(similarityOrNull(expectedPublisher, extractedPublisher));
            abstractRates.add(similarityOrNull(expectedAbstract, extractedAbstract));
            titleRates.add(similarityOrNull(expectedTitle, extractedTitle));
            // Score the journal titles themselves (not the article titles).
            journalTitleRates.add(similarityOrNull(expectedJournalTitle, extractedJournalTitle));

            // List-valued fields.
            if (!expectedAuthors.isEmpty()) {
                authorPrecisions.add(calculatePrecision(expectedAuthors, extractedAuthors));
                authorRecalls.add(calculateRecall(expectedAuthors, extractedAuthors));
            } else {
                authorPrecisions.add(null);
                authorRecalls.add(null);
            }
            if (!expectedKeywords.isEmpty()) {
                keywordPrecisions.add(calculatePrecision(expectedKeywords, extractedKeywords));
                keywordRecalls.add(calculateRecall(expectedKeywords, extractedKeywords));
            } else {
                keywordPrecisions.add(null);
                keywordRecalls.add(null);
            }

            printVerboseList(">>> Expected authors: ", expectedAuthors);
            printVerboseList(">>> Extracted authors: ", extractedAuthors);
            printVerboseList(">>> Expected keywords: ", expectedKeywords);
            printVerboseList(">>> Extracted keywords: ", extractedKeywords);
            printVerbose(">>> Expected journal title: " + expectedJournalTitle);
            printVerbose(">>> Extracted journal title: " + extractedJournalTitle);
            printVerbose(">>> Expected publisher name: " + expectedPublisher);
            printVerbose(">>> Extracted publisher name: " + extractedPublisher);
            printVerbose(">>> Expected article title: " + expectedTitle);
            printVerbose(">>> Extracted article title: " + extractedTitle);
            printVerbose(">>> Expected article abstract: " + expectedAbstract);
            printVerbose(">>> Extracted article abstract: " + extractedAbstract);
            printVerbose(">>> Expected doi: " + expectedDoi);
            printVerbose(">>> Extracted doi: " + extractedDoi);
            printVerboseList(">>> Expected date: ", expectedDates);
            printVerboseList(">>> Extracted date: ", extractedDates);

            // The running totals grow with every document, so only render them
            // to strings when they will actually be printed.
            if (this.verbose.booleanValue()) {
                printVerbose("abstract " + abstractRates);
                printVerbose("title " + titleRates);
                printVerbose("journal title " + journalTitleRates);
                printVerbose("publisher name rates " + publisherRates);
                printVerbose("namesP " + authorPrecisions);
                printVerbose("namesR " + authorRecalls);
                printVerbose("keywordsP " + keywordPrecisions);
                printVerbose("keywordsR " + keywordRecalls);
                printVerbose("date years" + dateYears);
                printVerbose("date full" + dateFull);
                printVerbose(BibEntry.FIELD_DOI + doi);
                printVerbose("URN" + urn);
                printVerbose("pages" + pages);
            }
        }

        System.out.println("==== Summary (" + pdfNlmIterator.size() + " docs)====");
        printPercentage("abstract avg (SW) \t\t%4.2f\n", calculateAverage(abstractRates));
        printPercentage("title avg (SW) \t\t\t%4.2f\n", calculateAverage(titleRates));
        printPercentage("journal title avg (SW) \t\t%4.2f\n", calculateAverage(journalTitleRates));
        printPercentage("publisher name (SW) \t\t%4.2f\n", calculateAverage(publisherRates));
        printPercentage("names precision avg (EQ)\t%4.2f\n", calculateAverage(authorPrecisions));
        printPercentage("names recall avg (EQ)\t\t%4.2f\n", calculateAverage(authorRecalls));
        printPercentage("keywords precision avg (EQ)\t%4.2f\n", calculateAverage(keywordPrecisions));
        printPercentage("keywords recall avg (EQ)\t%4.2f\n", calculateAverage(keywordRecalls));
        printPercentage("date year accuracy avg\t\t%4.2f\n", dateYears.calculateAccuracy());
        printPercentage("date full accuracy avg\t\t%4.2f\n", dateFull.calculateAccuracy());
        printPercentage("doi accuracy avg\t\t%4.2f\n", doi.calculateAccuracy());
        printPercentage("URN accuracy avg\t\t%4.2f\n", urn.calculateAccuracy());
        printPercentage("pages accuracy avg\t\t%4.2f\n", pages.calculateAccuracy());
    }

    /** Adds one comparison to the tally, counting it as correct when {@code isCorrect} is true. */
    private static void recordOutcome(CorrectAllPair tally, boolean isCorrect) {
        if (isCorrect) {
            tally.correct = Integer.valueOf(tally.correct.intValue() + 1);
        }
        tally.all = Integer.valueOf(tally.all.intValue() + 1);
    }

    /** Tallies an exact string comparison, skipping documents whose expected value is empty. */
    private static void countExactMatch(CorrectAllPair tally, String expected, String extracted) {
        if (!expected.isEmpty()) {
            recordOutcome(tally, expected.equals(extracted));
        }
    }

    /** Scores two texts with {@code compareStringsSW}, or yields null when there is no expected text. */
    private static Double similarityOrNull(String expected, String extracted) {
        return expected.length() > 0 ? compareStringsSW(expected, extracted) : null;
    }

    /** Prints a header followed by each item on its own line, in verbose mode only. */
    private void printVerboseList(String header, List<String> items) {
        printVerbose(header);
        for (String item : items) {
            printVerbose(item);
        }
    }

    /** Prints a fraction as a percentage using the given format; null or NaN fractions are skipped. */
    private static void printPercentage(String format, Double fraction) {
        if (fraction != null && !fraction.isNaN()) {
            System.out.printf(format, Double.valueOf(100.0d * fraction.doubleValue()));
        }
    }

    /**
     * Command-line entry point: {@code [-v] <input dir>}.
     *
     * <p>Prints a usage message and returns when the arguments do not match
     * that shape (no arguments, {@code -v} without a directory, or surplus
     * arguments).
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) throws AnalysisException, IOException, TransformationException, ParserConfigurationException, SAXException, JDOMException, XPathExpressionException, TransformerException {
        boolean verboseFlag = strArr.length > 0 && "-v".equals(strArr[0]);
        // With -v the directory is the second argument, otherwise the only one.
        int expectedArgCount = verboseFlag ? 2 : 1;
        if (strArr.length != expectedArgCount) {
            System.out.println("Usage: FinalMetadataExtractionEvaluation [-v] <input dir>");
            return;
        }
        String inputDir = strArr[expectedArgCount - 1];
        new FinalMetadataExtractionEvaluation(Boolean.valueOf(verboseFlag)).evaluate(new PdfNlmIterator(inputDir));
    }

    /**
     * Averages the non-null entries of the list.
     *
     * @param list values to average; null entries are ignored
     * @return the arithmetic mean of the non-null entries, or {@code null}
     *         when the list contains no non-null entry (instead of NaN)
     */
    private static Double calculateAverage(List<Double> list) {
        int count = 0;
        double sum = 0.0d;
        for (Double value : list) {
            if (value != null) {
                count++;
                sum += value.doubleValue();
            }
        }
        if (count == 0) {
            // Nothing to average: signal "no data" rather than returning 0.0 / 0.
            return null;
        }
        return Double.valueOf(sum / count);
    }

    /**
     * Computes the fraction of entries of {@code list2} that are similar to
     * at least one entry of {@code list}. Two entries are considered similar
     * when the cosine comparison of their tokens exceeds sqrt(2)/2.
     *
     * @param list  reference entries to match against
     * @param list2 candidate entries being judged
     * @return matched candidates divided by the number of candidates, or
     *         {@code 0.0} when there are no candidates
     */
    private static Double calculatePrecision(List<String> list, List<String> list2) {
        if (list2.isEmpty()) {
            return Double.valueOf(0.0d);
        }
        final double threshold = Math.sqrt(2.0d) / 2.0d;
        CosineDistance cosineDistance = new CosineDistance();
        int matched = 0;
        for (String candidate : list2) {
            // The inner loop ends either on the first match or when the
            // reference entries are exhausted.
            for (String reference : list) {
                if (cosineDistance.compare(StringTools.tokenize(candidate), StringTools.tokenize(reference)).doubleValue() > threshold) {
                    matched++;
                    break;
                }
            }
        }
        // Floating-point division; int / int would truncate to 0 or 1.
        return Double.valueOf((double) matched / list2.size());
    }

    /**
     * Counts the entries of {@code list2} that are similar to at least one
     * entry of {@code list} and divides that count by the size of
     * {@code list}. Two entries are considered similar when the cosine
     * comparison of their tokens exceeds sqrt(2)/2.
     *
     * @param list  reference entries (the denominator)
     * @param list2 candidate entries being matched against the references
     * @return matched candidates divided by the number of references, or
     *         {@code null} when there are no references (ratio undefined)
     */
    private static Double calculateRecall(List<String> list, List<String> list2) {
        if (list.isEmpty()) {
            // No reference entries: the ratio would be a division by zero.
            return null;
        }
        final double threshold = Math.sqrt(2.0d) / 2.0d;
        CosineDistance cosineDistance = new CosineDistance();
        int matched = 0;
        for (String candidate : list2) {
            // The inner loop ends either on the first match or when the
            // reference entries are exhausted.
            for (String reference : list) {
                if (cosineDistance.compare(StringTools.tokenize(candidate), StringTools.tokenize(reference)).doubleValue() > threshold) {
                    matched++;
                    break;
                }
            }
        }
        // Floating-point division; int / int would truncate the ratio.
        return Double.valueOf((double) matched / list.size());
    }

    /**
     * Scores how well {@code str2} matches {@code str}: both texts are
     * tokenized, compared with a {@link SmithWatermanDistance} constructed
     * with (0.0, 0.0), and the result is divided by the number of tokens in
     * {@code str}.
     *
     * @param str  the reference text
     * @param str2 the text being judged
     * @return the normalised score, or {@code null} when {@code str} yields
     *         no tokens (the normalisation would divide by zero)
     */
    private static Double compareStringsSW(String str, String str2) {
        // NOTE(review): assumes StringTools.tokenize returns List<String> — confirm against StringTools.
        List<String> referenceTokens = StringTools.tokenize(str);
        List<String> judgedTokens = StringTools.tokenize(str2);
        if (referenceTokens.isEmpty()) {
            return null;
        }
        SmithWatermanDistance distance = new SmithWatermanDistance(Double.valueOf(0.0d), Double.valueOf(0.0d));
        double score = distance.compare(referenceTokens, judgedTokens).doubleValue();
        return Double.valueOf(score / referenceTokens.size());
    }

    /**
     * Normalises date strings by stripping leading zeros from each
     * whitespace-separated part (for example {@code "01 02 2008"} becomes
     * {@code "1 2 2008"}). A part consisting only of zeros keeps one zero.
     * Strings with a single part are returned unchanged.
     *
     * @param list date strings to normalise
     * @return a new list with the normalised strings, in the same order
     */
    static List<String> removeLeadingZerosFromDate(List<String> list) {
        List<String> normalized = new ArrayList<String>(list.size());
        for (String date : list) {
            String[] parts = date.split("\\s");
            if (parts.length <= 1) {
                normalized.add(date);
                continue;
            }
            StringBuilder joined = new StringBuilder(date.length());
            for (int i = 0; i < parts.length; i++) {
                if (i > 0) {
                    joined.append(' ');
                }
                // The negative lookahead keeps the last digit of an all-zero part.
                joined.append(parts[i].replaceFirst("^0+(?!$)", ""));
            }
            normalized.add(joined.toString());
        }
        return normalized;
    }

    /**
     * Wraps a JDOM element in a new JDOM document as its root and converts
     * that document to a W3C DOM document.
     *
     * @param element the JDOM element to use as the document root
     * @return the W3C DOM document produced by {@link DOMOutputter}
     * @throws JDOMException if the conversion fails
     */
    static Document ElementToW3CDocument(Element element) throws JDOMException {
        org.jdom.Document jdomDocument = new org.jdom.Document();
        jdomDocument.setRootElement(element);
        DOMOutputter converter = new DOMOutputter();
        return converter.output(jdomDocument);
    }

    /**
     * Serialises a W3C DOM document to an indented XML string.
     *
     * @param document the document to serialise
     * @return the serialised XML text
     * @throws IOException if serialisation fails
     */
    static String outputDoc(Document document) throws IOException, TransformerException {
        OutputFormat format = new OutputFormat(document);
        // NOTE(review): the order of these three setters is kept exactly as is;
        // setIndenting may reset the line width in some serializer versions — confirm
        // before reordering.
        format.setLineWidth(65);
        format.setIndenting(true);
        format.setIndent(2);

        StringWriter buffer = new StringWriter();
        XMLSerializer serializer = new XMLSerializer(buffer, format);
        serializer.serialize(document);
        return buffer.toString();
    }
}
