package pl.edu.icm.yadda.analysis.articlecontent;

import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import pl.edu.icm.yadda.analysis.AnalysisException;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.DigitDotSchemaFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.DigitParSchemaFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.DoubleDigitSchemaFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.HeightFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.IndentationFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.IsHeigherThanNeighborsFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.LengthFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.LowercaseSchemaFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.NextLineIndentationFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.PrevSpaceFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.RomanDigitsSchemaFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.TripleDigitSchemaFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.UppercaseSchemaFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.WordsAllUppercaseFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.line.WordsUppercaseFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.zone.AreaFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.zone.FigureTableFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.zone.GreekLettersFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.zone.MathSymbolsFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.zone.RelativeMeanLengthFeature;
import pl.edu.icm.yadda.analysis.articlecontent.features.zone.XVarianceFeature;
import pl.edu.icm.yadda.analysis.articlecontent.model.DocumentContentStructure;
import pl.edu.icm.yadda.analysis.classification.features.FeatureVectorBuilder;
import pl.edu.icm.yadda.analysis.classification.features.SimpleFeatureVectorBuilder;
import pl.edu.icm.yadda.analysis.classification.knn.model.KnnModel;
import pl.edu.icm.yadda.analysis.textr.model.BxDocument;
import pl.edu.icm.yadda.analysis.textr.model.BxLine;
import pl.edu.icm.yadda.analysis.textr.model.BxPage;
import pl.edu.icm.yadda.analysis.textr.model.BxZone;
import pl.edu.icm.yadda.analysis.textr.model.BxZoneLabel;
import pl.edu.icm.yadda.analysis.textr.readingorder.ReadingOrderAnalyzer;
import pl.edu.icm.yadda.analysis.textr.transformers.TrueVizToBxDocumentReader;
import pl.edu.icm.yadda.metadata.transformers.TransformationException;

/* loaded from: input_file:pl/edu/icm/yadda/analysis/articlecontent/LogicalStructureExtractorDemo.class */
/**
 * Stand-alone demo that evaluates header extraction performed by
 * {@link LogicalStructureExtractor}.
 *
 * <p>{@link #setUp()} loads a training and a test archive from the classpath;
 * {@link #test()} builds the header and junk models from the training data,
 * runs the extractor on every test document, prints the expected and the
 * extracted headers, and finally prints header precision and recall.
 *
 * <p>Documents and their expected structures are kept in parallel lists and
 * are paired by index.
 */
public class LogicalStructureExtractorDemo {
    /** Classpath directory that holds both archives. */
    String dir = "/pl/edu/icm/yadda/analysis/articlecontent/";
    /** Name of the archive with training data, relative to {@link #dir}. */
    String trainZip = "train.zip";
    /** Name of the archive with test data, relative to {@link #dir}. */
    String testZip = "test.zip";
    /** Path fragment identifying archive entries that hold source documents. */
    String sourceDir = "source/";
    /** Path fragment identifying archive entries that hold expected structures. */
    String structureDir = "structure/";
    List<BxDocument> trainDocuments = new ArrayList<BxDocument>();
    List<DocumentContentStructure> trainHeaderStructures = new ArrayList<DocumentContentStructure>();
    List<BxDocument> testDocuments = new ArrayList<BxDocument>();
    List<DocumentContentStructure> testHeaderStructures = new ArrayList<DocumentContentStructure>();
    /** Line features used when building the header model and when extracting. */
    FeatureVectorBuilder<BxLine, BxPage> classVectorBuilder;
    /** Line features passed to the extractor as its third (last) vector builder. */
    FeatureVectorBuilder<BxLine, BxPage> clustVectorBuilder;
    /** Zone features used when building the junk model and when extracting. */
    FeatureVectorBuilder<BxZone, BxPage> junkVectorBuilder;

    /**
     * Configures the three feature vector builders and loads the training and
     * test archives into the corresponding document/structure lists.
     *
     * @throws IOException if an archive is missing from the classpath or cannot be read
     * @throws TransformationException if a source document cannot be read
     * @throws AnalysisException declared for callers; not thrown directly here
     * @throws URISyntaxException if an archive URL cannot be converted to a URI
     * @throws JDOMException if a structure file is not parseable XML
     */
    public void setUp() throws IOException, TransformationException, AnalysisException, URISyntaxException, JDOMException {
        this.classVectorBuilder = new SimpleFeatureVectorBuilder();
        this.classVectorBuilder.setFeatureCalculators(Arrays.asList(
                new DigitDotSchemaFeature(),
                new DigitParSchemaFeature(),
                new DoubleDigitSchemaFeature(),
                new HeightFeature(),
                new IndentationFeature(),
                new IsHeigherThanNeighborsFeature(),
                new LengthFeature(),
                new LowercaseSchemaFeature(),
                new NextLineIndentationFeature(),
                new PrevSpaceFeature(),
                new RomanDigitsSchemaFeature(),
                new TripleDigitSchemaFeature(),
                new UppercaseSchemaFeature(),
                new WordsAllUppercaseFeature(),
                new WordsUppercaseFeature()));
        this.clustVectorBuilder = new SimpleFeatureVectorBuilder();
        this.clustVectorBuilder.setFeatureCalculators(Arrays.asList(
                new DigitDotSchemaFeature(),
                new DigitParSchemaFeature(),
                new DoubleDigitSchemaFeature(),
                new LowercaseSchemaFeature(),
                new RomanDigitsSchemaFeature(),
                new TripleDigitSchemaFeature(),
                new UppercaseSchemaFeature()));
        this.junkVectorBuilder = new SimpleFeatureVectorBuilder();
        this.junkVectorBuilder.setFeatureCalculators(Arrays.asList(
                new AreaFeature(),
                new FigureTableFeature(),
                new GreekLettersFeature(),
                new RelativeMeanLengthFeature(),
                new MathSymbolsFeature(),
                new XVarianceFeature()));
        loadArchive(this.trainZip, this.trainDocuments, this.trainHeaderStructures);
        loadArchive(this.testZip, this.testDocuments, this.testHeaderStructures);
    }

    /**
     * Builds the models from the training data, extracts the structure of
     * every test document and prints per-document headers followed by the
     * overall header precision and recall.
     *
     * <p>A percentage is printed as {@code NaN} when its denominator is zero
     * (nothing extracted, or nothing expected).
     */
    public void test() throws IOException, TransformationException, AnalysisException, URISyntaxException {
        LogicalStructureExtractor extractor = new LogicalStructureExtractor();
        KnnModel<BxZoneLabel> headerModel = new ContentHeaderExtractor().buildModel(this.classVectorBuilder, this.trainDocuments, this.trainHeaderStructures);
        KnnModel<BxZoneLabel> junkModel = new ContentJunkFilter().buildModel(this.junkVectorBuilder, this.trainDocuments);
        int expectedHeaders = 0;
        int matchedHeaders = 0;
        int extractedHeaders = 0;
        for (int index = 0; index < this.testDocuments.size(); index++) {
            BxDocument document = this.testDocuments.get(index);
            System.out.println("");
            System.out.println(index);
            // The expected structure is the one stored at the same index as the document.
            DocumentContentStructure expected = this.testHeaderStructures.get(index);
            expectedHeaders += expected.getHeaderCount();
            System.out.println();
            System.out.println("ORIGINAL: ");
            expected.printHeaders();
            DocumentContentStructure extracted = extractor.extractStructure(junkModel, headerModel, this.junkVectorBuilder, this.classVectorBuilder, this.clustVectorBuilder, document);
            System.out.println("EXTRACTED:");
            extracted.printHeaders();
            extractedHeaders += extracted.getHeaderCount();
            for (String headerText : expected.getHeaderTexts()) {
                if (extracted.containsHeaderText(headerText)) {
                    matchedHeaders++;
                } else {
                    System.out.println("NOT EXTR: " + headerText);
                }
            }
        }
        System.out.println("Header Precission: " + percentage(matchedHeaders, extractedHeaders) + "%");
        System.out.println("Header Recall: " + percentage(matchedHeaders, expectedHeaders) + "%");
    }

    /**
     * Returns {@code part / total} as a percentage, computed in floating
     * point so the ratio is not truncated by integer division.
     *
     * @return the percentage, or {@code NaN} when {@code total} is zero
     */
    private static double percentage(int part, int total) {
        if (total == 0) {
            return Double.NaN;
        }
        return ((double) part / total) * 100.0d;
    }

    /**
     * Opens the named archive from the classpath directory {@link #dir},
     * loads its contents into the given lists and closes the archive again.
     *
     * <p>NOTE(review): the archive is opened through {@link File}, so this
     * presumably only works when the resource is a plain file on disk rather
     * than an entry nested inside a jar - confirm against the deployment.
     */
    private void loadArchive(String zipName, List<BxDocument> documents, List<DocumentContentStructure> structures) throws IOException, TransformationException, URISyntaxException, JDOMException {
        String resourcePath = this.dir + zipName;
        java.net.URL resource = getClass().getResource(resourcePath);
        if (resource == null) {
            throw new IOException("Classpath resource not found: " + resourcePath);
        }
        ZipFile zipFile = new ZipFile(new File(resource.toURI()));
        try {
            fillLists(zipFile, getEntries(zipFile), documents, structures);
        } finally {
            // Closing the archive also closes every entry stream obtained from it.
            zipFile.close();
        }
    }

    /**
     * Returns the entries of the archive whose names end with {@code .xml},
     * sorted by entry name.
     */
    private List<ZipEntry> getEntries(ZipFile zipFile) throws URISyntaxException, ZipException, IOException {
        List<ZipEntry> xmlEntries = new ArrayList<ZipEntry>();
        Enumeration<? extends ZipEntry> entries = zipFile.entries();
        while (entries.hasMoreElements()) {
            ZipEntry entry = entries.nextElement();
            if (entry.getName().endsWith(".xml")) {
                xmlEntries.add(entry);
            }
        }
        Collections.sort(xmlEntries, new Comparator<ZipEntry>() {
            @Override
            public int compare(ZipEntry first, ZipEntry second) {
                return first.getName().compareTo(second.getName());
            }
        });
        return xmlEntries;
    }

    /**
     * Reads the given archive entries into the two output lists.
     *
     * <p>An entry whose name contains {@code /} followed by {@link #sourceDir}
     * is read as a TrueViz document, put into reading order and appended to
     * {@code list2}. An entry whose name contains {@code /} followed by
     * {@link #structureDir} is parsed as XML and the children of its root
     * element are used to build a {@link DocumentContentStructure} appended
     * to {@code list3}. Entries are processed in the order given.
     *
     * @param zipFile open archive the entries belong to
     * @param list entries to process
     * @param list2 receives the documents
     * @param list3 receives the expected structures
     */
    private void fillLists(ZipFile zipFile, List<ZipEntry> list, List<BxDocument> list2, List<DocumentContentStructure> list3) throws IOException, TransformationException, JDOMException {
        ReadingOrderAnalyzer readingOrderAnalyzer = new ReadingOrderAnalyzer();
        // The directory fragments are loop-invariant, so build the regexes once.
        String sourceRegex = "^.*/" + this.sourceDir + ".*$";
        String structureRegex = "^.*/" + this.structureDir + ".*$";
        for (ZipEntry zipEntry : list) {
            String entryName = zipEntry.getName();
            if (entryName.matches(sourceRegex)) {
                InputStreamReader sourceReader = new InputStreamReader(zipFile.getInputStream(zipEntry));
                BxDocument document = new BxDocument().setPages(new TrueVizToBxDocumentReader().read(sourceReader, new Object[0]));
                list2.add(readingOrderAnalyzer.setReadingOrder(document));
            }
            // Deliberately not an else-branch: both checks are independent, as in the original flow.
            if (entryName.matches(structureRegex)) {
                InputStreamReader structureReader = new InputStreamReader(zipFile.getInputStream(zipEntry));
                SAXBuilder saxBuilder = new SAXBuilder("org.apache.xerces.parsers.SAXParser");
                // JDOM 1.x returns an untyped list from getChildren().
                @SuppressWarnings("unchecked")
                List<Element> children = saxBuilder.build(structureReader).getRootElement().getChildren();
                DocumentContentStructure documentContentStructure = new DocumentContentStructure();
                documentContentStructure.build(children);
                list3.add(documentContentStructure);
            }
        }
    }

    /** Runs {@link #setUp()} followed by {@link #test()}. */
    public static void main(String[] strArr) throws IOException, TransformationException, AnalysisException, URISyntaxException, JDOMException {
        LogicalStructureExtractorDemo demo = new LogicalStructureExtractorDemo();
        demo.setUp();
        demo.test();
    }
}
