package pl.edu.icm.cermine;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.structure.CharacterExtractor;
import pl.edu.icm.cermine.structure.DocumentSegmenter;
import pl.edu.icm.cermine.structure.HierarchicalReadingOrderResolver;
import pl.edu.icm.cermine.structure.ITextCharacterExtractor;
import pl.edu.icm.cermine.structure.ParallelDocstrumSegmenter;
import pl.edu.icm.cermine.structure.ReadingOrderResolver;
import pl.edu.icm.cermine.structure.SVMInitialZoneClassifier;
import pl.edu.icm.cermine.structure.ZoneClassifier;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.tools.BxModelUtils;
import pl.edu.icm.cermine.structure.transformers.BxDocumentToTrueVizWriter;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.1-SNAPSHOT.jar:pl/edu/icm/cermine/PdfBxStructureExtractor.class */
/**
 * {@link DocumentStructureExtractor} that builds a {@link BxDocument} from an input stream
 * by chaining four replaceable steps: character extraction, document segmentation,
 * reading-order resolution and initial zone classification.
 *
 * <p>Each step can be swapped through the matching setter or supplied up front through the
 * four-argument constructor.
 */
public class PdfBxStructureExtractor implements DocumentStructureExtractor {
    private CharacterExtractor characterExtractor;
    private DocumentSegmenter documentSegmenter;
    private ReadingOrderResolver roResolver;
    private ZoneClassifier initialClassifier;

    /**
     * Creates an extractor wired with the default implementation of every step.
     *
     * @throws AnalysisException if creating a default component fails with an {@link IOException}
     */
    public PdfBxStructureExtractor() throws AnalysisException {
        try {
            this.characterExtractor = new ITextCharacterExtractor();
            this.documentSegmenter = new ParallelDocstrumSegmenter();
            this.roResolver = new HierarchicalReadingOrderResolver();
            this.initialClassifier = new SVMInitialZoneClassifier();
        } catch (IOException e) {
            throw new AnalysisException("Cannot create PdfBxStructureExtractor!", e);
        }
    }

    /**
     * Creates an extractor with default components, building the initial zone classifier
     * from the two supplied readers.
     *
     * @param bufferedReader  first reader handed to {@link SVMInitialZoneClassifier}
     *                        (presumably the SVM model file; confirm against that class)
     * @param bufferedReader2 second reader handed to {@link SVMInitialZoneClassifier}
     *                        (presumably the feature range file; confirm against that class)
     * @throws AnalysisException if creating a component fails with an {@link IOException}
     */
    public PdfBxStructureExtractor(BufferedReader bufferedReader, BufferedReader bufferedReader2) throws AnalysisException {
        try {
            this.characterExtractor = new ITextCharacterExtractor();
            this.documentSegmenter = new ParallelDocstrumSegmenter();
            this.roResolver = new HierarchicalReadingOrderResolver();
            this.initialClassifier = new SVMInitialZoneClassifier(bufferedReader, bufferedReader2);
        } catch (IOException e) {
            throw new AnalysisException("Cannot create PdfBxStructureExtractor!", e);
        }
    }

    /**
     * Creates an extractor from explicitly supplied components. The arguments are stored
     * as given; no validation is performed here.
     *
     * @param characterExtractor   component that extracts characters from the input stream
     * @param documentSegmenter    component that segments the extracted characters
     * @param readingOrderResolver component that resolves the reading order of the segmented document
     * @param zoneClassifier       component that classifies zones of the ordered document
     */
    public PdfBxStructureExtractor(CharacterExtractor characterExtractor, DocumentSegmenter documentSegmenter, ReadingOrderResolver readingOrderResolver, ZoneClassifier zoneClassifier) {
        this.characterExtractor = characterExtractor;
        this.documentSegmenter = documentSegmenter;
        this.roResolver = readingOrderResolver;
        this.initialClassifier = zoneClassifier;
    }

    /**
     * Runs the full pipeline on the given stream: extract characters, segment, set parent
     * links, resolve reading order, classify zones.
     *
     * <p>This method does not close {@code inputStream} itself.
     *
     * @param inputStream stream containing the document to analyse
     * @return the document returned by the zone classifier
     * @throws AnalysisException if any pipeline step fails
     */
    @Override // pl.edu.icm.cermine.DocumentStructureExtractor
    public BxDocument extractStructure(InputStream inputStream) throws AnalysisException {
        BxDocument segmentDocument = this.documentSegmenter.segmentDocument(this.characterExtractor.extractCharacters(inputStream));
        // Parent links are set before the reading-order step runs.
        BxModelUtils.setParents(segmentDocument);
        return this.initialClassifier.classifyZones(this.roResolver.resolve(segmentDocument));
    }

    /**
     * Replaces the character extraction step.
     *
     * @param characterExtractor new character extractor
     */
    public void setGlyphExtractor(CharacterExtractor characterExtractor) {
        this.characterExtractor = characterExtractor;
    }

    /**
     * Replaces the initial zone classification step.
     *
     * @param zoneClassifier new zone classifier
     */
    public void setInitialClassifier(ZoneClassifier zoneClassifier) {
        this.initialClassifier = zoneClassifier;
    }

    /**
     * Replaces the document segmentation step.
     *
     * @param documentSegmenter new document segmenter
     */
    public void setPageSegmenter(DocumentSegmenter documentSegmenter) {
        this.documentSegmenter = documentSegmenter;
    }

    /**
     * Replaces the reading-order resolution step.
     *
     * @param readingOrderResolver new reading-order resolver
     */
    public void setRoResolver(ReadingOrderResolver readingOrderResolver) {
        this.roResolver = readingOrderResolver;
    }

    /**
     * Command-line entry point: processes every regular file directly inside the given
     * directory and writes the TrueViz serialization of its pages to
     * {@code <file name>.xml}, resolved against the current working directory.
     *
     * @param strArr exactly one element: the path of the directory to process
     * @throws AnalysisException       if structure extraction fails
     * @throws IOException             if the directory cannot be listed or a file cannot be read or written
     * @throws TransformationException if serializing the extracted pages fails
     */
    public static void main(String[] strArr) throws AnalysisException, IOException, TransformationException {
        if (strArr.length != 1) {
            System.err.println("USAGE: program DIR_PATH");
            System.exit(1);
        }
        File inputDir = new File(strArr[0]);
        File[] inputFiles = inputDir.listFiles();
        // listFiles() returns null (not an empty array) when the path is not a directory
        // or cannot be read; report that instead of failing with a NullPointerException.
        if (inputFiles == null) {
            throw new IOException("Cannot list files in directory: " + inputDir);
        }
        PdfBxStructureExtractor pdfBxStructureExtractor = new PdfBxStructureExtractor();
        BxDocumentToTrueVizWriter trueVizWriter = new BxDocumentToTrueVizWriter();
        for (File file : inputFiles) {
            // Subdirectories cannot be opened as a stream; skip them rather than abort the run.
            if (!file.isFile()) {
                continue;
            }
            BxDocument document;
            // Close the input as soon as extraction finishes so handles do not pile up
            // across a large directory.
            try (InputStream input = new FileInputStream(file)) {
                document = pdfBxStructureExtractor.extractStructure(input);
            }
            try (BufferedWriter output = new BufferedWriter(new FileWriter(file.getName() + ".xml"))) {
                output.write(trueVizWriter.write(document.getPages(), new Object[0]));
            }
        }
    }
}
