package pl.edu.icm.cermine.structure;

import com.itextpdf.text.Rectangle;
import com.itextpdf.text.exceptions.InvalidPdfException;
import com.itextpdf.text.pdf.PRIndirectReference;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.ContentByteUtils;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.PdfContentStreamProcessor;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.structure.model.BxBounds;
import pl.edu.icm.cermine.structure.model.BxChunk;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.model.BxPage;
import pl.edu.icm.cermine.structure.tools.BxBoundsBuilder;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.5-SNAPSHOT.jar:pl/edu/icm/cermine/structure/ITextCharacterExtractor.class */
/**
 * {@link CharacterExtractor} that reads a PDF with iText and produces a
 * {@link BxDocument} holding one {@link BxChunk} per rendered character.
 *
 * <p>Extraction runs in three steps: the selected pages are pushed through an
 * iText content stream processor, chunks that duplicate an already seen chunk
 * (same text, similar bounds) are dropped, and finally chunks lying in
 * overcrowded grid cells of dense pages are removed.</p>
 *
 * <p>When both page limits are positive only a window of pages at the front
 * and at the back of the document is processed; if either limit is
 * non-positive every page is processed.</p>
 */
public class ITextCharacterExtractor implements CharacterExtractor {
    public static final int DEFAULT_FRONT_PAGES_LIMIT = 20;
    public static final int DEFAULT_BACK_PAGES_LIMIT = 20;
    private int frontPagesLimit = DEFAULT_FRONT_PAGES_LIMIT;
    private int backPagesLimit = DEFAULT_BACK_PAGES_LIMIT;

    /** Edge length (in page units) of the square cells used by {@link #filterComponents}. */
    private static final int PAGE_GRID_SIZE = 10;

    /**
     * Threshold used twice by {@link #filterComponents}: as the minimal page
     * density that triggers filtering and as the maximal number of chunks a
     * single grid cell may hold.
     */
    private static final int CHUNK_DENSITY_LIMIT = 15;

    /** Maps alternative base font names to the standard PDF font names substituted for them. */
    protected static final Map<String, PdfName> ALT_TO_STANDART_FONTS = new HashMap<String, PdfName>();

    static {
        ALT_TO_STANDART_FONTS.put("CourierNew", PdfName.COURIER);
        ALT_TO_STANDART_FONTS.put("CourierNew,Bold", PdfName.COURIER_BOLD);
        ALT_TO_STANDART_FONTS.put("CourierNew,BoldItalic", PdfName.COURIER_BOLDOBLIQUE);
        ALT_TO_STANDART_FONTS.put("CourierNew,Italic", PdfName.COURIER_OBLIQUE);
        ALT_TO_STANDART_FONTS.put("Arial", PdfName.HELVETICA);
        ALT_TO_STANDART_FONTS.put("Arial,Bold", PdfName.HELVETICA_BOLD);
        ALT_TO_STANDART_FONTS.put("Arial,BoldItalic", PdfName.HELVETICA_BOLDOBLIQUE);
        ALT_TO_STANDART_FONTS.put("Arial,Italic", PdfName.HELVETICA_OBLIQUE);
        ALT_TO_STANDART_FONTS.put("TimesNewRoman", PdfName.TIMES_ROMAN);
        ALT_TO_STANDART_FONTS.put("TimesNewRoman,Bold", PdfName.TIMES_BOLD);
        ALT_TO_STANDART_FONTS.put("TimesNewRoman,BoldItalic", PdfName.TIMES_BOLDITALIC);
        ALT_TO_STANDART_FONTS.put("TimesNewRoman,Italic", PdfName.TIMES_ITALIC);
    }

    /**
     * Render listener that turns iText text render events into {@link BxChunk}s
     * and collects them page by page in a {@link BxDocument}.
     */
    static class BxDocumentCreator implements RenderListener {
        private BxPage actPage;
        private Rectangle pageRectangle;
        private BxDocument document = new BxDocument();
        private BxBoundsBuilder boundsBuilder = new BxBoundsBuilder();

        BxDocumentCreator() {
        }

        /**
         * Closes the page currently being built (if any) and starts a new one.
         *
         * @param rectangle page rectangle used to validate and translate the
         *                  coordinates of the characters rendered on the new page
         */
        public void processNewBxPage(Rectangle rectangle) {
            finishCurrentPage();
            this.actPage = new BxPage();
            this.document.addPage(this.actPage);
            this.pageRectangle = rectangle;
        }

        /**
         * Stores the accumulated bounds on the page currently being built and
         * resets the accumulator. Must also be called after the last page has
         * been processed, otherwise that page never receives its bounds.
         * Calling it again without starting a new page is a no-op.
         */
        void finishCurrentPage() {
            if (this.actPage == null) {
                return;
            }
            this.actPage.setBounds(this.boundsBuilder.getBounds());
            this.boundsBuilder.clear();
            this.actPage = null;
        }

        @Override // com.itextpdf.text.pdf.parser.RenderListener
        public void beginTextBlock() {
        }

        /**
         * Adds one chunk per character of the rendered text, skipping
         * characters that are not extractable, lie (even partially) outside of
         * the page rectangle or have non-finite geometry.
         */
        @Override // com.itextpdf.text.pdf.parser.RenderListener
        public void renderText(TextRenderInfo textRenderInfo) {
            // The font belongs to the whole render call, so it is resolved once,
            // and only when a chunk is actually created.
            String fontName = null;
            for (TextRenderInfo charInfo : textRenderInfo.getCharacterRenderInfos()) {
                String text = charInfo.getText();
                if (!isExtractableText(text)) {
                    continue;
                }

                float descentX = charInfo.getDescentLine().getStartPoint().get(0);
                float descentY = charInfo.getDescentLine().getStartPoint().get(1);
                float ascentY = charInfo.getAscentLine().getStartPoint().get(1);
                float left = descentX - this.pageRectangle.getLeft();
                float bottom = descentY - this.pageRectangle.getBottom();
                float height = zeroIfNotFinite(ascentY - descentY);
                float width = zeroIfNotFinite(charInfo.getDescentLine().getLength());

                // Written as a positive conjunction on purpose: a NaN coordinate
                // makes every comparison false and therefore rejects the character.
                boolean insidePage = descentX >= this.pageRectangle.getLeft()
                        && descentX + width <= this.pageRectangle.getRight()
                        && descentY >= this.pageRectangle.getBottom()
                        && descentY + height <= this.pageRectangle.getTop();
                if (!insidePage) {
                    continue;
                }

                // The vertical coordinate is measured from the top of the page
                // rectangle, while the PDF values are measured from its bottom.
                BxBounds bounds = new BxBounds(left, (this.pageRectangle.getHeight() - bottom) - height, width, height);
                if (!isFinite(bounds.getX()) || !isFinite(bounds.getY())
                        || !isFinite(bounds.getHeight()) || !isFinite(bounds.getWidth())) {
                    continue;
                }

                if (fontName == null) {
                    fontName = textRenderInfo.getFont().getFullFontName()[0][3];
                }
                BxChunk chunk = new BxChunk(bounds, text);
                chunk.setFontName(fontName);
                this.actPage.addChunk(chunk);
                this.boundsBuilder.expand(bounds);
            }
        }

        @Override // com.itextpdf.text.pdf.parser.RenderListener
        public void endTextBlock() {
        }

        @Override // com.itextpdf.text.pdf.parser.RenderListener
        public void renderImage(ImageRenderInfo imageRenderInfo) {
        }

        /**
         * Tells whether the text of a single rendered character should become a chunk.
         *
         * <p>Rejected are: missing or empty text, text starting with a space or
         * a control character, and single-character text that is a lone
         * surrogate (U+D800..U+DFFF) or lies in U+FFF0..U+FFFF.</p>
         *
         * <p>NOTE(review): in the decompiled source the two surrogate checks had
         * degraded into the replacement character U+FFFD; they are restored here
         * as numeric ranges, which cannot be damaged by re-encoding the file.</p>
         */
        private static boolean isExtractableText(String text) {
            if (text == null || text.length() == 0) {
                return false;
            }
            char first = text.charAt(0);
            if (first <= ' ') {
                return false;
            }
            if (text.length() > 1) {
                return true;
            }
            boolean loneSurrogate = first >= 0xD800 && first <= 0xDFFF;
            boolean specialsBlock = first >= 0xFFF0;
            return !loneSurrogate && !specialsBlock;
        }

        /** Returns the given value, or zero when it is NaN or infinite. */
        private static float zeroIfNotFinite(float value) {
            return (Float.isNaN(value) || Float.isInfinite(value)) ? 0.0f : value;
        }

        /** Returns true when the given value is neither NaN nor infinite. */
        private static boolean isFinite(double value) {
            return !Double.isNaN(value) && !Double.isInfinite(value);
        }
    }

    /**
     * Extracts the characters of a PDF document.
     *
     * @param inputStream stream with the content of the PDF file
     * @return document with one page per processed PDF page, after duplicate
     *         removal and density filtering
     * @throws AnalysisException when the stream is not a valid PDF or cannot be read
     */
    @Override // pl.edu.icm.cermine.structure.CharacterExtractor
    public BxDocument extractCharacters(InputStream inputStream) throws AnalysisException {
        PdfReader pdfReader = null;
        try {
            BxDocumentCreator documentCreator = new BxDocumentCreator();
            pdfReader = new PdfReader(inputStream);
            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(documentCreator);
            int pageCount = pdfReader.getNumberOfPages();
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
                if (!isPageSelected(pageNumber, pageCount)) {
                    continue;
                }
                documentCreator.processNewBxPage(pdfReader.getPageSize(pageNumber));
                PdfDictionary resources = pdfReader.getPageN(pageNumber).getAsDict(PdfName.RESOURCES);
                processAlternativeFontNames(resources);
                processor.reset();
                processor.processContent(ContentByteUtils.getContentBytesForPage(pdfReader, pageNumber), resources);
            }
            // Bounds are otherwise only stored when the next page starts, which
            // would leave the last processed page without bounds.
            documentCreator.finishCurrentPage();
            return filterComponents(removeDuplicateChunks(documentCreator.document));
        } catch (InvalidPdfException e) {
            throw new AnalysisException("Invalid PDF file", e);
        } catch (IOException e2) {
            throw new AnalysisException("Cannot extract characters from PDF file", e2);
        } finally {
            if (pdfReader != null) {
                pdfReader.close();
            }
        }
    }

    /**
     * Decides whether the page with the given 1-based number is processed.
     *
     * <p>NOTE(review): the back window starts at
     * {@code pageCount - 1 - backPagesLimit}, i.e. it spans
     * {@code backPagesLimit + 2} pages. This looks like an off-by-two but is
     * kept unchanged because it determines which pages are extracted -
     * confirm the intended size before changing it.</p>
     */
    private boolean isPageSelected(int pageNumber, int pageCount) {
        if (this.frontPagesLimit <= 0 || this.backPagesLimit <= 0) {
            return true;
        }
        return pageNumber <= this.frontPagesLimit
                || pageNumber >= (pageCount - 1) - this.backPagesLimit;
    }

    /**
     * Replaces alternative base font names (see {@link #ALT_TO_STANDART_FONTS})
     * with the corresponding standard font names in the fonts of the given
     * resources dictionary. Only fonts stored as indirect references and
     * lacking a widths array are touched.
     *
     * @param pdfDictionary page resources dictionary, may be null
     */
    private void processAlternativeFontNames(PdfDictionary pdfDictionary) {
        if (pdfDictionary == null) {
            return;
        }
        PdfDictionary fonts = pdfDictionary.getAsDict(PdfName.FONT);
        if (fonts == null) {
            return;
        }
        for (PdfName fontKey : fonts.getKeys()) {
            Object fontEntry = fonts.get(fontKey);
            if (!(fontEntry instanceof PRIndirectReference)) {
                // Skip only this entry; aborting here would leave the remaining
                // fonts unprocessed depending on the key iteration order.
                continue;
            }
            Object resolved = PdfReader.getPdfObjectRelease((PRIndirectReference) fontEntry);
            if (!(resolved instanceof PdfDictionary)) {
                continue;
            }
            PdfDictionary font = (PdfDictionary) resolved;
            PdfName baseFont = font.getAsName(PdfName.BASEFONT);
            if (baseFont == null) {
                continue;
            }
            String baseFontName = PdfName.decodeName(baseFont.toString());
            if (font.getAsArray(PdfName.WIDTHS) == null && ALT_TO_STANDART_FONTS.containsKey(baseFontName)) {
                font.put(PdfName.BASEFONT, ALT_TO_STANDART_FONTS.get(baseFontName));
            }
        }
    }

    /**
     * Removes, on every page, chunks that repeat an earlier chunk: same text
     * and bounds similar within a tolerance of 1.0. The first occurrence is kept.
     *
     * @param bxDocument document to clean, modified in place
     * @return the same document instance
     */
    private BxDocument removeDuplicateChunks(BxDocument bxDocument) {
        for (BxPage page : bxDocument.getPages()) {
            List<BxChunk> uniqueChunks = new ArrayList<BxChunk>();
            // Kept chunks indexed by the integer part of their x and y coordinates.
            Map<Integer, Map<Integer, Set<BxChunk>>> index = new HashMap<Integer, Map<Integer, Set<BxChunk>>>();
            for (BxChunk chunk : page.getChunks()) {
                if (hasDuplicate(index, chunk)) {
                    continue;
                }
                uniqueChunks.add(chunk);
                indexChunk(index, chunk);
            }
            page.setChunks(uniqueChunks);
        }
        return bxDocument;
    }

    /** Looks for a duplicate of the chunk in its own and the eight neighbouring index cells. */
    private static boolean hasDuplicate(Map<Integer, Map<Integer, Set<BxChunk>>> index, BxChunk chunk) {
        int cellX = (int) chunk.getX();
        int cellY = (int) chunk.getY();
        for (int x = cellX - 1; x <= cellX + 1; x++) {
            Map<Integer, Set<BxChunk>> column = index.get(Integer.valueOf(x));
            if (column == null) {
                continue;
            }
            for (int y = cellY - 1; y <= cellY + 1; y++) {
                Set<BxChunk> candidates = column.get(Integer.valueOf(y));
                if (candidates == null) {
                    continue;
                }
                for (BxChunk candidate : candidates) {
                    if (chunk.toText().equals(candidate.toText())
                            && chunk.getBounds().isSimilarTo(candidate.getBounds(), 1.0d)) {
                        return true;
                    }
                }
            }
        }
        return false;
    }

    /** Registers the chunk in the index cell given by the integer part of its coordinates. */
    private static void indexChunk(Map<Integer, Map<Integer, Set<BxChunk>>> index, BxChunk chunk) {
        Integer cellX = Integer.valueOf((int) chunk.getX());
        Integer cellY = Integer.valueOf((int) chunk.getY());
        Map<Integer, Set<BxChunk>> column = index.get(cellX);
        if (column == null) {
            column = new HashMap<Integer, Set<BxChunk>>();
            index.put(cellX, column);
        }
        Set<BxChunk> cell = column.get(cellY);
        if (cell == null) {
            cell = new HashSet<BxChunk>();
            column.put(cellY, cell);
        }
        cell.add(chunk);
    }

    /**
     * Removes chunks from overcrowded areas of dense pages. A page is filtered
     * when it holds at least {@link #CHUNK_DENSITY_LIMIT} chunks per 100 square
     * units of the bounding box of all its chunks; on such a page every grid
     * cell of {@link #PAGE_GRID_SIZE} units containing more than
     * {@link #CHUNK_DENSITY_LIMIT} chunks is emptied.
     *
     * @param bxDocument document to filter, modified in place
     * @return the same document instance
     */
    private BxDocument filterComponents(BxDocument bxDocument) {
        for (BxPage page : bxDocument.getPages()) {
            if (page.getChunks().isEmpty()) {
                // Nothing to filter; also avoids asking an empty builder for bounds.
                continue;
            }
            BxBoundsBuilder contentBounds = new BxBoundsBuilder();
            for (BxChunk chunk : page.getChunks()) {
                contentBounds.expand(chunk.getBounds());
            }
            BxBounds bounds = contentBounds.getBounds();
            if (bounds == null) {
                // Defensive: the builder's contract for "no bounds" is not visible here.
                continue;
            }
            double density = (100.0d * page.getChunks().size()) / (bounds.getWidth() * bounds.getHeight());
            if (Double.isNaN(density) || density < CHUNK_DENSITY_LIMIT) {
                continue;
            }

            Map<String, List<BxChunk>> grid = new HashMap<String, List<BxChunk>>();
            for (BxChunk chunk : page.getChunks()) {
                String cellKey = Integer.toString(((int) chunk.getX()) / PAGE_GRID_SIZE)
                        + " " + Integer.toString(((int) chunk.getY()) / PAGE_GRID_SIZE);
                List<BxChunk> cell = grid.get(cellKey);
                if (cell == null) {
                    cell = new ArrayList<BxChunk>();
                    grid.put(cellKey, cell);
                }
                cell.add(chunk);
            }
            for (List<BxChunk> cell : grid.values()) {
                if (cell.size() > CHUNK_DENSITY_LIMIT) {
                    for (BxChunk crowded : cell) {
                        page.getChunks().remove(crowded);
                    }
                }
            }
        }
        return bxDocument;
    }

    /** Returns the configured limit of pages processed at the back of a document. */
    public int getBackPagesLimit() {
        return this.backPagesLimit;
    }

    /** Returns the configured limit of pages processed at the front of a document. */
    public int getFrontPagesLimit() {
        return this.frontPagesLimit;
    }

    /**
     * Sets both page limits. If either value is non-positive, every page of a
     * document is processed.
     *
     * @param i  front pages limit
     * @param i2 back pages limit
     */
    public void setPagesLimits(int i, int i2) {
        this.frontPagesLimit = i;
        this.backPagesLimit = i2;
    }
}
