package eu.eudml.enhancement.pdf2text.node;

import eu.eudml.enhancement.bibref.EnhancementUtils;
import eu.eudml.enhancement.pdf.PdfExtractorAbstractNode;
import eu.eudml.processing.message.EnhancerProcessMessage;
import eu.eudml.service.process.StoredContentPart;
import eu.eudml.service.storage.ContentPart;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.edu.icm.yadda.process.ctx.ProcessContext;
import pl.edu.icm.yadda.service2.exception.ServiceException;

/* loaded from: input_file:eu/eudml/enhancement/pdf2text/node/PlainTextExtractorNode.class */
/**
 * Processing node that extracts the plain text of a record's PDF content and
 * attaches it to the message as an {@code enhanced/text} content part.
 *
 * <p>Extraction is best-effort: any failure is logged and the message is
 * returned without the text part, so processing of the record can continue.
 */
public final class PlainTextExtractorNode extends PdfExtractorAbstractNode {
    private static final Logger log = LoggerFactory.getLogger(PlainTextExtractorNode.class);

    /** Part id under which the extracted plain text is stored. */
    private static final String TEXT_PART_ID = "enhanced/text";

    /** Part id of the raw PDF content used as the extraction source. */
    private static final String PDF_PART_ID = "content/raw_content/pdf";

    /** Charset name used both for the text stripper and for encoding the stored text. */
    private static final String ENCODING = "UTF-8";

    /**
     * Extracts the text of a PDF document held in memory.
     *
     * <p>If the document is encrypted, decryption with an empty password is
     * attempted first. The raw stripper output is passed through the inherited
     * {@code sanitizeText} before being returned.
     *
     * @param pdfBytes the complete PDF file content
     * @return the sanitized text of the document
     * @throws IOException if the PDF cannot be parsed, read or closed
     * @throws InvalidPasswordException if the document is encrypted with a non-empty password
     * @throws CryptographyException if decryption fails
     * @throws ServiceException declared for the inherited {@code sanitizeText} call
     */
    private String extractTextFromPdf(byte[] pdfBytes) throws IOException, InvalidPasswordException, CryptographyException, ServiceException {
        StringWriter textBuffer = new StringWriter();
        PDFTextStripper stripper = new PDFTextStripper(ENCODING);
        PDDocument document = PDDocument.load(new ByteArrayInputStream(pdfBytes));
        try {
            if (document.isEncrypted()) {
                document.decrypt("");
            }
            stripper.writeText(document, textBuffer);
            return sanitizeText(textBuffer.toString());
        } finally {
            // Close each resource exactly once on both the normal and the failure path;
            // the nested finally makes sure the writer is closed even if the document close fails.
            try {
                document.close();
            } finally {
                textBuffer.close();
            }
        }
    }

    /**
     * Adds an {@code enhanced/text} content part with the plain text extracted
     * from the record's PDF, unless such a part has already been added or no
     * PDF content is available.
     *
     * <p>Text judged to be garbage by the inherited {@code isGarbage} is not
     * stored. Exceptions raised during extraction are logged and swallowed.
     *
     * @param enhancerProcessMessage the message being enhanced; its source record must not be null
     * @param processContext the processing context (not used by this node)
     * @return the same message instance, possibly with the text part added
     * @throws NullPointerException if the message has no source record
     * @throws Exception if reading the PDF content from storage fails
     */
    @Override // eu.eudml.enhancement.pdf.PdfExtractorAbstractNode
    public EnhancerProcessMessage process(EnhancerProcessMessage enhancerProcessMessage, ProcessContext processContext) throws Exception {
        if (enhancerProcessMessage.getSourceRecord() == null) {
            throw new NullPointerException("enhancerProcessMessage.getSourceRecord() must not be null");
        }
        // Nothing to do when the plain text has already been attached to this message.
        for (StoredContentPart addedPart : enhancerProcessMessage.getAddedContentParts()) {
            if (TEXT_PART_ID.equals(addedPart.getMeta().getPartId())) {
                return enhancerProcessMessage;
            }
        }
        String id = enhancerProcessMessage.getId();
        byte[] content = EnhancementUtils.getContent(this.storage, enhancerProcessMessage, PDF_PART_ID);
        if (content == null) {
            return enhancerProcessMessage;
        }
        try {
            long startMillis = System.currentTimeMillis();
            String text = extractTextFromPdf(content);
            long elapsedMillis = System.currentTimeMillis() - startMillis;
            log.trace("STATS#{};{}", id, elapsedMillis);
            if (isGarbage(text)) {
                log.debug("Extracted text for doc with id: {} was considered as garbage.", id);
            } else {
                byte[] bytes = text.getBytes(ENCODING);
                ContentPart contentPart = new ContentPart(id, TEXT_PART_ID, ContentPart.ContentPartType.PLAINTEXT_INDEX, bytes.length, "text/plain", id + ".txt", new Date());
                log.debug("Adding plain text of {}", id);
                enhancerProcessMessage.addContentPart(contentPart, bytes);
            }
        } catch (Exception e) {
            // Best-effort node: a broken PDF must not fail the whole record.
            // The STATS line is kept in its original format; the warning carries the stack trace.
            log.trace("STATS#{};EXCEPTION:{}", id, e.getMessage());
            log.warn("Plain text extraction failed for doc with id: " + id, e);
        }
        return enhancerProcessMessage;
    }
}
