package eu.eudml.enhancement.pdf2text.node;

import eu.eudml.EudmlConstants;
import eu.eudml.enhancement.pdf.FullTextProcessingAbstractNode;
import eu.eudml.processing.message.EnhancerProcessMessage;
import eu.eudml.processing.node.util.DescribedStorageContent;
import eu.eudml.service.process.StoredContentPart;
import eu.eudml.service.storage.ContentFileHandle;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.edu.icm.yadda.process.ctx.ProcessContext;
import pl.edu.icm.yadda.service2.exception.ServiceException;

/* loaded from: input_file:WEB-INF/lib/eudml-processing-2.0.4-SNAPSHOT.jar:eu/eudml/enhancement/pdf2text/node/PlainTextExtractorNode.class */
/**
 * Processing node that extracts plain text from the PDF content parts of an
 * {@link EnhancerProcessMessage} and attaches the result to the message as two new content
 * parts: {@link EudmlConstants#EXTRACTED_CONTENT_TXT_PART} and
 * {@link EudmlConstants#CONTENT_AUXILIARY_PART}.
 *
 * <p>Extraction is all-or-nothing: if any single PDF fails to be processed, the message is
 * returned without any extracted text attached.
 */
public final class PlainTextExtractorNode extends FullTextProcessingAbstractNode {
    private static final Logger log = LoggerFactory.getLogger(PlainTextExtractorNode.class);

    /** Encoding handed to the PDF text stripper and used to encode the stored text bytes. */
    private static final String TEXT_ENCODING = "UTF-8";

    /**
     * Looks up the stored PDF parts of the message and, when there are any, extracts their text.
     *
     * @param enhancerProcessMessage message being processed; returned (possibly enriched)
     * @param processContext         processing context; not used by this node
     * @return the same message instance, with extracted-text parts added when extraction succeeded
     * @throws Exception propagated from the part lookup or from {@link #processInternal}
     */
    @Override
    public EnhancerProcessMessage process(EnhancerProcessMessage enhancerProcessMessage, ProcessContext processContext) throws Exception {
        List<DescribedStorageContent<ContentFileHandle>> pdfParts = getSavedProcessableParts(enhancerProcessMessage, EudmlConstants.SOURCE_EUDML_CONTENT_PDF_PART);
        if (!isProcessable(pdfParts)) {
            return enhancerProcessMessage;
        }
        return processInternal(enhancerProcessMessage, pdfParts);
    }

    /**
     * Tells whether there is anything to extract text from.
     *
     * @param list candidate PDF parts; may be {@code null}
     * @return {@code true} only when the list is non-null and contains at least one element
     */
    protected boolean isProcessable(List<DescribedStorageContent<ContentFileHandle>> list) {
        return list != null && !list.isEmpty();
    }

    /**
     * Extracts text from every usable PDF part and attaches the collected text to the message.
     *
     * <p>Entries that are {@code null} or have no content are skipped. Text classified as garbage
     * by {@code isGarbage} is logged and not stored. If extraction of any part throws, the failure
     * is logged and the message is returned immediately without attaching anything.
     *
     * @param enhancerProcessMessage message to enrich
     * @param list                   PDF parts to process; must not be {@code null}
     * @return the same message instance
     * @throws Exception declared for subclasses/callers; failures of a single part are handled here
     */
    protected EnhancerProcessMessage processInternal(EnhancerProcessMessage enhancerProcessMessage, List<DescribedStorageContent<ContentFileHandle>> list) throws Exception {
        String id = enhancerProcessMessage.getId();

        StoredContentPart.StoredContentPartBuilder textPartBuilder = new StoredContentPart.StoredContentPartBuilder(id, EudmlConstants.EXTRACTED_CONTENT_TXT_PART, "text/plain");
        textPartBuilder.enableAutoResolveFileNames();
        StoredContentPart.StoredContentPartBuilder auxiliaryPartBuilder = new StoredContentPart.StoredContentPartBuilder(id, EudmlConstants.CONTENT_AUXILIARY_PART, EudmlConstants.XML_TEXT_MIME_TYPE);
        auxiliaryPartBuilder.enableAutoResolveFileNames();

        // Counts only the parts that were actually processed; used purely for log correlation.
        int partIndex = 0;
        for (DescribedStorageContent<ContentFileHandle> pdfPart : list) {
            if (pdfPart == null || pdfPart.getContent() == null) {
                continue;
            }
            try {
                long startMillis = System.currentTimeMillis();
                String text = extractTextFromPdf(pdfPart.getContent().getFile());
                log.trace("STATS#{}#{};{}", id, Integer.valueOf(partIndex), Long.valueOf(System.currentTimeMillis() - startMillis));
                if (isGarbage(text)) {
                    log.debug("Extracted text for doc with id: {}#{} was considered as garbage.", id, Integer.valueOf(partIndex));
                } else {
                    // Each builder gets its own byte array so the two parts never share a buffer.
                    textPartBuilder.addContent("", text.getBytes(TEXT_ENCODING), pdfPart.getSpecificUses());
                    auxiliaryPartBuilder.addContent("", text.getBytes(TEXT_ENCODING), pdfPart.getSpecificUses());
                    log.debug("Adding plain text of {}#{}", id, Integer.valueOf(partIndex));
                }
                partIndex++;
            } catch (Exception e) {
                log.trace("STATS#{}#{};EXCEPTION:{}", id, Integer.valueOf(partIndex), e.getMessage());
                // Keep the best-effort contract (the message is still returned), but make the
                // failure and its stack trace visible instead of hiding it at TRACE level.
                log.warn("Plain text extraction failed for " + id + "#" + partIndex + "; no extracted text will be attached", e);
                return enhancerProcessMessage;
            }
        }

        StoredContentPart textPart = textPartBuilder.build();
        StoredContentPart auxiliaryPart = auxiliaryPartBuilder.build();
        // Both parts carry the same text, so attach them together and never pass null on.
        if (textPart != null && auxiliaryPart != null) {
            enhancerProcessMessage.addContentPart(textPart);
            enhancerProcessMessage.addContentPart(auxiliaryPart);
        }
        return enhancerProcessMessage;
    }

    /**
     * Loads a PDF, decrypts it with an empty password when it is encrypted, and returns its
     * text after passing it through {@code sanitizeText}.
     *
     * @param file PDF file to read
     * @return sanitized text of the document
     * @throws IOException              if the file cannot be loaded, read or closed
     * @throws InvalidPasswordException if the document is encrypted with a non-empty password
     * @throws CryptographyException    if decryption fails
     * @throws ServiceException         declared for {@code sanitizeText}; presumably raised there
     */
    private String extractTextFromPdf(File file) throws IOException, InvalidPasswordException, CryptographyException, ServiceException {
        PDFTextStripper textStripper = new PDFTextStripper(TEXT_ENCODING);
        PDDocument document = PDDocument.load(file);
        try {
            if (document.isEncrypted()) {
                document.decrypt("");
            }
            // StringWriter holds no system resources (its close() has no effect), so only the
            // document needs explicit cleanup.
            StringWriter textBuffer = new StringWriter();
            textStripper.writeText(document, textBuffer);
            return sanitizeText(textBuffer.toString());
        } finally {
            document.close();
        }
    }
}
