package eu.eudml.enhancement.pdf2textviaocr.node;

import com.google.common.io.Files;
import eu.eudml.EudmlConstants;
import eu.eudml.enhancement.pdf.FullTextProcessingAbstractNode;
import eu.eudml.enhancement.pdf2textviaocr.PdfImageExtractor;
import eu.eudml.enhancement.pdf2textviaocr.Tesseract;
import eu.eudml.enhancement.pdf2textviaocr.TesseractLanguage;
import eu.eudml.enhancement.pdf2textviaocr.Tools;
import eu.eudml.processing.message.EnhancerProcessMessage;
import eu.eudml.processing.node.util.DescribedStorageContent;
import eu.eudml.service.process.StoredContentPart;
import eu.eudml.service.storage.ContentFileHandle;
import eu.eudml.util.nlm.NlmConstants;
import eu.eudml.util.nlm.NlmProcessorHelper;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Required;
import pl.edu.icm.yadda.process.ctx.ProcessContext;

/* loaded from: input_file:WEB-INF/lib/eudml-processing-2.0.6-SNAPSHOT.jar:eu/eudml/enhancement/pdf2textviaocr/node/ImageTextExtractorNode.class */
/**
 * Processing node that obtains plain text from PDF content parts via OCR.
 *
 * <p>For every stored PDF part of a message the node extracts images with
 * {@link PdfImageExtractor}, runs {@link Tesseract} over them, concatenates the
 * resulting text files and, unless the text is classified as garbage, adds it to
 * both an extracted-text content part and an auxiliary content part.
 *
 * <p>{@code sanitizeText}, {@code isGarbage} and {@code getSavedProcessableParts}
 * are defined outside this class (not visible here).
 */
public class ImageTextExtractorNode extends FullTextProcessingAbstractNode {
    private static final Logger log = LoggerFactory.getLogger(ImageTextExtractorNode.class);
    private static final String NEW_LINE = System.getProperty("line.separator");

    /** Charset used both for reading OCR output files and for encoding the stored text. */
    private static final String TEXT_ENCODING = "UTF-8";

    private Tesseract tesseract;

    /**
     * Runs OCR over every stored PDF part of the message and attaches the recognized text.
     *
     * <p>The message is returned untouched when it already carries an extracted-text
     * part or has no processable PDF parts. If OCR of any part fails with an
     * {@link Exception}, the failure is logged and the message is returned
     * immediately without any of the text gathered so far being attached.
     *
     * @param enhancerProcessMessage message whose PDF parts should be processed
     * @param processContext         processing context (not used by this node)
     * @return the same message instance, possibly with new content parts added
     * @throws Exception if reading a stored PDF file fails or building the parts fails
     */
    @Override
    public EnhancerProcessMessage process(EnhancerProcessMessage enhancerProcessMessage, ProcessContext processContext) throws Exception {
        String id = enhancerProcessMessage.getId();
        if (enhancerProcessMessage.getAddedContentPart(EudmlConstants.EXTRACTED_CONTENT_TXT_PART) != null) {
            // Text was already extracted by an earlier node; OCR is not needed.
            return enhancerProcessMessage;
        }
        List<DescribedStorageContent<ContentFileHandle>> pdfParts = getSavedProcessableParts(enhancerProcessMessage, EudmlConstants.SOURCE_EUDML_CONTENT_PDF_PART);
        if (pdfParts == null || pdfParts.isEmpty()) {
            return enhancerProcessMessage;
        }

        StoredContentPart.StoredContentPartBuilder plainTextBuilder = new StoredContentPart.StoredContentPartBuilder(id, EudmlConstants.EXTRACTED_CONTENT_TXT_PART, "text/plain");
        plainTextBuilder.enableAutoResolveFileNames();
        StoredContentPart.StoredContentPartBuilder auxiliaryBuilder = new StoredContentPart.StoredContentPartBuilder(id, EudmlConstants.CONTENT_AUXILIARY_PART, EudmlConstants.XML_TEXT_MIME_TYPE);
        auxiliaryBuilder.enableAutoResolveFileNames();

        int partIndex = 0;
        for (DescribedStorageContent<ContentFileHandle> pdfPart : pdfParts) {
            byte[] pdfBytes = Files.toByteArray(pdfPart.getContent().getFile());
            ByteArrayInputStream pdfStream = new ByteArrayInputStream(pdfBytes);
            List<File> imageFiles = null;
            List<File> textFiles = null;
            try {
                long startMillis = System.currentTimeMillis();
                PdfImageExtractor pdfImageExtractor = new PdfImageExtractor();
                pdfImageExtractor.setPdfInBytes(pdfBytes);
                imageFiles = pdfImageExtractor.extractImages(pdfStream, id + "pdfToTextViaOCR");

                if (this.tesseract == null) {
                    this.tesseract = new Tesseract();
                }
                // OCR language comes from the message's NLM metadata, defaulting to English.
                this.tesseract.setLang(
                        TesseractLanguage.byCode(
                                NlmProcessorHelper.stringValue(
                                        NlmProcessorHelper.parseNLM(enhancerProcessMessage.getMessageNLM()),
                                        NlmConstants.XPATH_LANG,
                                        NlmConstants.XLINK_NAMESPACE),
                                TesseractLanguage.English)
                                .getTesseractCode());

                textFiles = this.tesseract.run(imageFiles, "TESSERACT-txt-" + id);
                // NOTE(review): clearing only empties the list; whether the files themselves
                // are removed elsewhere on the success path is not visible here - confirm.
                imageFiles.clear();
                String rawText = getStringFromFiles(textFiles);
                textFiles.clear();
                log.trace("STATS#{}#{};{}", id, partIndex, System.currentTimeMillis() - startMillis);

                String sanitizedText = sanitizeText(rawText);
                if (isGarbage(sanitizedText)) {
                    log.debug("Extracted text for doc with id: {}#{} was considered as garbage.", id, partIndex);
                } else {
                    plainTextBuilder.addContent("", sanitizedText.getBytes(TEXT_ENCODING), pdfPart.getSpecificUses());
                    auxiliaryBuilder.addContent("", sanitizedText.getBytes(TEXT_ENCODING), pdfPart.getSpecificUses());
                    log.debug("Adding plain text of {}#{}", id, partIndex);
                }
            } catch (Exception e) {
                log.trace("STATS#{}#{};EXCEPTION:{}", id, partIndex, e.getMessage());
                // Keep the stack trace: the STATS line above records only the message.
                log.warn("OCR text extraction failed for " + id + "#" + partIndex, e);
                return enhancerProcessMessage;
            } finally {
                // Lists are cleared on success, so anything still listed here is left over
                // from a failed run and must not be leaked on disk.
                deleteLeftoverFiles(imageFiles);
                deleteLeftoverFiles(textFiles);
                pdfStream.close();
            }
            partIndex++;
        }

        StoredContentPart plainTextPart = plainTextBuilder.build();
        StoredContentPart auxiliaryPart = auxiliaryBuilder.build();
        // NOTE(review): only the auxiliary part is null-checked although both parts are added;
        // both builders receive identical content, so presumably they are null together - confirm.
        if (auxiliaryPart != null) {
            enhancerProcessMessage.addContentPart(plainTextPart);
            enhancerProcessMessage.addContentPart(auxiliaryPart);
        }
        return enhancerProcessMessage;
    }

    /**
     * Deletes the given temporary files if any are listed; failures are logged, never thrown,
     * so that cleanup cannot mask the outcome of the surrounding OCR attempt.
     *
     * @param files files to delete; may be {@code null} or empty, in which case nothing happens
     */
    private void deleteLeftoverFiles(List<File> files) {
        if (files == null || files.isEmpty()) {
            return;
        }
        try {
            Tools.deleteFilesFromList(files);
        } catch (Exception e) {
            log.debug("unable to delete temporary OCR files", e);
        }
    }

    /**
     * Concatenates the contents of the given text files, line by line, terminating every
     * line with the platform line separator.
     *
     * <p>Files are decoded as UTF-8. A file that cannot be opened or read is logged and
     * skipped; lines read from it before the failure are kept.
     *
     * @param list files to read, in order
     * @return the concatenated text; empty when nothing could be read
     */
    private String getStringFromFiles(List<File> list) {
        StringBuilder text = new StringBuilder();
        for (File file : list) {
            FileInputStream fileStream = null;
            BufferedReader reader = null;
            try {
                fileStream = new FileInputStream(file);
                // Explicit charset: the platform default would corrupt non-ASCII OCR output.
                reader = new BufferedReader(new InputStreamReader(fileStream, TEXT_ENCODING));
                String line;
                while ((line = reader.readLine()) != null) {
                    text.append(line);
                    text.append(NEW_LINE);
                }
            } catch (FileNotFoundException e) {
                log.error("given file not found", e);
            } catch (IOException e) {
                log.error("unable to get text from file " + file, e);
            } finally {
                try {
                    if (reader != null) {
                        // Closing the reader also closes the wrapped file stream.
                        reader.close();
                    } else if (fileStream != null) {
                        fileStream.close();
                    }
                } catch (IOException e) {
                    log.debug("unable to close a file stream", e);
                }
            }
        }
        return text.toString();
    }

    /**
     * Injects the Tesseract wrapper used for OCR.
     *
     * @param tesseract OCR engine wrapper to use
     */
    @Required
    public void setTesseract(Tesseract tesseract) {
        this.tesseract = tesseract;
    }
}
