package eu.eudml.enhancement.pdf2textviaocr.node;

import eu.eudml.enhancement.bibref.EnhancementUtils;
import eu.eudml.enhancement.pdf.PdfExtractorAbstractNode;
import eu.eudml.enhancement.pdf2textviaocr.PdfImageExtractor;
import eu.eudml.enhancement.pdf2textviaocr.Tesseract;
import eu.eudml.enhancement.pdf2textviaocr.TesseractLanguage;
import eu.eudml.enhancement.pdf2textviaocr.Tools;
import eu.eudml.processing.message.EnhancerProcessMessage;
import eu.eudml.service.process.StoredContentPart;
import eu.eudml.service.storage.ContentPart;
import eu.eudml.util.nlm.NlmConstants;
import eu.eudml.util.nlm.NlmProcessorHelper;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Required;
import pl.edu.icm.yadda.process.ctx.ProcessContext;

/* loaded from: input_file:eu/eudml/enhancement/pdf2textviaocr/node/ImageTextExtractorNode.class */
/**
 * Enhancement node that derives a plain-text content part from a PDF by
 * extracting images from the PDF and running them through Tesseract OCR.
 *
 * <p>The node is a no-op when the message already carries an added
 * {@code enhanced/text} part or when no {@code content/raw_content/pdf}
 * content can be loaded. Failures during extraction/OCR are logged and the
 * message is returned without a new part.
 */
public class ImageTextExtractorNode extends PdfExtractorAbstractNode {
    private static final Logger log = LoggerFactory.getLogger(ImageTextExtractorNode.class);

    /** Part ID of the produced text; the same value is also handed to {@link Tesseract#run}. */
    private static final String ENHANCED_TEXT = "enhanced/text";
    /** Part ID under which the raw PDF bytes are looked up. */
    private static final String RAW_PDF_PART_ID = "content/raw_content/pdf";
    /** Charset used both for reading the OCR output files and for encoding the stored text. */
    private static final String UTF_8 = "UTF-8";

    private Tesseract tesseract;

    /**
     * Runs OCR over the images of the message's PDF and, unless the result is
     * classified as garbage, attaches it to the message as a
     * {@code PLAINTEXT_INDEX} content part named {@code <id>.txt}.
     *
     * @param enhancerProcessMessage message being enhanced; its source record must not be {@code null}
     * @param processContext         process context (not used by this node)
     * @return the same message instance, with or without the added text part
     * @throws NullPointerException if the message has no source record
     * @throws Exception            if cleanup after a failed extraction itself fails
     */
    @Override // eu.eudml.enhancement.pdf.PdfExtractorAbstractNode
    public EnhancerProcessMessage process(EnhancerProcessMessage enhancerProcessMessage, ProcessContext processContext) throws Exception {
        if (enhancerProcessMessage.getSourceRecord() == null) {
            throw new NullPointerException("source record of the processed message must not be null");
        }
        if (hasEnhancedText(enhancerProcessMessage)) {
            return enhancerProcessMessage;
        }
        String id = enhancerProcessMessage.getId();
        byte[] content = EnhancementUtils.getContent(this.storage, enhancerProcessMessage, RAW_PDF_PART_ID);
        if (content == null) {
            return enhancerProcessMessage;
        }

        List<File> imageFiles = null;
        List<File> textFiles = null;
        ByteArrayInputStream pdfStream = new ByteArrayInputStream(content);
        try {
            long start = System.currentTimeMillis();
            PdfImageExtractor pdfImageExtractor = new PdfImageExtractor();
            pdfImageExtractor.setPdfInBytes(content);
            imageFiles = pdfImageExtractor.extractImages(pdfStream, id + "pdfToTextViaOCR");

            // Fall back to a default engine if none was injected.
            if (this.tesseract == null) {
                this.tesseract = new Tesseract();
            }
            // OCR language is taken from the NLM metadata, defaulting to English.
            this.tesseract.setLang(TesseractLanguage.byCode(NlmProcessorHelper.stringValue(NlmProcessorHelper.parseNLM(enhancerProcessMessage.getMessageNLM()), NlmConstants.XPATH_LANG, NlmConstants.XLINK_NAMESPACE), TesseractLanguage.English).getTesseractCode());
            textFiles = this.tesseract.run(imageFiles, ENHANCED_TEXT);
            imageFiles.clear();
            String extractedText = getStringFromFiles(textFiles);
            textFiles.clear();
            log.trace("STATS#{};{}", id, System.currentTimeMillis() - start);

            String sanitizedText = sanitizeText(extractedText);
            if (isGarbage(sanitizedText)) {
                log.debug("Extracted text for doc with id: {} was considered as garbage.", id);
            } else {
                byte[] bytes = sanitizedText.getBytes(UTF_8);
                ContentPart contentPart = new ContentPart(id, ENHANCED_TEXT, ContentPart.ContentPartType.PLAINTEXT_INDEX, bytes.length, "text/plain", id + ".txt", new Date());
                log.debug("Adding plain text of " + id);
                enhancerProcessMessage.addContentPart(contentPart, bytes);
            }
            return enhancerProcessMessage;
        } catch (Exception e) {
            // Best effort: a document that cannot be OCR-ed must not fail the whole process.
            log.trace("STATS#{};EXCEPTION:{}", id, e.getMessage());
            log.warn("OCR text extraction failed for document " + id, e);
            // NOTE(review): the extracted image files (imageFiles) are not removed here,
            // matching the previous behaviour; it is not visible from this class whether
            // Tesseract.run / PdfImageExtractor clean them up - confirm.
            if (textFiles != null && !textFiles.isEmpty()) {
                Tools.deleteFilesFromList(textFiles);
            }
            return enhancerProcessMessage;
        } finally {
            pdfStream.close();
        }
    }

    /**
     * Tells whether the message already has an added content part whose ID is
     * {@code enhanced/text}.
     */
    private boolean hasEnhancedText(EnhancerProcessMessage enhancerProcessMessage) {
        for (StoredContentPart part : enhancerProcessMessage.getAddedContentParts()) {
            if (ENHANCED_TEXT.equals(part.getMeta().getPartId())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Concatenates the textual content of the given files, in list order.
     *
     * <p>Files are decoded as UTF-8 so that the result round-trips with the
     * UTF-8 encoding used when the text is stored. Every line is terminated
     * with the platform line separator. A file that is missing or unreadable
     * is logged and skipped; whatever was read from it before the failure is
     * kept.
     *
     * @param list files to read
     * @return the concatenated text, empty if nothing could be read
     */
    public String getStringFromFiles(List<File> list) {
        StringBuilder text = new StringBuilder();
        String newline = getNewline();
        for (File file : list) {
            appendFileContent(file, text, newline);
        }
        return text.toString();
    }

    /** Appends all lines of {@code file} to {@code target}, logging (not propagating) I/O errors. */
    private void appendFileContent(File file, StringBuilder target, String newline) {
        FileInputStream fileInputStream = null;
        BufferedReader bufferedReader = null;
        try {
            fileInputStream = new FileInputStream(file);
            bufferedReader = new BufferedReader(new InputStreamReader(fileInputStream, UTF_8));
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                target.append(line).append(newline);
            }
        } catch (FileNotFoundException e) {
            log.error("given file not found", e);
        } catch (IOException e) {
            log.error("unable to get text from file " + file, e);
        } finally {
            closeQuietly(bufferedReader, fileInputStream);
        }
    }

    /**
     * Closes the reader if it was created (which also closes the stream it
     * wraps), otherwise the bare stream; close failures are only logged.
     */
    private void closeQuietly(BufferedReader bufferedReader, FileInputStream fileInputStream) {
        try {
            if (bufferedReader != null) {
                bufferedReader.close();
            } else if (fileInputStream != null) {
                fileInputStream.close();
            }
        } catch (IOException e) {
            log.debug("unable to close a file stream", e);
        }
    }

    /** Returns the platform line separator. */
    private String getNewline() {
        return System.getProperty("line.separator");
    }

    /**
     * Injects the OCR engine used by {@link #process}.
     *
     * @param tesseract OCR engine wrapper
     */
    @Required
    public void setTesseract(Tesseract tesseract) {
        this.tesseract = tesseract;
    }
}
