package pl.edu.icm.coansys.kwdextraction;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.edu.icm.cermine.PdfNLMContentExtractor;
import pl.edu.icm.cermine.PdfRawTextExtractor;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.coansys.importers.models.DocumentProtos;
import pl.edu.icm.coansys.kwdextraction.langident.LanguageIdentifierBean;

/* loaded from: input_file:pl/edu/icm/coansys/kwdextraction/RakeExtractor.class */
/**
 * Extracts keywords from a text using a RAKE-style procedure.
 *
 * <p>The input text is split into candidate phrases at stopwords, numbers and
 * tokens containing characters outside {@link #ILLEGAL_CHARS}'s allowed set.
 * Every word is then scored as (sum of its co-occurrence counts) divided by
 * (its own occurrence count), and every candidate phrase is scored as the sum
 * of the scores of its words. All of this work happens in the constructor;
 * {@link #getKeywords()} only reads the already sorted candidate list.
 *
 * <p>Text whose detected language differs from the requested one is replaced
 * by an empty string before extraction, so such input yields no keywords.
 */
public class RakeExtractor {
    private static final Logger logger = LoggerFactory.getLogger(RakeExtractor.class);

    /** Matches any character that is not a letter, ASCII digit, hyphen, apostrophe or whitespace. */
    private static final String ILLEGAL_CHARS = "[^\\p{L}0-9-'\\s]";

    // Precompiled once: these are applied to every token of the input text.
    private static final Pattern ILLEGAL_CHARS_PATTERN = Pattern.compile(ILLEGAL_CHARS);
    private static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
    private static final Pattern WHITESPACE_RUN_PATTERN = Pattern.compile("\\s+");

    /**
     * Charset used to decode the stopword resources.
     * NOTE(review): assumes the bundled stopwords_*.txt files are UTF-8 encoded - confirm.
     */
    private static final String STOPWORDS_ENCODING = "UTF-8";

    /** Number of keywords returned by {@link #getKeywords()}. */
    private static final int DEFAULT_KEYWORDS_NUMBER = 8;

    /** Stopword sets per language, loaded once in the static initializer at the end of the class. */
    private static final Map<Lang, Set<String>> stopwords;

    /** Text the keywords are extracted from (already filtered by language). */
    private String content;
    private Lang lang;
    private ExtractionOption extractionOption;
    /** Unique candidate phrases; sorted by {@link #countMetrics()}. */
    private List<KeywordCandidate> keywordCandidates;
    /** word -> (co-occurring word -> count); each word's row also contains the word itself. */
    private Map<String, Map<String, Integer>> cooccurrences;

    /** Selects which parts of a {@code DocumentWrapper} contribute text. */
    public enum ExtractionOption {
        CONTENT(true, false),
        ABSTRACT(false, true),
        CONTENT_AND_ABSTRACT(true, true);

        private final boolean fromContent;
        private final boolean fromAbstract;

        ExtractionOption(boolean fromContent, boolean fromAbstract) {
            this.fromContent = fromContent;
            this.fromAbstract = fromAbstract;
        }
    }

    /** Supported languages with their language code and stopword resource name. */
    public enum Lang {
        PL("pl", "stopwords_pl.txt"),
        FR("fr", "stopwords_fr.txt"),
        EN("en", "stopwords_en.txt");

        private final String langCode;
        private final String stopwordsPath;

        Lang(String langCode, String stopwordsPath) {
            this.langCode = langCode;
            this.stopwordsPath = stopwordsPath;
        }
    }

    /**
     * Extracts keywords from plain text.
     *
     * @param text     text to analyse
     * @param langCode "fr" or "pl"; any other value (including {@code null}) selects English
     * @throws IOException propagated from language identification
     */
    public RakeExtractor(String text, String langCode) throws IOException {
        setLang(langCode);
        this.content = filterTextByLang(text, this.lang.langCode);
        prepareToExtraction();
    }

    /**
     * Extracts keywords from the text of a PDF document.
     *
     * @param pdf      raw PDF bytes
     * @param langCode "fr" or "pl"; any other value (including {@code null}) selects English
     * @throws AnalysisException if PDF text extraction fails
     * @throws IOException       propagated from text extraction or language identification
     */
    public RakeExtractor(byte[] pdf, String langCode) throws AnalysisException, IOException {
        setLang(langCode);
        this.content = extractTextFromPdf(pdf, this.lang);
        prepareToExtraction();
    }

    /**
     * Extracts keywords from a document wrapper, using its PDF/TXT media and/or its abstracts.
     *
     * @param documentWrapper document to analyse
     * @param option          name of an {@link ExtractionOption} constant
     * @param langCode        "fr" or "pl"; any other value (including {@code null}) selects English
     * @throws IOException              propagated from language identification
     * @throws IllegalArgumentException if {@code option} is not an {@link ExtractionOption} name
     */
    public RakeExtractor(DocumentProtos.DocumentWrapper documentWrapper, String option, String langCode)
            throws IOException {
        setLang(langCode);
        setOption(option);
        StringBuilder text = new StringBuilder();
        if (this.extractionOption.fromContent) {
            for (DocumentProtos.Media media : documentWrapper.getMediaContainer().getMediaList()) {
                String mediaType = media.getMediaType();
                if ("media.type.pdf".equals(mediaType)) {
                    try {
                        text.append(extractTextFromPdf(media.getContent().toByteArray(), this.lang));
                    } catch (Exception e) {
                        // Best effort: one unreadable PDF must not abort the whole document.
                        logger.error("Cannot extract text from PDF: " + e + " " + media.getSourcePath(), e);
                    }
                } else if ("media.type.txt".equals(mediaType)) {
                    text.append(filterTextByLang(media.getContent().toStringUtf8(), this.lang.langCode));
                }
                text.append("\n");
            }
        }
        if (this.extractionOption.fromAbstract) {
            for (Object item : documentWrapper.getDocumentMetadata().getDocumentAbstractList()) {
                DocumentProtos.TextWithLanguage documentAbstract = (DocumentProtos.TextWithLanguage) item;
                text.append(filterTextByLang(documentAbstract.getText(), this.lang.langCode));
                // Separate abstracts the same way media items are separated, so the last
                // word of one abstract cannot fuse with the first word of the next.
                text.append("\n");
            }
        }
        this.content = text.toString();
        prepareToExtraction();
    }

    /**
     * Extracts the text of a PDF and filters it by language.
     *
     * <p>The "body" child of the NLM content is used first; if its value is null or
     * empty, the raw text extractor is used instead.
     * NOTE(review): a result without a "body" child makes {@code getChild("body")}
     * presumably return null and this method fail with a NullPointerException - confirm.
     */
    private String extractTextFromPdf(byte[] pdf, Lang lang) throws IOException, AnalysisException {
        String text = new PdfNLMContentExtractor().extractContent(new ByteArrayInputStream(pdf)).getChild("body").getValue();
        if (text == null || text.isEmpty()) {
            text = (String) new PdfRawTextExtractor().extractText(new ByteArrayInputStream(pdf));
        }
        return filterTextByLang(text, lang.langCode);
    }

    /** Returns {@code text} if the language identifier classifies it as {@code langCode}, otherwise "". */
    private String filterTextByLang(String text, String langCode) throws IOException {
        return langCode.equals(new LanguageIdentifierBean().classify(text)) ? text : "";
    }

    /** Runs the three extraction phases in order; each phase consumes the previous one's result. */
    private void prepareToExtraction() throws IOException {
        extractKeywordCandidates();
        countCooccurrences();
        countMetrics();
    }

    /**
     * Loads the stopword list of a language from the classpath, one word per line.
     * Lines are trimmed and blank lines are skipped.
     *
     * @throws IOException if the resource cannot be found or read
     */
    private static Set<String> loadStopwords(Lang lang) throws IOException {
        ClassLoader loader = RakeExtractor.class.getClassLoader();
        // getResourceAsStream reports a missing resource by returning null, not by throwing.
        InputStream stream = loader.getResourceAsStream(lang.stopwordsPath);
        if (stream == null) {
            stream = loader.getResourceAsStream("/" + lang.stopwordsPath);
        }
        if (stream == null) {
            throw new IOException("Stopwords resource not found on classpath: " + lang.stopwordsPath);
        }

        Set<String> words = new HashSet<String>();
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, STOPWORDS_ENCODING));
        try {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String word = line.trim();
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        } finally {
            reader.close();
        }
        return words;
    }

    /**
     * Splits {@link #content} into candidate phrases and stores the unique ones in
     * {@link #keywordCandidates}.
     *
     * <p>The text is tokenized with a word {@link BreakIterator}. Whitespace-only tokens
     * are ignored. A token ends the current phrase when it is a stopword, a number, or
     * contains a character matched by {@link #ILLEGAL_CHARS}; every other token is added
     * to the current phrase. A phrase seen more than once increments the counter of the
     * first candidate registered for it.
     */
    private void extractKeywordCandidates() {
        Map<String, KeywordCandidate> candidatesByKeyword = new HashMap<String, KeywordCandidate>();
        Set<String> languageStopwords = stopwords.get(this.lang);
        BreakIterator words = BreakIterator.getWordInstance();
        words.setText(this.content);

        int tokenStart = words.first();
        int phraseStart = tokenStart;
        KeywordCandidate current = new KeywordCandidate();
        boolean currentHasWords = false;

        for (int tokenEnd = words.next(); tokenEnd != BreakIterator.DONE;
                tokenStart = tokenEnd, tokenEnd = words.next()) {
            // Locale.ROOT keeps lower-casing independent of the JVM default locale.
            String token = this.content.substring(tokenStart, tokenEnd).trim().toLowerCase(Locale.ROOT);
            if (token.isEmpty()) {
                continue;
            }
            if (isPhraseDelimiter(token, languageStopwords)) {
                registerCandidate(candidatesByKeyword, current, this.content.substring(phraseStart, tokenStart));
                phraseStart = tokenEnd;
                current = new KeywordCandidate();
                currentHasWords = false;
            } else {
                current.addWord(token);
                currentHasWords = true;
            }
        }
        // Flush the phrase still open at the end of the text. This also covers text that
        // ends with whitespace, where no token ends exactly at content.length().
        if (currentHasWords) {
            registerCandidate(candidatesByKeyword, current, this.content.substring(phraseStart));
        }

        this.keywordCandidates = new ArrayList<KeywordCandidate>(candidatesByKeyword.values());
    }

    /**
     * Tells whether a lower-cased, non-empty token terminates the current phrase.
     * NOTE(review): \W is ASCII-only by default, so a token made solely of non-ASCII
     * letters also matches and is treated as a delimiter - confirm this is intended.
     */
    private static boolean isPhraseDelimiter(String token, Set<String> languageStopwords) {
        return languageStopwords.contains(token)
                || NON_WORD_PATTERN.matcher(token).matches()
                || isNum(token)
                || !token.equals(ILLEGAL_CHARS_PATTERN.matcher(token).replaceAll(""));
    }

    /**
     * Normalizes a raw phrase (trim, lower-case, strip illegal characters, collapse
     * whitespace runs to one space) and records it: a known phrase gets its counter
     * incremented, a new non-empty phrase registers {@code candidate} under it.
     */
    private static void registerCandidate(Map<String, KeywordCandidate> candidatesByKeyword,
            KeywordCandidate candidate, String rawPhrase) {
        String stripped = ILLEGAL_CHARS_PATTERN.matcher(rawPhrase.trim().toLowerCase(Locale.ROOT)).replaceAll("");
        String keyword = WHITESPACE_RUN_PATTERN.matcher(stripped).replaceAll(" ");
        if (keyword.isEmpty()) {
            return;
        }
        KeywordCandidate known = candidatesByKeyword.get(keyword);
        if (known != null) {
            known.incCounter();
        } else {
            candidate.setKeyword(keyword);
            candidatesByKeyword.put(keyword, candidate);
        }
    }

    /**
     * Builds {@link #cooccurrences}: for every candidate, each ordered pair of its words
     * (including a word paired with itself) is credited with the candidate's counter.
     *
     * <p>Counts are accumulated by addition, so the result does not depend on the order
     * in which candidates are visited.
     */
    private void countCooccurrences() {
        this.cooccurrences = new HashMap<String, Map<String, Integer>>();
        for (KeywordCandidate candidate : this.keywordCandidates) {
            int occurrences = candidate.getCounter();
            for (String word : candidate.getWords()) {
                Map<String, Integer> row = this.cooccurrences.get(word);
                if (row == null) {
                    row = new HashMap<String, Integer>();
                    this.cooccurrences.put(word, row);
                }
                for (String coword : candidate.getWords()) {
                    Integer previous = row.get(coword);
                    int updated = (previous == null) ? occurrences : previous.intValue() + occurrences;
                    row.put(coword, Integer.valueOf(updated));
                }
            }
        }
    }

    /**
     * Scores words and candidates, then sorts the candidates.
     *
     * <p>A word's score is the sum of its co-occurrence row divided by its co-occurrence
     * with itself; a candidate's score is the sum of its words' scores. Sorting uses
     * {@code KeywordCandidate}'s natural ordering (presumably best score first - confirm
     * against {@code KeywordCandidate.compareTo}).
     */
    private void countMetrics() {
        Map<String, Double> wordScores = new HashMap<String, Double>();
        for (Map.Entry<String, Map<String, Integer>> row : this.cooccurrences.entrySet()) {
            int degree = 0;
            for (Integer count : row.getValue().values()) {
                degree += count.intValue();
            }
            // Every row contains its own word, because countCooccurrences pairs each word with itself.
            int frequency = row.getValue().get(row.getKey()).intValue();
            wordScores.put(row.getKey(), Double.valueOf((1.0d * degree) / frequency));
        }
        for (KeywordCandidate candidate : this.keywordCandidates) {
            double score = 0.0d;
            for (String word : candidate.getWords()) {
                score += wordScores.get(word).doubleValue();
            }
            candidate.setScore(score);
        }
        Collections.sort(this.keywordCandidates);
    }

    /** Returns the keywords of the first {@code limit} sorted candidates (fewer if not enough exist). */
    private List<String> choiceKeywords(int limit) {
        int count = Math.min(limit, this.keywordCandidates.size());
        List<String> keywords = new ArrayList<String>();
        for (int index = 0; index < count; index++) {
            keywords.add(this.keywordCandidates.get(index).getKeyword());
        }
        return keywords;
    }

    /**
     * Returns up to {@value #DEFAULT_KEYWORDS_NUMBER} keywords, in sorted candidate order.
     *
     * @return a new list; empty when no candidate was found
     */
    public List<String> getKeywords() {
        return choiceKeywords(DEFAULT_KEYWORDS_NUMBER);
    }

    /**
     * Returns up to {@code limit} keywords, in sorted candidate order.
     *
     * @param limit maximum number of keywords; zero or a negative value yields an empty list
     * @return a new list; empty when no candidate was found
     */
    public List<String> getKeywords(int limit) {
        return choiceKeywords(limit);
    }

    /** Maps a language code to a {@link Lang}; anything other than "fr" or "pl" selects English. */
    private void setLang(String langCode) {
        if ("fr".equals(langCode)) {
            this.lang = Lang.FR;
        } else if ("pl".equals(langCode)) {
            this.lang = Lang.PL;
        } else {
            this.lang = Lang.EN;
        }
    }

    /** Resolves an option by constant name; {@code Enum.valueOf} rejects unknown names. */
    private void setOption(String option) {
        this.extractionOption = ExtractionOption.valueOf(option);
    }

    /** Tells whether {@link Double#parseDouble(String)} accepts the given token. */
    private static boolean isNum(String token) {
        try {
            Double.parseDouble(token);
            return true;
        } catch (NumberFormatException e) {
            return false;
        }
    }

    /**
     * Lists the language codes of all {@link Lang} constants, in declaration order.
     *
     * @return a new mutable list
     */
    public static List<String> getSupportedLanguages() {
        List<String> codes = new ArrayList<String>();
        for (Lang supported : Lang.values()) {
            codes.add(supported.langCode);
        }
        return codes;
    }

    /**
     * Lists the names of all {@link ExtractionOption} constants, in declaration order.
     *
     * @return a new mutable list
     */
    public static List<String> getAvailableExtractionOptions() {
        List<String> names = new ArrayList<String>();
        for (ExtractionOption option : ExtractionOption.values()) {
            names.add(option.name());
        }
        return names;
    }

    // Loads every language's stopwords once; a failure makes the class unusable, so it is
    // logged and rethrown to abort class initialization.
    static {
        Map<Lang, Set<String>> loaded = new EnumMap<Lang, Set<String>>(Lang.class);
        try {
            for (Lang supported : Lang.values()) {
                loaded.put(supported, loadStopwords(supported));
            }
        } catch (IOException e) {
            logger.error("Unable to load stopwords: " + e, e);
            throw new RuntimeException(e);
        }
        stopwords = loaded;
    }
}
