package pl.edu.icm.coansys.kwdextraction.rake;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import pl.edu.icm.coansys.kwdextraction.utils.IntStringPair;

/* loaded from: input_file:pl/edu/icm/coansys/kwdextraction/rake/RakeMap1.class */
public class RakeMap1 extends MapReduceBase implements Mapper<IntStringPair, IntStringPair, Text, Text> {
    public static int MAX_LENGTH;
    public static String PATH_TO_STOPWORDS;
    protected Set<String> stopwords = getStopwords();

    private Set<String> getStopwords() {
        ArrayList arrayList = new ArrayList();
        try {
            BufferedReader bufferedReader = new BufferedReader(new FileReader(PATH_TO_STOPWORDS));
            while (true) {
                String readLine = bufferedReader.readLine();
                if (readLine == null) {
                    break;
                }
                arrayList.add(readLine);
            }
            bufferedReader.close();
        } catch (IOException e) {
        }
        return new HashSet(arrayList);
    }

    protected String cleanWord(String str) {
        return str.trim().toLowerCase();
    }

    protected boolean isNum(String str) {
        try {
            Double.parseDouble(str);
            return true;
        } catch (NumberFormatException e) {
            return false;
        }
    }

    public void map(IntStringPair intStringPair, IntStringPair intStringPair2, OutputCollector<Text, Text> outputCollector, Reporter reporter) throws IOException {
        String second = intStringPair2.getSecond();
        BreakIterator wordInstance = BreakIterator.getWordInstance();
        wordInstance.setText(second);
        int first = wordInstance.first();
        int i = first;
        while (true) {
            int next = wordInstance.next();
            if (next == -1) {
                return;
            }
            String cleanWord = cleanWord(second.substring(first, next));
            String replaceAll = cleanWord.replaceAll("[^a-zA-Z0-9-\\s]", "");
            if (this.stopwords.contains(cleanWord) || cleanWord.matches("\\W+") || isNum(cleanWord) || !cleanWord.matches(replaceAll)) {
                String cleanWord2 = cleanWord(second.substring(i, first));
                if (cleanWord2.length() > 0) {
                    String[] split = cleanWord2.split("\\s");
                    if (split.length <= MAX_LENGTH) {
                        for (String str : split) {
                            outputCollector.collect(new Text(str), new Text(cleanWord2));
                        }
                    }
                }
                i = next;
            } else if (next == second.length()) {
                String cleanWord3 = cleanWord(second.substring(i, next));
                if (cleanWord3.length() > 0) {
                    String[] split2 = cleanWord3.split("\\s");
                    if (split2.length <= MAX_LENGTH) {
                        for (String str2 : split2) {
                            outputCollector.collect(new Text(str2), new Text(cleanWord3));
                        }
                    }
                }
            }
            first = next;
        }
    }

    public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
        map((IntStringPair) obj, (IntStringPair) obj2, (OutputCollector<Text, Text>) outputCollector, reporter);
    }
}
