package org.apache.mahout.classifier.bayes;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/* loaded from: input_file:org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.class */
/**
 * Mapper that turns a Wikipedia article into a labelled training record.
 *
 * <p>For each input value the mapper extracts the {@code [[Category:...]]} entries, looks for a
 * configured country name inside them and, when one is found, emits the country (spaces replaced
 * by underscores) as the key and the analyzed article text (tokens separated by single spaces) as
 * the value. Articles for which no country is found are not emitted.
 *
 * <p>The set of country names is read once from the {@code wikipedia.countries} job property in
 * {@link #configure(JobConf)} and kept in a static field.
 */
public class WikipediaDatasetCreatorMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
    /** Label returned by {@link #getCountry(Set)} when no configured country matches. */
    private static final String UNKNOWN_COUNTRY = "Unknown";

    /** Opening marker of a category entry in the article markup. */
    private static final String CATEGORY_OPEN = "[[Category:";

    /** Closing marker of a category entry in the article markup. */
    private static final String CATEGORY_CLOSE = "]]";

    /** Country names to look for; populated by {@link #configure(JobConf)}. */
    private static Set<String> countries;

    /**
     * Emits one (country, tokenized text) pair for an article whose categories mention a
     * configured country; emits nothing otherwise.
     *
     * @param longWritable    input key (not used)
     * @param text            the article markup
     * @param outputCollector receives the country label and the space-separated tokens
     * @param reporter        progress reporter (not used)
     * @throws IOException if tokenizing the text or collecting the output fails
     */
    public void map(LongWritable longWritable, Text text, OutputCollector<Text, Text> outputCollector, Reporter reporter) throws IOException {
        String document = text.toString();
        String country = getCountry(new HashSet<String>(findAllCategories(document)));
        if (UNKNOWN_COUNTRY.equals(country)) {
            // No label for this article: skip it before doing any analysis work.
            return;
        }

        // Strip the surrounding <text> element and decode HTML entities before analysis.
        String body = StringEscapeUtils.unescapeHtml(
                document.replaceFirst("<text xml:space=\"preserve\">", "").replaceAll("</text>", ""));

        StringBuilder contents = new StringBuilder();
        TokenStream tokenStream = new StandardAnalyzer().tokenStream(country, new StringReader(body));
        try {
            for (Token token = tokenStream.next(); token != null; token = tokenStream.next()) {
                contents.append(token.termBuffer(), 0, token.termLength()).append(' ');
            }
        } finally {
            // Release the stream (and the reader behind it) even if tokenizing fails.
            tokenStream.close();
        }

        outputCollector.collect(new Text(country.replace(" ", "_")), new Text(contents.toString()));
    }

    /**
     * Finds a configured country name contained in one of the given categories.
     *
     * @param set category strings to inspect
     * @return the first configured country that occurs as a substring of a category (in the
     *         iteration order of the two sets), or {@code "Unknown"} when there is no match, when
     *         {@code set} is {@code null}, or when no countries have been configured yet
     */
    public static String getCountry(Set<String> set) {
        Set<String> known = countries;
        if (set == null || known == null) {
            return UNKNOWN_COUNTRY;
        }
        for (String category : set) {
            for (String candidate : known) {
                if (category.contains(candidate)) {
                    return candidate;
                }
            }
        }
        return UNKNOWN_COUNTRY;
    }

    /**
     * Extracts the text of every {@code [[Category:...]]} entry from the given markup.
     *
     * @param str the article markup
     * @return the category texts (between {@code [[Category:} and the next {@code ]]}) in order
     *         of appearance; an entry with no closing {@code ]]} ends the scan
     */
    public static List<String> findAllCategories(String str) {
        List<String> categories = new ArrayList<String>();
        int from = 0;
        while (true) {
            int open = str.indexOf(CATEGORY_OPEN, from);
            if (open < 0) {
                break;
            }
            int start = open + CATEGORY_OPEN.length();
            int end = str.indexOf(CATEGORY_CLOSE, start);
            if (end < 0) {
                // Unterminated entry: nothing further can be parsed reliably.
                break;
            }
            categories.add(str.substring(start, end));
            // Resume after the closing marker; an opening marker cannot begin inside it.
            from = end + CATEGORY_CLOSE.length();
        }
        return categories;
    }

    /**
     * Loads the country set from the {@code wikipedia.countries} job property the first time it
     * is called; later calls leave the already-loaded set untouched. When the property is absent
     * an empty set is used.
     *
     * @param jobConf the job configuration
     * @throws RuntimeException wrapping the {@link IOException} if the property cannot be
     *                          deserialized
     */
    public void configure(JobConf jobConf) {
        if (countries != null) {
            return;
        }
        try {
            Set<String> defaults = new HashSet<String>();
            DefaultStringifier<Set<String>> stringifier =
                    new DefaultStringifier<Set<String>>(jobConf, GenericsUtil.getClass(defaults));
            // Fall back to the serialized empty set when the property is not set.
            String serialized = jobConf.get("wikipedia.countries", stringifier.toString(defaults));
            countries = stringifier.fromString(serialized);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
