package org.apache.mahout.classifier.bayes;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.mahout.analysis.WikipediaAnalyzer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.class */
public class WikipediaDatasetCreatorMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
    private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);
    private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s\\W]");
    private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern.compile("<text xml:space=\"preserve\">");
    private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern.compile("</text>");
    private Set<String> inputCategories = null;
    private boolean exactMatchOnly = false;
    private Analyzer analyzer;

    public void map(LongWritable longWritable, Text text, OutputCollector<Text, Text> outputCollector, Reporter reporter) throws IOException {
        StringBuilder sb = new StringBuilder();
        String text2 = text.toString();
        String findMatchingCategory = findMatchingCategory(text2);
        if (findMatchingCategory.equals("Unknown")) {
            return;
        }
        TokenStream tokenStream = this.analyzer.tokenStream(findMatchingCategory, new StringReader(StringEscapeUtils.unescapeHtml(CLOSE_TEXT_TAG_PATTERN.matcher(OPEN_TEXT_TAG_PATTERN.matcher(text2).replaceFirst("")).replaceAll(""))));
        Token token = new Token();
        while (true) {
            Token next = tokenStream.next(token);
            token = next;
            if (next == null) {
                outputCollector.collect(new Text(SPACE_NON_ALPHA_PATTERN.matcher(findMatchingCategory).replaceAll("_")), new Text(sb.toString()));
                return;
            }
            sb.append(token.termBuffer(), 0, token.termLength()).append(' ');
        }
    }

    private String findMatchingCategory(String str) {
        int i = 0;
        while (true) {
            int indexOf = str.indexOf("[[Category:", i);
            if (indexOf == -1) {
                return "Unknown";
            }
            int i2 = indexOf + 11;
            int indexOf2 = str.indexOf("]]", i2);
            if (indexOf2 >= str.length() || indexOf2 < 0) {
                return "Unknown";
            }
            String trim = str.substring(i2, indexOf2).toLowerCase().trim();
            if (this.exactMatchOnly && this.inputCategories.contains(trim)) {
                return trim;
            }
            if (!this.exactMatchOnly) {
                for (String str2 : this.inputCategories) {
                    if (trim.contains(str2)) {
                        return str2;
                    }
                }
            }
            i = indexOf2;
        }
    }

    public void configure(JobConf jobConf) {
        try {
            if (this.inputCategories == null) {
                HashSet hashSet = new HashSet();
                DefaultStringifier defaultStringifier = new DefaultStringifier(jobConf, GenericsUtil.getClass(hashSet));
                this.inputCategories = (Set) defaultStringifier.fromString(jobConf.get("wikipedia.categories", defaultStringifier.toString(hashSet)));
            }
            this.exactMatchOnly = jobConf.getBoolean("exact.match.only", false);
            if (this.analyzer == null) {
                this.analyzer = (Analyzer) Class.forName(jobConf.get("analyzer.class", WikipediaAnalyzer.class.getName())).newInstance();
            }
            log.info("Configure: Input Categories size: " + this.inputCategories.size() + " Exact Match: " + this.exactMatchOnly + " Analyzer: " + this.analyzer.getClass().getName());
        } catch (IOException e) {
            throw new IllegalStateException(e);
        } catch (ClassNotFoundException e2) {
            throw new IllegalStateException(e2);
        } catch (IllegalAccessException e3) {
            throw new IllegalStateException(e3);
        } catch (InstantiationException e4) {
            throw new IllegalStateException(e4);
        }
    }

    public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
        map((LongWritable) obj, (Text) obj2, (OutputCollector<Text, Text>) outputCollector, reporter);
    }
}
