package org.apache.mahout.classifier.bayes;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.GenericsUtil;

/* loaded from: input_file:org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.class */
/**
 * Command-line driver that configures and submits the Hadoop job built from
 * {@link WikipediaDatasetCreatorMapper} and {@link WikipediaDatasetCreatorReducer}.
 *
 * <p>The class only exposes static entry points and cannot be instantiated.
 */
public class WikipediaDatasetCreatorDriver {
    private WikipediaDatasetCreatorDriver() {
    }

    /**
     * Command-line entry point; forwards the first three arguments to
     * {@link #runJob(String, String, String)}.
     *
     * @param strArr {@code [0]} input path, {@code [1]} output path,
     *               {@code [2]} path of the entries file (one entry per line)
     * @throws IllegalArgumentException if fewer than three arguments are supplied
     * @throws IOException if reading the entries file or running the job fails
     */
    public static void main(String[] strArr) throws IOException {
        if (strArr == null || strArr.length < 3) {
            throw new IllegalArgumentException(
                "Usage: WikipediaDatasetCreatorDriver <input> <output> <countries-file>");
        }
        runJob(strArr[0], strArr[1], strArr[2]);
    }

    /**
     * Configures the job and runs it via {@link JobClient#runJob(JobConf)}.
     *
     * <p>If the output path already exists it is deleted recursively before the job is
     * submitted. Every line of the file named by {@code str3} is collected into a set,
     * which is serialized into the job configuration under the key
     * {@code wikipedia.countries}.
     *
     * @param str  input path handed to {@link FileInputFormat}
     * @param str2 output path handed to {@link FileOutputFormat}; removed first if present
     * @param str3 path of a file opened with {@link FileInputStream} and decoded as UTF-8,
     *             holding one entry per line
     * @throws IOException if the entries file cannot be read, the file system cannot be
     *                     accessed, or the job fails
     */
    public static void runJob(String str, String str2, String str3) throws IOException {
        JobClient jobClient = new JobClient();
        JobConf jobConf = new JobConf(WikipediaDatasetCreatorDriver.class);

        jobConf.set("key.value.separator.in.input.line", " ");
        // XmlInputFormat is told which opening and closing tags delimit one record.
        jobConf.set(XmlInputFormat.START_TAG_KEY, "<text xml:space=\"preserve\">");
        jobConf.set(XmlInputFormat.END_TAG_KEY, "</text>");

        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(jobConf, new Path(str));
        Path outputPath = new Path(str2);
        FileOutputFormat.setOutputPath(jobConf, outputPath);

        jobConf.setMapperClass(WikipediaDatasetCreatorMapper.class);
        jobConf.setNumMapTasks(100);
        jobConf.setInputFormat(XmlInputFormat.class);
        jobConf.setReducerClass(WikipediaDatasetCreatorReducer.class);
        jobConf.setOutputFormat(WikipediaDatasetCreatorOutputFormat.class);
        // NOTE(review): JavaSerialization is presumably registered so that the HashSet
        // below (which is not a Writable) can be stringified - confirm.
        jobConf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

        // Clear any previous output so the new run can write to the same location.
        FileSystem fileSystem = FileSystem.get(jobConf);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        HashSet<String> entries = readLines(str3);
        DefaultStringifier<HashSet<String>> stringifier =
            new DefaultStringifier<HashSet<String>>(jobConf, GenericsUtil.getClass(entries));
        jobConf.set("wikipedia.countries", stringifier.toString(entries));

        // NOTE(review): the job is submitted through the static JobClient.runJob below;
        // this instance is only handed the configuration and looks removable - confirm.
        jobClient.setConf(jobConf);
        JobClient.runJob(jobConf);
    }

    /** Reads the UTF-8 file at {@code fileName} and returns its distinct lines. */
    private static HashSet<String> readLines(String fileName) throws IOException {
        HashSet<String> lines = new HashSet<String>();
        BufferedReader reader =
            new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } finally {
            // Close on every path so a failed read does not leak the file handle.
            reader.close();
        }
        return lines;
    }
}
