package org.apache.mahout.text;

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.mahout.classifier.bayes.XmlInputFormat;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.FileLineIterable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/mahout/text/WikipediaToSequenceFile.class */
/**
 * Command-line driver that configures and submits a Hadoop map-reduce job which reads
 * {@code <page>...</page>} XML records via {@link XmlInputFormat}, maps them with
 * {@code WikipediaMapper}, passes them through an {@link IdentityReducer} and writes
 * {@link Text}/{@link Text} pairs using {@link SequenceFileOutputFormat}.
 *
 * <p>This is a static utility class and cannot be instantiated.
 */
public final class WikipediaToSequenceFile {
    private static final Logger log = LoggerFactory.getLogger(WikipediaToSequenceFile.class);

    private WikipediaToSequenceFile() {
    }

    /**
     * Parses the command line and launches the job.
     *
     * <p>Recognised options: {@code --input/-i} (required), {@code --output/-o} (required),
     * {@code --categories/-c}, {@code --exactMatch/-e}, {@code --all/-all} and {@code --help/-h}.
     * When the arguments cannot be parsed, the error is logged and the help text is printed.
     *
     * @param args raw command-line arguments
     * @throws IOException if {@link #runJob} fails with an I/O error
     */
    public static void main(String[] args) throws IOException {
        DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
        ArgumentBuilder argumentBuilder = new ArgumentBuilder();
        GroupBuilder groupBuilder = new GroupBuilder();

        DefaultOption inputOpt = optionBuilder.withLongName("input").withRequired(true)
                .withArgument(argumentBuilder.withName("input").withMinimum(1).withMaximum(1).create())
                .withDescription("The input directory path").withShortName("i").create();
        DefaultOption outputOpt = optionBuilder.withLongName("output").withRequired(true)
                .withArgument(argumentBuilder.withName("output").withMinimum(1).withMaximum(1).create())
                .withDescription("The output directory Path").withShortName("o").create();
        DefaultOption categoriesOpt = optionBuilder.withLongName("categories")
                .withArgument(argumentBuilder.withName("categories").withMinimum(1).withMaximum(1).create())
                .withDescription("Location of the categories file.  One entry per line. Will be used to make a string match in Wikipedia Category field")
                .withShortName("c").create();
        DefaultOption exactMatchOpt = optionBuilder.withLongName("exactMatch")
                .withDescription("If set, then the category name must exactly match the entry in the categories file. Default is false")
                .withShortName("e").create();
        DefaultOption allOpt = optionBuilder.withLongName("all")
                .withDescription("If set, Select all files. Default is false").withShortName("all").create();
        DefaultOption helpOpt = optionBuilder.withLongName("help")
                .withDescription("Print out help").withShortName("h").create();

        Group options = groupBuilder.withName("Options")
                .withOption(categoriesOpt)
                .withOption(inputOpt)
                .withOption(outputOpt)
                .withOption(exactMatchOpt)
                .withOption(allOpt)
                .withOption(helpOpt)
                .create();

        Parser parser = new Parser();
        parser.setGroup(options);
        try {
            CommandLine cmdLine = parser.parse(args);
            if (cmdLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(options);
                return;
            }
            String inputPath = (String) cmdLine.getValue(inputOpt);
            String outputPath = (String) cmdLine.getValue(outputOpt);
            // An empty string means "no categories file supplied".
            String categoriesFile = cmdLine.hasOption(categoriesOpt) ? (String) cmdLine.getValue(categoriesOpt) : "";
            runJob(inputPath, outputPath, categoriesFile, cmdLine.hasOption(exactMatchOpt), cmdLine.hasOption(allOpt));
        } catch (OptionException e) {
            log.error("Exception", e);
            CommandLineUtil.printHelp(options);
        }
    }

    /**
     * Configures and runs the conversion job, blocking until {@link JobClient#runJob(JobConf)} returns.
     *
     * <p>Any existing content at {@code output} is deleted recursively before the job is submitted.
     * The categories file is read first, so a missing or unreadable file fails the call before
     * the previous output is removed.
     *
     * @param input path of the job input
     * @param output path of the job output; deleted recursively if it already exists
     * @param catFile local file with one category per line; {@code null} or empty for no categories
     * @param exactMatchOnly value stored in the job configuration under {@code exact.match.only}
     * @param all value stored in the job configuration under {@code all.files}
     * @throws IOException if the categories file, the file system or the job fails
     */
    public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all)
            throws IOException {
        JobConf conf = new JobConf(WikipediaToSequenceFile.class);
        if (log.isInfoEnabled()) {
            log.info("Input: " + input + " Out: " + output + " Categories: " + catFile + " All Files: " + all);
        }
        conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
        conf.set(XmlInputFormat.END_TAG_KEY, "</page>");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setBoolean("exact.match.only", exactMatchOnly);
        conf.setBoolean("all.files", all);
        FileInputFormat.setInputPaths(conf, new Path(input));
        Path outPath = new Path(output);
        FileOutputFormat.setOutputPath(conf, outPath);
        conf.setMapperClass(WikipediaMapper.class);
        conf.setInputFormat(XmlInputFormat.class);
        conf.setReducerClass(IdentityReducer.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        // Must be set before the DefaultStringifier below is created from this configuration.
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

        // Read the categories before touching the output so a bad file does not cost the old output.
        Set<String> categories = loadCategories(catFile);
        DefaultStringifier<Set<String>> stringifier =
                new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
        conf.set("wikipedia.categories", stringifier.toString(categories));

        FileSystem fs = FileSystem.get(outPath.toUri(), conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        // runJob is static, so no JobClient instance is needed.
        JobClient.runJob(conf);
    }

    /**
     * Reads the categories file into a set of trimmed, lower-cased entries.
     *
     * @param catFile local file with one category per line; {@code null} or empty yields an empty set
     * @return a mutable set of the normalised entries, never {@code null}
     * @throws IOException if the file cannot be opened
     */
    private static Set<String> loadCategories(String catFile) throws IOException {
        Set<String> categories = new HashSet<String>();
        if (catFile == null || catFile.length() == 0) {
            return categories;
        }
        for (String line : new FileLineIterable(new File(catFile))) {
            // Locale.ROOT keeps the result independent of the JVM default locale.
            // NOTE(review): whatever consumes "wikipedia.categories" should lower-case the same way - confirm.
            categories.add(line.trim().toLowerCase(Locale.ROOT));
        }
        return categories;
    }
}
