package org.apache.mahout.utils.vectors.text;

import java.io.IOException;
import java.nio.charset.Charset;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.utils.vectors.text.document.SequenceFileTokenizerMapper;

/* loaded from: input_file:org/apache/mahout/utils/vectors/text/DocumentProcessor.class */
/**
 * Static helper that runs a map-only Hadoop job which passes every document of a
 * SequenceFile input through {@link SequenceFileTokenizerMapper} and writes the result
 * as a SequenceFile of {@link Text} keys and {@link StringTuple} values.
 *
 * <p>The class is a non-instantiable holder of constants and one static entry point.
 */
public final class DocumentProcessor {
    /**
     * Folder name for tokenized documents. It is not used inside this class;
     * presumably callers append it to their own output base path (confirm against callers).
     */
    public static final String TOKENIZED_DOCUMENT_OUTPUT_FOLDER = "/tokenized-documents";

    /** Job configuration key under which the analyzer class name is stored. */
    public static final String ANALYZER_CLASS = "analyzer.class";

    /** The UTF-8 charset. */
    public static final Charset CHARSET = Charset.forName("UTF-8");

    private DocumentProcessor() {
    }

    /**
     * Configures and runs the document tokenization job.
     *
     * <p>If the output path already exists it is deleted recursively before the job is
     * started.
     *
     * @param str  input path; read with {@link SequenceFileInputFormat}
     * @param cls  analyzer class whose name is published to the job under {@link #ANALYZER_CLASS}
     * @param str2 output path; written with {@link SequenceFileOutputFormat}
     * @throws IOException if the file system access or the job run fails
     */
    public static void tokenizeDocuments(String str, Class<? extends Analyzer> cls, String str2) throws IOException {
        JobConf jobConf = new JobConf(DocumentProcessor.class);
        // Register both Java and Writable serialization for the job.
        jobConf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
        jobConf.set(ANALYZER_CLASS, cls.getName());
        jobConf.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + str);

        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(StringTuple.class);

        FileInputFormat.setInputPaths(jobConf, new Path(str));
        Path outputPath = new Path(str2);
        FileOutputFormat.setOutputPath(jobConf, outputPath);

        jobConf.setMapperClass(SequenceFileTokenizerMapper.class);
        jobConf.setInputFormat(SequenceFileInputFormat.class);
        // Map-only job: no reduce tasks are scheduled.
        jobConf.setNumReduceTasks(0);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);

        // Clear any previous output so the job starts with a non-existent output path.
        FileSystem fileSystem = FileSystem.get(outputPath.toUri(), jobConf);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        // runJob is static and takes the configuration directly; no JobClient
        // instance needs to be created or configured by this method.
        JobClient.runJob(jobConf);
    }
}
