package org.apache.mahout.utils.vectors.tfidf;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
import org.apache.mahout.utils.vectors.text.term.TermDocumentCountMapper;
import org.apache.mahout.utils.vectors.text.term.TermDocumentCountReducer;

/* loaded from: input_file:org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.class */
/**
 * Static driver that chains the Hadoop jobs used to build the final document vectors from an input
 * folder of sequence files.
 *
 * <p>The pipeline implemented by {@link #processTfIdf} is:
 * <ol>
 *   <li>run a document-frequency counting job over the input;</li>
 *   <li>split the resulting (feature id, count) pairs into size-bounded dictionary chunks;</li>
 *   <li>run one partial-vector job per chunk, shipping the chunk through the distributed cache;</li>
 *   <li>merge the partial vectors (or simply rename the single result) into the output folder.</li>
 * </ol>
 *
 * <p>The weighting itself is delegated to {@code TFIDFPartialVectorReducer}, which is not part of
 * this file.
 */
public final class TFIDFConverter {
    /** Job configuration key under which the vector count is passed to the partial-vector job. */
    public static final String VECTOR_COUNT = "vector.count";
    /** Job configuration key under which the feature count is passed to the partial-vector job. */
    public static final String FEATURE_COUNT = "feature.count";
    /** Job configuration key under which the minimum document frequency is passed. */
    public static final String MIN_DF = "min.df";
    /** Job configuration key under which the maximum document frequency percentage is passed. */
    public static final String MAX_DF_PERCENTAGE = "max.df.percentage";
    /** Folder suffix exposed for callers; it is not referenced inside this class. */
    public static final String TFIDF_OUTPUT_FOLDER = "/tfidf";

    private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "/vectors";
    private static final String FREQUENCY_FILE = "/frequency.file-";
    private static final int MAX_CHUNKSIZE = 10000;
    private static final int MIN_CHUNKSIZE = 100;
    private static final String OUTPUT_FILES_PATTERN = "/part-*";
    private static final int SEQUENCEFILE_BYTE_OVERHEAD = 45;
    private static final String VECTOR_OUTPUT_FOLDER = "/partial-vectors-";
    private static final String WORDCOUNT_OUTPUT_FOLDER = "/df-count";

    /** Largest accepted value for the maximum document frequency percentage. */
    private static final int MAX_DF_PERCENT_LIMIT = 100;
    /** Value substituted when the requested maximum document frequency percentage is out of range. */
    private static final int DEFAULT_MAX_DF_PERCENT = 99;
    /** Bytes per megabyte, kept as a long so that chunk-size arithmetic never happens in int. */
    private static final long BYTES_PER_MEGABYTE = 1024L * 1024L;
    /** Serialization classes registered on every job started by this class. */
    private static final String IO_SERIALIZATIONS =
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization";

    private TFIDFConverter() {
    }

    /**
     * Runs the complete pipeline: document-frequency counting, dictionary chunking, one
     * partial-vector job per chunk and the final merge into {@code output + "/vectors"}.
     *
     * @param input folder of sequence files used as input of both the counting and the vector jobs
     * @param output base folder under which every intermediate and final result is written
     * @param chunkSizeInMegabytes size limit of one dictionary chunk; clamped to [100, 10000]
     * @param minDf minimum document frequency; values below 1 are replaced by 1
     * @param maxDFPercent maximum document frequency percentage; values outside [0, 100] are
     *        replaced by 99
     * @param normPower passed to the partial vector merger; must be -1 or non-negative
     * @param sequentialAccessOutput forwarded to the jobs under
     *        {@code PartialVectorMerger.SEQUENTIAL_ACCESS} and to the merger
     * @throws IllegalArgumentException if {@code normPower} is negative and not -1
     * @throws IOException if a file system operation or one of the jobs fails
     */
    public static void processTfIdf(String input, String output, int chunkSizeInMegabytes, int minDf,
            int maxDFPercent, float normPower, boolean sequentialAccessOutput) throws IOException {
        if (normPower != -1.0f && normPower < 0.0f) {
            throw new IllegalArgumentException("normPower must either be -1 or >= 0");
        }
        chunkSizeInMegabytes = Math.min(MAX_CHUNKSIZE, Math.max(MIN_CHUNKSIZE, chunkSizeInMegabytes));
        if (minDf < 1) {
            minDf = 1;
        }
        if (maxDFPercent < 0 || maxDFPercent > MAX_DF_PERCENT_LIMIT) {
            maxDFPercent = DEFAULT_MAX_DF_PERCENT;
        }

        Path inputPath = new Path(input);
        Path wordCountPath = new Path(output + WORDCOUNT_OUTPUT_FOLDER);
        startDFCounting(inputPath, wordCountPath);

        Pair<Long[], List<Path>> datasetFeatures =
            createDictionaryChunks(wordCountPath, output, chunkSizeInMegabytes);
        Long[] counts = datasetFeatures.getFirst();
        long featureCount = counts[0];
        long vectorCount = counts[1];
        List<Path> dictionaryChunks = datasetFeatures.getSecond();

        // One partial-vector job per dictionary chunk, each writing to its own numbered folder.
        List<Path> partialVectorPaths = new ArrayList<>(dictionaryChunks.size());
        int partialVectorIndex = 0;
        for (Path dictionaryChunk : dictionaryChunks) {
            Path partialVectorOutputPath = getPath(output + VECTOR_OUTPUT_FOLDER, partialVectorIndex);
            partialVectorIndex++;
            partialVectorPaths.add(partialVectorOutputPath);
            makePartialVectors(input, featureCount, vectorCount, minDf, maxDFPercent, dictionaryChunk,
                partialVectorOutputPath, sequentialAccessOutput);
        }

        FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), new Configuration());
        String outputDir = output + DOCUMENT_VECTOR_OUTPUT_FOLDER;
        if (dictionaryChunks.size() > 1) {
            PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, normPower,
                (int) featureCount, sequentialAccessOutput);
            HadoopUtil.deletePaths(partialVectorPaths, fs);
        } else {
            // A single chunk needs no merge: its partial vectors simply become the final output.
            Path onlyPartialVectorPath = partialVectorPaths.get(0);
            HadoopUtil.deletePath(outputDir, fs);
            HadoopUtil.rename(onlyPartialVectorPath, new Path(outputDir), fs);
        }
    }

    /**
     * Copies the (feature id, count) pairs found in the {@code part-*} files of
     * {@code featureCountPath} into a sequence of chunk files named
     * {@code dictionaryPathBase + "/frequency.file-" + index}, starting a new chunk whenever the
     * estimated size of the current one exceeds the limit.
     *
     * <p>Only entries with a non-negative key are copied. The value stored under key -1 is returned
     * as the vector count ({@code Long.MAX_VALUE} when no such entry is seen), and the feature count
     * is the largest key encountered plus one.
     *
     * @param featureCountPath output folder of the document-frequency counting job
     * @param dictionaryPathBase base folder under which the chunk files are created
     * @param chunkSizeInMegabytes size limit of a single chunk
     * @return a pair holding {@code {featureCount, vectorCount}} and the list of chunk paths; the
     *         list always contains at least one path
     * @throws IOException if reading the counts or writing a chunk fails
     */
    private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath,
            String dictionaryPathBase, int chunkSizeInMegabytes) throws IOException {
        List<Path> chunkPaths = new ArrayList<>();
        IntWritable key = new IntWritable();
        LongWritable value = new LongWritable();
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
        FileStatus[] outputFiles =
            fs.globStatus(new Path(featureCountPath.toString() + OUTPUT_FILES_PATTERN));

        // Multiply in long arithmetic: the byte count of a chunk of 2048 MB or more does not fit in
        // an int, and chunk sizes up to MAX_CHUNKSIZE megabytes are accepted.
        long chunkSizeLimit = chunkSizeInMegabytes * BYTES_PER_MEGABYTE;
        // Estimated bytes consumed per record: fixed overhead plus an int key and a long value.
        int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;

        int chunkIndex = 0;
        Path chunkPath = getPath(dictionaryPathBase + FREQUENCY_FILE, chunkIndex);
        chunkPaths.add(chunkPath);

        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        SequenceFile.Writer writer =
            new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
        try {
            for (FileStatus outputFile : outputFiles) {
                SequenceFile.Reader reader = new SequenceFile.Reader(fs, outputFile.getPath(), conf);
                try {
                    while (reader.next(key, value)) {
                        if (currentChunkSize > chunkSizeLimit) {
                            // Roll over to the next chunk file. The reference is cleared first so
                            // that the finally block never closes the same writer twice.
                            writer.close();
                            writer = null;
                            chunkIndex++;
                            chunkPath = getPath(dictionaryPathBase + FREQUENCY_FILE, chunkIndex);
                            chunkPaths.add(chunkPath);
                            writer = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                                LongWritable.class);
                            currentChunkSize = 0;
                        }
                        currentChunkSize += fieldSize;
                        int featureId = key.get();
                        if (featureId >= 0) {
                            writer.append(key, value);
                        } else if (featureId == -1) {
                            vectorCount = value.get();
                        }
                        featureCount = Math.max(featureId, featureCount);
                    }
                } finally {
                    reader.close();
                }
            }
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
        // Feature ids are zero-based, so the feature count is the largest id plus one.
        featureCount++;
        return new Pair<>(new Long[]{Long.valueOf(featureCount), Long.valueOf(vectorCount)}, chunkPaths);
    }

    /**
     * Builds a path by appending an index to a base path string.
     *
     * @param basePath string the index is appended to
     * @param index number appended directly after {@code basePath}
     * @return the path for {@code basePath + index}
     */
    public static Path getPath(String basePath, int index) {
        return new Path(basePath + index);
    }

    /**
     * Configures and runs the job that builds the partial vectors for one dictionary chunk. The
     * chunk is handed to the job through the distributed cache, and any existing content of
     * {@code output} is deleted before the job starts.
     *
     * @param input folder of sequence files used as job input
     * @param featureCount value stored under {@link #FEATURE_COUNT}
     * @param vectorCount value stored under {@link #VECTOR_COUNT}
     * @param minDf value stored under {@link #MIN_DF}
     * @param maxDFPercent value stored under {@link #MAX_DF_PERCENTAGE}
     * @param dictionaryFilePath dictionary chunk registered as the job's cache file
     * @param output folder receiving the partial vectors
     * @param sequentialAccess value stored under {@code PartialVectorMerger.SEQUENTIAL_ACCESS}
     * @throws IOException if the output cannot be cleared or the job fails
     */
    private static void makePartialVectors(String input, long featureCount, long vectorCount, int minDf,
            int maxDFPercent, Path dictionaryFilePath, Path output, boolean sequentialAccess)
            throws IOException {
        JobConf conf = new JobConf(TFIDFConverter.class);
        conf.set("io.serializations", IO_SERIALIZATIONS);
        conf.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath.toString());

        conf.setLong(FEATURE_COUNT, featureCount);
        conf.setLong(VECTOR_COUNT, vectorCount);
        conf.setInt(MIN_DF, minDf);
        conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
        conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(VectorWritable.class);
        DistributedCache.setCacheFiles(new URI[]{dictionaryFilePath.toUri()}, conf);
        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, output);

        conf.setMapperClass(IdentityMapper.class);
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setReducerClass(TFIDFPartialVectorReducer.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);

        runJobReplacingOutput(conf, output);
    }

    /**
     * Configures and runs the document-frequency counting job over {@code input}, writing
     * (IntWritable, LongWritable) pairs to {@code output}. Any existing content of {@code output}
     * is deleted before the job starts.
     *
     * @param input folder of sequence files used as job input
     * @param output folder receiving the counts
     * @throws IOException if the output cannot be cleared or the job fails
     */
    private static void startDFCounting(Path input, Path output) throws IOException {
        JobConf conf = new JobConf(TFIDFConverter.class);
        conf.set("io.serializations", IO_SERIALIZATIONS);
        conf.setJobName("VectorTfIdf Document Frequency Count running over input: " + input.toString());

        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(conf, input);
        FileOutputFormat.setOutputPath(conf, output);

        conf.setMapperClass(TermDocumentCountMapper.class);
        conf.setInputFormat(SequenceFileInputFormat.class);
        // The reducer class is also installed as the combiner.
        conf.setCombinerClass(TermDocumentCountReducer.class);
        conf.setReducerClass(TermDocumentCountReducer.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);

        runJobReplacingOutput(conf, output);
    }

    /**
     * Recursively deletes {@code output} when it already exists, then runs the job described by
     * {@code conf} and waits for it through {@code JobClient.runJob}.
     *
     * @param conf fully configured job
     * @param output output folder of the job, cleared beforehand
     * @throws IOException if the folder cannot be deleted or the job fails
     */
    private static void runJobReplacingOutput(JobConf conf, Path output) throws IOException {
        FileSystem fs = FileSystem.get(output.toUri(), conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }
        JobClient client = new JobClient();
        client.setConf(conf);
        JobClient.runJob(conf);
    }
}
