package org.apache.mahout.utils.clustering;

import java.io.File;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.nlp.collocations.llr.CollocDriver;
import org.apache.mahout.utils.vectors.VectorHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/mahout/utils/clustering/ClusterDumper.class */
/**
 * Reads cluster sequence files from a local directory and writes a textual summary of every
 * cluster (id, center name, center vector, and optionally the top weighted terms and the
 * points assigned to the cluster) to a file or to standard output.
 *
 * <p>Can be used programmatically via {@link #printClusters()} or from the command line via
 * {@link #main(String[])}.
 */
public final class ClusterDumper {
    private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class);

    /** Accepts every directory entry except the ".crc" side files found next to sequence files. */
    private static final FilenameFilter NON_CRC_FILTER = new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return !name.endsWith(".crc");
        }
    };

    private final String seqFileDir;
    private final String pointsDir;
    private String termDictionary;
    private String dictionaryFormat;
    private String outputFile;
    // Maximum number of characters of the formatted center vector to print; a value <= 0
    // suppresses the vector text entirely (see writeCluster).
    private int subString = Integer.MAX_VALUE;
    private int numTopFeatures = 10;
    private Map<String, List<String>> clusterIdToPoints = null;
    private boolean useJSON = false;

    /** Pairs a vector index with the weight stored at that index. */
    public static class TermIndexWeight {
        int index;
        double weight;

        TermIndexWeight(int i, double d) {
            this.index = i;
            this.weight = d;
        }
    }

    /**
     * Creates a dumper and eagerly loads the cluster-to-points mapping when a points directory
     * is given.
     *
     * @param seqFileDir local directory containing the cluster sequence files
     * @param pointsDir  local directory containing the points sequence files, or {@code null}
     *                   to skip printing points
     * @throws IOException if the points directory cannot be listed or a points file cannot be read
     */
    public ClusterDumper(String seqFileDir, String pointsDir) throws IOException {
        this.seqFileDir = seqFileDir;
        this.pointsDir = pointsDir;
        init();
    }

    /** Populates {@link #clusterIdToPoints}: empty when no points directory was configured. */
    private void init() throws IOException {
        if (this.pointsDir == null) {
            this.clusterIdToPoints = Collections.emptyMap();
        } else {
            this.clusterIdToPoints = readPoints(this.pointsDir, new JobConf(Job.class));
        }
    }

    /**
     * Dumps every cluster found in the regular, non-".crc" files of the sequence file directory.
     *
     * <p>Output goes to the configured output file, or to standard output when none is set.
     * Readers and the output file are closed even when dumping fails part-way.
     *
     * @throws IOException              if the input directory cannot be listed, or reading or
     *                                  writing fails
     * @throws InstantiationException   if a key or value class of a sequence file cannot be
     *                                  instantiated
     * @throws IllegalAccessException   if a key or value class of a sequence file is not accessible
     * @throws IllegalArgumentException if a dictionary is set with a format other than
     *                                  "text" or "sequencefile"
     */
    public void printClusters() throws IOException, InstantiationException, IllegalAccessException {
        // NOTE(review): the JobClient is never used after setConf(); it is kept only so that any
        // side effects of constructing it are preserved - confirm whether it can be removed.
        JobClient jobClient = new JobClient();
        JobConf jobConf = new JobConf(Job.class);
        jobClient.setConf(jobConf);

        String[] dictionary = loadDictionary(jobConf);
        // List the inputs before opening the output so a bad input directory does not leave an
        // empty output file (or an unclosed writer) behind.
        File[] inputFiles = listDataFiles(this.seqFileDir);

        boolean writingToFile = this.outputFile != null;
        OutputStreamWriter writer = writingToFile ? new FileWriter(this.outputFile) : new OutputStreamWriter(System.out);
        try {
            for (File inputFile : inputFiles) {
                if (inputFile.isFile()) {
                    dumpFile(inputFile, jobConf, dictionary, writer);
                }
            }
        } finally {
            if (writingToFile) {
                writer.close(); // close() flushes first
            } else {
                writer.flush(); // never close System.out
            }
        }
    }

    /**
     * Loads the term dictionary according to the configured format.
     *
     * @return the dictionary, or {@code null} when no dictionary path was configured
     */
    private String[] loadDictionary(JobConf jobConf) throws IOException {
        if (this.termDictionary == null) {
            return null;
        }
        // Constant-first equals: a null format is reported as invalid instead of causing an NPE.
        if ("text".equals(this.dictionaryFormat)) {
            return VectorHelper.loadTermDictionary(new File(this.termDictionary));
        }
        if ("sequencefile".equals(this.dictionaryFormat)) {
            FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(), jobConf);
            return VectorHelper.loadTermDictionary(jobConf, fs, this.termDictionary);
        }
        throw new IllegalArgumentException("Invalid dictionary format: " + this.dictionaryFormat);
    }

    /** Writes every cluster stored in one sequence file, closing the reader when done. */
    private void dumpFile(File inputFile, JobConf jobConf, String[] dictionary, OutputStreamWriter writer)
            throws IOException, InstantiationException, IllegalAccessException {
        Path path = new Path(inputFile.getAbsolutePath());
        System.out.println("Input Path: " + path);
        SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(path.toUri(), jobConf), path, jobConf);
        try {
            Writable key = (Writable) reader.getKeyClass().newInstance();
            ClusterBase cluster = (ClusterBase) reader.getValueClass().newInstance();
            while (reader.next(key, cluster)) {
                writeCluster(cluster, dictionary, writer);
            }
        } finally {
            reader.close();
        }
    }

    /** Writes the header line, optional top terms and optional points of a single cluster. */
    private void writeCluster(ClusterBase cluster, String[] dictionary, OutputStreamWriter writer) throws IOException {
        Vector center = cluster.getCenter();
        String centerText = this.useJSON ? center.asFormatString() : VectorHelper.vectorToString(center, dictionary);
        String clusterId = String.valueOf(cluster.getId());

        writer.append("Id: ").append(clusterId).append(":");
        writer.append("name:").append(center.getName());
        if (this.subString > 0) {
            int end = Math.min(this.subString, centerText.length());
            writer.append(":").append(centerText.substring(0, end));
        }
        writer.append('\n');

        if (dictionary != null) {
            String topFeatures = getTopFeatures(center, dictionary, this.numTopFeatures);
            writer.write("\tTop Terms: ");
            writer.write(topFeatures);
            writer.write('\n');
        }

        List<String> points = this.clusterIdToPoints.get(clusterId);
        if (points != null) {
            writer.write("\tPoints: ");
            Iterator<String> it = points.iterator();
            while (it.hasNext()) {
                writer.append(it.next());
                if (it.hasNext()) {
                    writer.append(", ");
                }
            }
            writer.write('\n');
        }
        // Flush per cluster so console output appears incrementally.
        writer.flush();
    }

    /** @return the output file path, or {@code null} when output goes to standard output */
    public String getOutputFile() {
        return this.outputFile;
    }

    /** @param str path of the file to write to; {@code null} selects standard output */
    public void setOutputFile(String str) {
        this.outputFile = str;
    }

    /** @return the maximum number of characters of the center vector text that is printed */
    public int getSubString() {
        return this.subString;
    }

    /**
     * @param i maximum number of characters of the center vector text to print; a value
     *          {@code <= 0} omits the vector text
     */
    public void setSubString(int i) {
        this.subString = i;
    }

    /** @return the mapping from cluster id to the names of the points assigned to it */
    public Map<String, List<String>> getClusterIdToPoints() {
        return this.clusterIdToPoints;
    }

    /** @return the dictionary path, or {@code null} when none is configured */
    public String getTermDictionary() {
        return this.termDictionary;
    }

    /**
     * @param str  path of the term dictionary
     * @param str2 dictionary format; {@link #printClusters()} accepts "text" or "sequencefile"
     */
    public void setTermDictionary(String str, String str2) {
        this.termDictionary = str;
        this.dictionaryFormat = str2;
    }

    /** @param i maximum number of top terms printed per cluster */
    public void setNumTopFeatures(int i) {
        this.numTopFeatures = i;
    }

    /** @return the maximum number of top terms printed per cluster */
    public int getNumTopFeatures() {
        return this.numTopFeatures;
    }

    /**
     * Command-line entry point. Prints help when "help" is given or the arguments cannot be
     * parsed; does nothing when "seqFileDir" is absent; otherwise configures a dumper from the
     * options and prints the clusters.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) throws IOException, IllegalAccessException, InstantiationException {
        // The builders are reused for every option, so the options are created in a fixed order.
        DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
        ArgumentBuilder argumentBuilder = new ArgumentBuilder();
        GroupBuilder groupBuilder = new GroupBuilder();

        DefaultOption seqFileDirOpt = optionBuilder.withLongName("seqFileDir").withRequired(false).withArgument(argumentBuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create()).withDescription("The directory containing Sequence Files for the Clusters").withShortName("s").create();
        // NOTE(review): the option name is taken from CollocDriver.DEFAULT_OUTPUT_DIRECTORY,
        // presumably the literal "output" - confirm.
        DefaultOption outputOpt = optionBuilder.withLongName(CollocDriver.DEFAULT_OUTPUT_DIRECTORY).withRequired(false).withArgument(argumentBuilder.withName(CollocDriver.DEFAULT_OUTPUT_DIRECTORY).withMinimum(1).withMaximum(1).create()).withDescription("The output file.  If not specified, dumps to the console").withShortName("o").create();
        DefaultOption substringOpt = optionBuilder.withLongName("substring").withRequired(false).withArgument(argumentBuilder.withName("substring").withMinimum(1).withMaximum(1).create()).withDescription("The number of chars of the asFormatString() to print").withShortName("b").create();
        DefaultOption numWordsOpt = optionBuilder.withLongName("numWords").withRequired(false).withArgument(argumentBuilder.withName("numWords").withMinimum(1).withMaximum(1).create()).withDescription("The number of top terms to print").withShortName("n").create();
        DefaultOption jsonOpt = optionBuilder.withLongName("json").withRequired(false).withDescription("Output the centroid as JSON.  Otherwise it substitues in the terms for vector cell entries").withShortName("j").create();
        DefaultOption pointsDirOpt = optionBuilder.withLongName("pointsDir").withRequired(false).withArgument(argumentBuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).withDescription("The directory containing points sequence files mapping input vectors to their cluster.  If specified, then the program will output the points associated with a cluster").withShortName("p").create();
        DefaultOption dictionaryOpt = optionBuilder.withLongName("dictionary").withRequired(false).withArgument(argumentBuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription("The dictionary file. ").withShortName("d").create();
        DefaultOption dictionaryTypeOpt = optionBuilder.withLongName("dictionaryType").withRequired(false).withArgument(argumentBuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription("The dictionary file type (text|sequencefile)").withShortName("dt").create();
        DefaultOption helpOpt = optionBuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();

        Group options = groupBuilder.withName("Options").withOption(helpOpt).withOption(seqFileDirOpt).withOption(outputOpt).withOption(substringOpt).withOption(pointsDirOpt).withOption(jsonOpt).withOption(dictionaryOpt).withOption(dictionaryTypeOpt).withOption(numWordsOpt).create();

        try {
            Parser parser = new Parser();
            parser.setGroup(options);
            CommandLine commandLine = parser.parse(strArr);

            if (commandLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(options);
                return;
            }
            if (!commandLine.hasOption(seqFileDirOpt)) {
                // Without an input directory there is nothing to dump.
                return;
            }

            String seqFileDir = commandLine.getValue(seqFileDirOpt).toString();
            String dictionaryPath = valueOrNull(commandLine, dictionaryOpt);
            String pointsDir = valueOrNull(commandLine, pointsDirOpt);
            String outputPath = valueOrNull(commandLine, outputOpt);
            // -1 marks "not given"; parsed before the dumper is built so a malformed number
            // fails before any points are read.
            int substringLength = -1;
            if (commandLine.hasOption(substringOpt)) {
                substringLength = Integer.parseInt(commandLine.getValue(substringOpt).toString());
            }

            ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir);
            if (commandLine.hasOption(jsonOpt)) {
                clusterDumper.setUseJSON(true);
            }
            if (outputPath != null) {
                clusterDumper.setOutputFile(outputPath);
            }
            String dictionaryType = commandLine.hasOption(dictionaryTypeOpt) ? commandLine.getValue(dictionaryTypeOpt).toString() : "text";
            if (dictionaryPath != null) {
                clusterDumper.setTermDictionary(dictionaryPath, dictionaryType);
            }
            if (commandLine.hasOption(numWordsOpt)) {
                clusterDumper.setNumTopFeatures(Integer.parseInt(commandLine.getValue(numWordsOpt).toString()));
            }
            if (substringLength >= 0) {
                clusterDumper.setSubString(substringLength);
            }
            clusterDumper.printClusters();
        } catch (OptionException e) {
            log.error("Exception", e);
            CommandLineUtil.printHelp(options);
        }
    }

    /** Returns the string value of {@code option}, or {@code null} when it was not supplied. */
    private static String valueOrNull(CommandLine commandLine, DefaultOption option) {
        return commandLine.hasOption(option) ? commandLine.getValue(option).toString() : null;
    }

    private void setUseJSON(boolean z) {
        this.useJSON = z;
    }

    /**
     * Lists the entries of {@code directory}, excluding ".crc" files.
     *
     * @throws IOException if the path is not a directory or cannot be listed
     *                     ({@link File#listFiles(FilenameFilter)} returns {@code null} then)
     */
    private static File[] listDataFiles(String directory) throws IOException {
        File[] files = new File(directory).listFiles(NON_CRC_FILTER);
        if (files == null) {
            throw new IOException("Cannot list files in directory: " + directory);
        }
        return files;
    }

    /**
     * Reads every regular, non-".crc" file in {@code pointsDir} as a Text/Text sequence file
     * and groups the keys (point names) by their value (cluster id).
     *
     * <p>If the key or value class of a file cannot be instantiated, the error is logged and
     * that file is skipped.
     *
     * @return a sorted map from cluster id to the point names assigned to it
     * @throws IOException if the directory cannot be listed or a file cannot be read
     */
    private static Map<String, List<String>> readPoints(String pointsDir, JobConf jobConf) throws IOException {
        Map<String, List<String>> pointsByCluster = new TreeMap<String, List<String>>();
        for (File file : listDataFiles(pointsDir)) {
            if (!file.isFile()) {
                continue;
            }
            Path path = new Path(file.getAbsolutePath());
            SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(path.toUri(), jobConf), path, jobConf);
            try {
                Text pointName = (Text) reader.getKeyClass().newInstance();
                Text clusterIdText = (Text) reader.getValueClass().newInstance();
                while (reader.next(pointName, clusterIdText)) {
                    String clusterId = clusterIdText.toString();
                    List<String> points = pointsByCluster.get(clusterId);
                    if (points == null) {
                        points = new ArrayList<String>();
                        pointsByCluster.put(clusterId, points);
                    }
                    points.add(pointName.toString());
                }
            } catch (IllegalAccessException e) {
                log.error("Exception", e);
            } catch (InstantiationException e) {
                log.error("Exception", e);
            } finally {
                // Release the file handle whether or not the file could be read.
                reader.close();
            }
        }
        return pointsByCluster;
    }

    /**
     * Formats the highest-weighted non-zero entries of {@code vector} as one
     * "term => weight" line each, heaviest first.
     *
     * <p>At most {@code numTerms} entries are considered. An entry whose index has no
     * dictionary term (null entry or index beyond the dictionary) is logged and skipped, so
     * fewer than {@code numTerms} lines may be produced.
     */
    private static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) {
        List<TermIndexWeight> entries = new ArrayList<TermIndexWeight>();
        Iterator<Vector.Element> nonZero = vector.iterateNonZero();
        while (nonZero.hasNext()) {
            Vector.Element element = nonZero.next();
            entries.add(new TermIndexWeight(element.index(), element.get()));
        }
        Collections.sort(entries, new Comparator<TermIndexWeight>() {
            @Override
            public int compare(TermIndexWeight left, TermIndexWeight right) {
                // Arguments swapped on purpose: descending by weight.
                return Double.compare(right.weight, left.weight);
            }
        });

        StringBuilder sb = new StringBuilder();
        int limit = Math.min(entries.size(), numTerms);
        for (int rank = 0; rank < limit; rank++) {
            TermIndexWeight entry = entries.get(rank);
            String term = entry.index < dictionary.length ? dictionary[entry.index] : null;
            if (term == null) {
                log.error("Dictionary entry missing for {}", Integer.valueOf(entry.index));
                continue;
            }
            sb.append("\n\t\t");
            sb.append(StringUtils.rightPad(term, 40));
            sb.append("=>");
            sb.append(StringUtils.leftPad(Double.toString(entry.weight), 20));
        }
        return sb.toString();
    }
}
