package org.apache.mahout.utils.vectors.lucene;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.OpenBitSet;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.math.stats.LogLikelihood;
import org.apache.mahout.utils.clustering.ClusterDumper;
import org.apache.mahout.utils.nlp.collocations.llr.CollocDriver;
import org.apache.mahout.utils.vectors.TermEntry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/mahout/utils/vectors/lucene/ClusterLabels.class */
public class ClusterLabels {
    /** Logger used for progress and error reporting by this class. */
    private static final Logger log = LoggerFactory.getLogger(ClusterLabels.class);
    /** Default minimum cluster size; see {@link #minNumIds}. */
    public static final int DEFAULT_MIN_IDS = 50;
    /** Default cap on the number of labels reported per cluster; see {@link #maxLabels}. */
    public static final int DEFAULT_MAX_LABELS = 25;
    /** Directory holding the cluster sequence files; handed to the ClusterDumper. */
    private final String seqFileDir;
    /** Directory holding the points sequence files; handed to the ClusterDumper. */
    private final String pointsDir;
    /** Path of the Lucene index directory that is opened for every processed cluster. */
    private final String indexDir;
    /** Name of the index field whose terms are scored as candidate labels. */
    private final String contentField;
    /**
     * Name of the index field holding the document id. When null, the Lucene
     * internal document number (as a decimal string) is matched against the
     * cluster's point ids instead.
     */
    private String idField;
    /** Cluster id mapped to the ids of the points in that cluster; filled in by init(). */
    private Map<String, List<String>> clusterIdToPoints = null;
    /** Path of the output file; when null the labels are written to System.out. */
    private String output;
    /** Clusters with fewer points than this are skipped. */
    private int minNumIds;
    /** Maximum number of labels returned for a cluster. */
    private int maxLabels;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/apache/mahout/utils/vectors/lucene/ClusterLabels$TermInfoClusterInOut.class */
    /**
     * Document-frequency statistics of one term, split into the documents that
     * belong to a cluster and those that do not, together with the term's
     * log-likelihood-ratio score.
     *
     * <p>The natural ordering puts higher scores first; equal scores are
     * ordered by the term text.
     */
    public class TermInfoClusterInOut implements Comparable<TermInfoClusterInOut> {
        /** The term text. */
        final String term;
        /** Number of in-cluster documents containing the term. */
        final int inClusterDF;
        /** Number of out-of-cluster documents containing the term. */
        final int outClusterDF;
        /** Score assigned by the caller after construction. */
        double logLikelihoodRatio;

        TermInfoClusterInOut(String term, int inClusterDF, int outClusterDF) {
            this.term = term;
            this.inClusterDF = inClusterDF;
            this.outClusterDF = outClusterDF;
        }

        /**
         * Orders by descending log-likelihood ratio, then by ascending term.
         */
        @Override
        public int compareTo(TermInfoClusterInOut other) {
            // Arguments are swapped so that the larger score sorts first.
            int byScore = Double.compare(other.logLikelihoodRatio, logLikelihoodRatio);
            if (byScore != 0) {
                return byScore;
            }
            return term.compareTo(other.term);
        }

        /**
         * @return in-cluster document frequency minus out-of-cluster document frequency
         */
        public int getInClusterDiff() {
            int difference = inClusterDF - outClusterDF;
            return difference;
        }
    }

    /**
     * Creates a label extractor and immediately loads the cluster-to-points
     * mapping from the given sequence file directories.
     *
     * @param seqFileDir   directory with the cluster sequence files
     * @param pointsDir    directory with the points sequence files
     * @param indexDir     Lucene index directory
     * @param contentField index field whose terms are scored as labels
     * @param minNumIds    clusters with fewer points than this are skipped
     * @param maxLabels    maximum number of labels returned per cluster
     * @throws IOException if loading the cluster-to-points mapping fails
     */
    public ClusterLabels(String seqFileDir, String pointsDir, String indexDir, String contentField,
                         int minNumIds, int maxLabels) throws IOException {
        this.seqFileDir = seqFileDir;
        this.pointsDir = pointsDir;
        this.indexDir = indexDir;
        this.contentField = contentField;
        this.minNumIds = minNumIds;
        this.maxLabels = maxLabels;
        init();
    }

    /**
     * Loads the cluster id to point ids mapping through a ClusterDumper built
     * over the configured sequence file and points directories.
     *
     * @throws IOException propagated from the ClusterDumper
     */
    private void init() throws IOException {
        ClusterDumper dumper = new ClusterDumper(seqFileDir, pointsDir);
        clusterIdToPoints = dumper.getClusterIdToPoints();
    }

    /**
     * Computes the top labels of every loaded cluster and writes them as a
     * tab-separated report.
     *
     * <p>The report goes to the file named by {@link #setOutput(String)} or,
     * when no output is set, to {@code System.out}. Clusters for which
     * {@link #getClusterLabels(String, List)} returns {@code null} (too small)
     * are left out of the report.
     *
     * @throws IOException if the index cannot be read or the report cannot be written
     */
    public void getLabels() throws IOException {
        // Decide once whether we own the writer; only a file writer is closed,
        // System.out is merely flushed so the console stays usable afterwards.
        boolean writeToFile = this.output != null;
        OutputStreamWriter writer = writeToFile ? new FileWriter(this.output) : new OutputStreamWriter(System.out);
        try {
            for (Map.Entry<String, List<String>> entry : this.clusterIdToPoints.entrySet()) {
                String clusterId = entry.getKey();
                List<String> points = entry.getValue();
                List<TermInfoClusterInOut> labels = getClusterLabels(clusterId, points);
                if (labels == null) {
                    continue;
                }
                writer.write('\n');
                writer.write("Top labels for Cluster " + clusterId + " containing " + points.size() + " vectors");
                writer.write('\n');
                writer.write("Term \t\t LLR \t\t In-ClusterDF \t\t Out-ClusterDF ");
                writer.write('\n');
                for (TermInfoClusterInOut label : labels) {
                    writer.write(label.term + "\t\t" + label.logLikelihoodRatio + "\t\t" + label.inClusterDF + "\t\t" + label.outClusterDF);
                    writer.write('\n');
                }
            }
            writer.flush();
        } finally {
            // Release the file handle even when label computation or writing fails.
            if (writeToFile) {
                writer.close();
            }
        }
    }

    /**
     * Scores every term of the content field for one cluster and returns the
     * best-scoring terms.
     *
     * <p>For each term the in-cluster document frequency is counted by
     * intersecting the term's postings with the cluster's documents; the
     * out-of-cluster frequency is the term's index-wide document frequency
     * minus that count. Terms are ranked by log-likelihood ratio (highest
     * first, ties broken by term text).
     *
     * @param clusterId id of the cluster, used for logging only
     * @param ids       ids of the points (documents) in the cluster
     * @return at most {@code maxLabels} term infos in rank order, or
     *         {@code null} when the cluster has fewer than {@code minNumIds} points
     * @throws IOException if the Lucene index cannot be opened or read
     */
    protected List<TermInfoClusterInOut> getClusterLabels(String clusterId, List<String> ids) throws IOException {
        int clusterSize = ids.size();
        if (clusterSize < this.minNumIds) {
            log.info("Skipping small cluster {} with size: {}", clusterId, Integer.valueOf(clusterSize));
            return null;
        }
        log.info("Processing Cluster {} with {} documents", clusterId, Integer.valueOf(clusterSize));
        IndexReader reader = IndexReader.open(FSDirectory.open(new File(this.indexDir)), false);
        try {
            int numDocs = reader.numDocs();
            log.info("# of documents in the index {}", Integer.valueOf(numDocs));
            Set<String> idSet = new HashSet<String>(ids);
            OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
            log.info("Populating term infos from the index");

            List<TermInfoClusterInOut> termInfos = new LinkedList<TermInfoClusterInOut>();
            // Position the enumeration on the first term of the content field.
            TermEnum terms = reader.terms(new Term(this.contentField, ""));
            try {
                do {
                    Term term = terms.term();
                    // The enumeration continues into other fields; stop at the field boundary.
                    if (term == null || !term.field().equals(this.contentField)) {
                        break;
                    }
                    OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
                    TermDocs termDocs = reader.termDocs(term);
                    try {
                        while (termDocs.next()) {
                            termBitset.set(termDocs.doc());
                        }
                    } finally {
                        termDocs.close();
                    }
                    // Keep only the postings that fall inside the cluster.
                    termBitset.and(clusterDocBitset);
                    int inClusterDF = (int) termBitset.cardinality();
                    // The enumeration already knows the index-wide document frequency of
                    // the current term, so no second per-term enumeration is opened.
                    int outClusterDF = terms.docFreq() - inClusterDF;
                    TermInfoClusterInOut termInfo = new TermInfoClusterInOut(term.text(), inClusterDF, outClusterDF);
                    termInfo.logLikelihoodRatio = scoreDocumentFrequencies(inClusterDF, outClusterDF, clusterSize, numDocs);
                    termInfos.add(termInfo);
                } while (terms.next());
            } finally {
                terms.close();
            }

            Collections.sort(termInfos);
            return termInfos.subList(0, Math.min(termInfos.size(), this.maxLabels));
        } finally {
            // Always release the index, including when reading fails part-way.
            reader.close();
        }
    }

    /**
     * Builds a bitset with one bit set for every live index document whose id
     * is contained in {@code idSet}.
     *
     * <p>Document numbers run up to {@code maxDoc()}, not {@code numDocs()},
     * once the index contains deletions, so the scan covers the full range and
     * skips deleted slots (loading a deleted document is an error in Lucene).
     *
     * @param indexReader open reader over the index
     * @param idSet       ids of the documents belonging to the cluster
     * @param idField     stored field holding the document id, or {@code null}
     *                    to use the Lucene document number as the id
     * @return bitset indexed by Lucene document number
     * @throws IOException if a stored document cannot be loaded
     */
    private static OpenBitSet getClusterDocBitset(IndexReader indexReader, Set<String> idSet, String idField) throws IOException {
        int maxDoc = indexReader.maxDoc();
        OpenBitSet clusterDocs = new OpenBitSet(maxDoc);
        // Only the id field is loaded per document; no selector is needed when
        // the Lucene document number itself serves as the id.
        SetBasedFieldSelector idFieldSelector = null;
        if (idField != null) {
            idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.<String>emptySet());
        }
        for (int docId = 0; docId < maxDoc; docId++) {
            if (indexReader.isDeleted(docId)) {
                continue;
            }
            String id;
            if (idField == null) {
                id = Integer.toString(docId);
            } else {
                id = indexReader.document(docId, idFieldSelector).get(idField);
            }
            if (idSet.contains(id)) {
                clusterDocs.set(docId);
            }
        }
        log.info("Created bitset for in-cluster documents : {}", Long.valueOf(clusterDocs.cardinality()));
        return clusterDocs;
    }

    /**
     * Scores a term with the log-likelihood ratio of the 2x2 table built from
     * its document frequencies.
     *
     * @param inDF        number of in-cluster documents containing the term
     * @param outDF       number of out-of-cluster documents containing the term
     * @param clusterSize number of documents in the cluster
     * @param corpusSize  number of documents in the whole index
     * @return the log-likelihood ratio computed by {@code LogLikelihood}
     */
    private static double scoreDocumentFrequencies(int inDF, int outDF, int clusterSize, int corpusSize) {
        int inClusterWithoutTerm = clusterSize - inDF;
        int outClusterWithoutTerm = (corpusSize - clusterSize) - outDF;
        return LogLikelihood.logLikelihoodRatio(inDF, inClusterWithoutTerm, outDF, outClusterWithoutTerm);
    }

    /**
     * @return name of the index field holding the document id, or {@code null}
     *         when Lucene document numbers are used as ids
     */
    public String getIdField() {
        return idField;
    }

    /**
     * Sets the index field that holds the document id.
     *
     * @param idField field name, or {@code null} to use Lucene document numbers
     */
    public void setIdField(String idField) {
        this.idField = idField;
    }

    /**
     * @return path of the output file, or {@code null} when labels are written
     *         to {@code System.out}
     */
    public String getOutput() {
        return output;
    }

    /**
     * Sets where the label report is written.
     *
     * @param output path of the output file, or {@code null} for {@code System.out}
     */
    public void setOutput(String output) {
        this.output = output;
    }

    /**
     * Command-line entry point: parses the options, builds a
     * {@link ClusterLabels} instance and writes the labels of every cluster.
     *
     * <p>Required options are the index directory ({@code -d}), the content
     * field ({@code -f}), the cluster sequence file directory ({@code -s}) and
     * the points directory ({@code -p}). The output file, id field, maximum
     * label count and minimum cluster size are optional. Invalid options,
     * including non-numeric values for the two numeric options, are logged and
     * followed by the usage help instead of terminating with an uncaught
     * exception.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
        ArgumentBuilder argumentBuilder = new ArgumentBuilder();
        GroupBuilder groupBuilder = new GroupBuilder();

        DefaultOption indexDirOpt = optionBuilder.withLongName("dir").withRequired(true)
                .withArgument(argumentBuilder.withName("dir").withMinimum(1).withMaximum(1).create())
                .withDescription("The Lucene index directory").withShortName("d").create();
        DefaultOption outputOpt = optionBuilder.withLongName(CollocDriver.DEFAULT_OUTPUT_DIRECTORY).withRequired(false)
                .withArgument(argumentBuilder.withName(CollocDriver.DEFAULT_OUTPUT_DIRECTORY).withMinimum(1).withMaximum(1).create())
                .withDescription("The output file. If not specified, the result is printed on console.").withShortName("o").create();
        DefaultOption fieldOpt = optionBuilder.withLongName("field").withRequired(true)
                .withArgument(argumentBuilder.withName("field").withMinimum(1).withMaximum(1).create())
                .withDescription("The content field in the index").withShortName("f").create();
        DefaultOption idFieldOpt = optionBuilder.withLongName("idField").withRequired(false)
                .withArgument(argumentBuilder.withName("idField").withMinimum(1).withMaximum(1).create())
                .withDescription("The field for the document ID in the index.  If null, then the Lucene internal doc id is used which is prone to error if the underlying index changes").withShortName("i").create();
        DefaultOption seqFileDirOpt = optionBuilder.withLongName("seqFileDir").withRequired(true)
                .withArgument(argumentBuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create())
                .withDescription("The directory containing Sequence Files for the Clusters").withShortName("s").create();
        DefaultOption pointsDirOpt = optionBuilder.withLongName("pointsDir").withRequired(true)
                .withArgument(argumentBuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create())
                .withDescription("The directory containing points sequence files mapping input vectors to their cluster.  ").withShortName("p").create();
        DefaultOption minClusterSizeOpt = optionBuilder.withLongName("minClusterSize").withRequired(false)
                .withArgument(argumentBuilder.withName("minClusterSize").withMinimum(1).withMaximum(1).create())
                .withDescription("The minimum number of points required in a cluster to print the labels for").withShortName("m").create();
        DefaultOption maxLabelsOpt = optionBuilder.withLongName("maxLabels").withRequired(false)
                .withArgument(argumentBuilder.withName("maxLabels").withMinimum(1).withMaximum(1).create())
                .withDescription("The maximum number of labels to print per cluster").withShortName("x").create();
        DefaultOption helpOpt = optionBuilder.withLongName("help")
                .withDescription("Print out help").withShortName("h").create();

        Group group = groupBuilder.withName("Options")
                .withOption(indexDirOpt).withOption(idFieldOpt).withOption(outputOpt).withOption(fieldOpt)
                .withOption(seqFileDirOpt).withOption(pointsDirOpt).withOption(helpOpt)
                .withOption(maxLabelsOpt).withOption(minClusterSizeOpt).create();

        try {
            Parser parser = new Parser();
            parser.setGroup(group);
            CommandLine cmdLine = parser.parse(args);
            if (cmdLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(group);
                return;
            }

            String seqFileDir = cmdLine.getValue(seqFileDirOpt).toString();
            String pointsDir = cmdLine.getValue(pointsDirOpt).toString();
            String indexDir = cmdLine.getValue(indexDirOpt).toString();
            String contentField = cmdLine.getValue(fieldOpt).toString();

            String idField = null;
            if (cmdLine.hasOption(idFieldOpt)) {
                idField = cmdLine.getValue(idFieldOpt).toString();
            }
            String output = null;
            if (cmdLine.hasOption(outputOpt)) {
                output = cmdLine.getValue(outputOpt).toString();
            }
            int maxLabels = DEFAULT_MAX_LABELS;
            if (cmdLine.hasOption(maxLabelsOpt)) {
                maxLabels = Integer.parseInt(cmdLine.getValue(maxLabelsOpt).toString());
            }
            int minSize = DEFAULT_MIN_IDS;
            if (cmdLine.hasOption(minClusterSizeOpt)) {
                minSize = Integer.parseInt(cmdLine.getValue(minClusterSizeOpt).toString());
            }

            ClusterLabels clusterLabels = new ClusterLabels(seqFileDir, pointsDir, indexDir, contentField, minSize, maxLabels);
            if (idField != null) {
                clusterLabels.setIdField(idField);
            }
            if (output != null) {
                clusterLabels.setOutput(output);
            }
            clusterLabels.getLabels();
        } catch (IOException e) {
            log.error("Exception", e);
        } catch (OptionException e) {
            log.error("Exception", e);
            CommandLineUtil.printHelp(group);
        } catch (NumberFormatException e) {
            // Integer.parseInt rejects a non-numeric --maxLabels / --minClusterSize value;
            // treat it like any other bad option instead of crashing.
            log.error("Exception", e);
            CommandLineUtil.printHelp(group);
        }
    }
}
