package ws.palladian.extraction.location.experimental;

import java.io.File;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.lang3.Validate;
import ws.palladian.classification.text.DictionaryModel;
import ws.palladian.classification.text.DictionaryTrieModel;
import ws.palladian.classification.text.PruningStrategies;
import ws.palladian.core.Annotation;
import ws.palladian.extraction.entity.Annotations;
import ws.palladian.extraction.entity.FileFormatParser;
import ws.palladian.extraction.entity.tagger.NerHelper;
import ws.palladian.helper.ProgressMonitor;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.functional.Functions;
import ws.palladian.helper.functional.Predicates;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:ws/palladian/extraction/location/experimental/PatternAnalyzer.class */
public class PatternAnalyzer {

    /* loaded from: input_file:ws/palladian/extraction/location/experimental/PatternAnalyzer$Direction.class */
    public enum Direction {
        LEFT,
        RIGHT
    }

    public static void extractPatterns(File file, File file2, final Direction direction, final int i, int i2, double d, String... strArr) {
        Validate.notNull(file, "inputFile must not be null", new Object[0]);
        Validate.notNull(file2, "outputPath must not be null", new Object[0]);
        Validate.notNull(direction, "direction must not be null", new Object[0]);
        Validate.isTrue(i > 0, "size must be greater zero", new Object[0]);
        Validate.isTrue(i2 > 0, "minCount must be greater zero", new Object[0]);
        Validate.isTrue(d > 0.0d, "minProb must be greater zero", new Object[0]);
        final DictionaryTrieModel.Builder builder = new DictionaryTrieModel.Builder();
        int numberOfLines = FileHelper.getNumberOfLines(file);
        final HashSet newHashSet = strArr.length != 0 ? CollectionHelper.newHashSet(strArr) : null;
        final ProgressMonitor progressMonitor = new ProgressMonitor();
        progressMonitor.startTask((String) null, numberOfLines);
        FileHelper.performActionOnEveryLine(file, new LineAction() { // from class: ws.palladian.extraction.location.experimental.PatternAnalyzer.1
            public void performAction(String str, int i3) {
                progressMonitor.increment();
                if (str.startsWith("=-DOCSTART-")) {
                    return;
                }
                String normalizeQuotes = StringHelper.normalizeQuotes(StringHelper.replaceProtectedSpace(str));
                Annotations<Annotation> annotationsFromXmlText = FileFormatParser.getAnnotationsFromXmlText(normalizeQuotes);
                String stripHtmlTags = HtmlHelper.stripHtmlTags(normalizeQuotes);
                Iterator<T> it = annotationsFromXmlText.iterator();
                while (it.hasNext()) {
                    Annotation annotation = (Annotation) it.next();
                    if (newHashSet == null || newHashSet.contains(annotation.getTag())) {
                        builder.addDocument(CollectionHelper.filterList(CollectionHelper.convertList(direction == Direction.LEFT ? NerHelper.getLeftContexts(annotation, stripHtmlTags, i) : NerHelper.getRightContexts(annotation, stripHtmlTags, i), Functions.LOWERCASE), Predicates.regex(".{2,}")), annotation.getTag());
                    }
                }
            }
        });
        HashSet hashSet = new HashSet();
        hashSet.add(new PruningStrategies.TermCountPruningStrategy(i2));
        hashSet.add(new PruningStrategies.MinProbabilityPruningStrategy(d));
        builder.setPruningStrategy(Predicates.and(hashSet));
        DictionaryModel dictionaryModel = (DictionaryModel) builder.create();
        System.out.println("Category probabilities: " + dictionaryModel.getDocumentCounts());
        System.out.println(dictionaryModel);
        File file3 = new File(file2, "contexts_" + direction + "_" + System.currentTimeMillis() + ".txt");
        for (DictionaryModel.DictionaryEntry dictionaryEntry : dictionaryModel) {
            String mostLikelyCategory = dictionaryEntry.getCategoryEntries().getMostLikelyCategory();
            FileHelper.appendFile(file3.getPath(), direction == Direction.LEFT ? dictionaryEntry.getTerm() + " *\t" + mostLikelyCategory + "\n" : "* " + dictionaryEntry.getTerm() + "\t" + mostLikelyCategory + "\n");
        }
    }

    public static void main(String[] strArr) throws Exception {
        File file = new File("/Users/pk/Desktop/Wikipedia-EN-entity-dataset/annotations-combined.xml");
        File file2 = new File("/Users/pk/Desktop");
        extractPatterns(file, file2, Direction.RIGHT, 1, 50, 0.9d, "LOC", "PER");
        extractPatterns(file, file2, Direction.LEFT, 1, 50, 0.9d, "LOC", "PER");
    }
}
