package ws.palladian.classification.text.evaluation;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang3.Validate;
import ws.palladian.classification.evaluation.roc.RocCurves;
import ws.palladian.classification.text.BayesScorer;
import ws.palladian.classification.text.FeatureSettingBuilder;
import ws.palladian.classification.text.evaluation.PalladianTextClassifierOptimizerConfig;
import ws.palladian.core.Instance;
import ws.palladian.core.InstanceBuilder;
import ws.palladian.core.dataset.AbstractDataset;
import ws.palladian.core.dataset.FeatureInformation;
import ws.palladian.core.dataset.FeatureInformationBuilder;
import ws.palladian.core.dataset.split.RandomSplit;
import ws.palladian.core.value.TextValue;
import ws.palladian.helper.ProgressMonitor;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.constants.Language;
import ws.palladian.helper.functional.Function;
import ws.palladian.helper.io.CloseableIterator;
import ws.palladian.helper.io.CloseableIteratorAdapter;
import ws.palladian.helper.io.DelimitedStringHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineIterator;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:ws/palladian/classification/text/evaluation/TwitterSentimentDatasetIterator.class */
public final class TwitterSentimentDatasetIterator extends AbstractDataset {
    private final File datasetFile;
    private final Set<NormalizationOptions> options;
    private final int numLines;

    /* loaded from: input_file:ws/palladian/classification/text/evaluation/TwitterSentimentDatasetIterator$NormalizationOptions.class */
    public enum NormalizationOptions {
        QUERY_TERM,
        USER_NAMES,
        LINKS,
        REPEATED_LETTERS
    }

    public TwitterSentimentDatasetIterator(File file, NormalizationOptions... normalizationOptionsArr) {
        Validate.notNull(file, "datasetFile must not be null", new Object[0]);
        this.datasetFile = file;
        this.options = CollectionHelper.newHashSet(normalizationOptionsArr);
        this.numLines = FileHelper.getNumberOfLines(file);
    }

    public TwitterSentimentDatasetIterator(File file) {
        this(file, NormalizationOptions.values());
    }

    /* renamed from: iterator, reason: merged with bridge method [inline-methods] */
    public CloseableIterator<Instance> m8iterator() {
        LineIterator lineIterator = new LineIterator(this.datasetFile);
        final ProgressMonitor progressMonitor = new ProgressMonitor();
        progressMonitor.startTask(getClass().getSimpleName(), this.numLines);
        return new CloseableIteratorAdapter(CollectionHelper.convert(lineIterator, new Function<String, Instance>() { // from class: ws.palladian.classification.text.evaluation.TwitterSentimentDatasetIterator.1
            public Instance compute(String str) {
                List splitLine = DelimitedStringHelper.splitLine(str, ',', '\"');
                if (splitLine.size() != 6) {
                    throw new IllegalStateException("Expected six columns, got " + splitLine.size() + " in '" + str + "'");
                }
                String str2 = (String) splitLine.get(0);
                String str3 = (String) splitLine.get(5);
                if (TwitterSentimentDatasetIterator.this.options.contains(NormalizationOptions.QUERY_TERM)) {
                    str3 = StringHelper.replaceWord((String) splitLine.get(3), "QUERY_TERM", str3);
                }
                if (TwitterSentimentDatasetIterator.this.options.contains(NormalizationOptions.USER_NAMES)) {
                    str3 = str3.replaceAll("@[^\\s]+", "USERNAME");
                }
                if (TwitterSentimentDatasetIterator.this.options.contains(NormalizationOptions.LINKS)) {
                    str3 = str3.replaceAll("https?://[^\\s]+", "URL");
                }
                if (TwitterSentimentDatasetIterator.this.options.contains(NormalizationOptions.REPEATED_LETTERS)) {
                    str3 = str3.replaceAll("(\\w)\\1{3,}", "$1$1");
                }
                progressMonitor.increment();
                return new InstanceBuilder().setText(str3).create(str2);
            }
        }));
    }

    public FeatureInformation getFeatureInformation() {
        return new FeatureInformationBuilder().set("text", TextValue.class).create();
    }

    public long size() {
        return this.numLines;
    }

    public static void main(String[] strArr) {
        RandomSplit randomSplit = new RandomSplit(new TwitterSentimentDatasetIterator(new File("/Users/pk/Desktop/training.1600000.processed.noemoticon.csv")).buffer(), 0.5d);
        PalladianTextClassifierOptimizerConfig.Builder withEvaluator = PalladianTextClassifierOptimizerConfig.withEvaluator(new RocCurves.RocCurvesEvaluator("4"));
        ArrayList arrayList = new ArrayList();
        arrayList.addAll(new FeatureSettingGenerator().words(1, 3).create());
        arrayList.addAll(new FeatureSettingGenerator().chars(3, 10).create());
        arrayList.add(FeatureSettingBuilder.words().language(Language.ENGLISH).removeStopwords().create());
        arrayList.add(FeatureSettingBuilder.words().language(Language.ENGLISH).removeStopwords().stem().create());
        arrayList.add(FeatureSettingBuilder.words().language(Language.ENGLISH).stem().create());
        arrayList.add(FeatureSettingBuilder.words(1, 2).language(Language.ENGLISH).removeStopwords().create());
        arrayList.add(FeatureSettingBuilder.words(1, 2).language(Language.ENGLISH).removeStopwords().stem().create());
        arrayList.add(FeatureSettingBuilder.words(1, 2).language(Language.ENGLISH).stem().create());
        ArrayList arrayList2 = new ArrayList();
        arrayList2.add(new BayesScorer(new BayesScorer.Options[]{BayesScorer.Options.LAPLACE}));
        arrayList2.add(new BayesScorer(new BayesScorer.Options[]{BayesScorer.Options.FREQUENCIES, BayesScorer.Options.LAPLACE}));
        arrayList2.add(new BayesScorer(new BayesScorer.Options[]{BayesScorer.Options.LAPLACE, BayesScorer.Options.PRIORS}));
        arrayList2.add(new BayesScorer(new BayesScorer.Options[]{BayesScorer.Options.FREQUENCIES, BayesScorer.Options.LAPLACE, BayesScorer.Options.PRIORS}));
        withEvaluator.setScorers(arrayList2);
        withEvaluator.setFeatureSettings(arrayList);
        withEvaluator.create().runOptimization(randomSplit.getTrain(), randomSplit.getTest(), "/Users/pk/Desktop/evaluation-result.csv", new ProgressMonitor());
    }
}
