package ws.palladian.extraction.text.vector;

import java.util.Iterator;
import java.util.Map;
import org.apache.commons.math3.util.FastMath;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.classification.text.FeatureSetting;
import ws.palladian.classification.text.Preprocessor;
import ws.palladian.core.FeatureVector;
import ws.palladian.core.Instance;
import ws.palladian.core.InstanceBuilder;
import ws.palladian.core.dataset.AbstractDatasetFeatureVectorTransformer;
import ws.palladian.core.dataset.Dataset;
import ws.palladian.core.dataset.FeatureInformation;
import ws.palladian.core.dataset.FeatureInformationBuilder;
import ws.palladian.core.value.ImmutableFloatValue;
import ws.palladian.core.value.NominalValue;
import ws.palladian.core.value.TextValue;
import ws.palladian.core.value.Value;
import ws.palladian.core.value.ValueDefinitions;
import ws.palladian.extraction.feature.MapTermCorpus;
import ws.palladian.extraction.feature.TermCorpus;
import ws.palladian.helper.collection.Bag;
import ws.palladian.helper.collection.CollectionHelper;

/* loaded from: input_file:ws/palladian/extraction/text/vector/TextVectorizer.class */
public class TextVectorizer extends AbstractDatasetFeatureVectorTransformer implements ITextVectorizer {
    private static final Logger LOGGER = LoggerFactory.getLogger(TextVectorizer.class);
    private final String inputFeatureName;
    private final FeatureSetting featureSetting;
    private final Preprocessor preprocessor;
    private final TermCorpus termCorpus;
    private final TFStrategy tfStrategy;
    private final IDFStrategy idfStrategy;
    private final int alpha;

    /* loaded from: input_file:ws/palladian/extraction/text/vector/TextVectorizer$IDFStrategy.class */
    /**
     * Strategies for the inverse-document-frequency (IDF) factor of a term weight.
     *
     * <p>{@code TextVectorizer.apply} invokes {@code calc} with, in this order: the term's count in the
     * term corpus, the number of documents in the term corpus, and the highest count of any single term in
     * the document being vectorized.
     *
     * <p>All quotients are evaluated in floating point. Integer division would truncate the ratio before the
     * logarithm is taken (e.g. {@code 3 / 4 == 0}, giving {@code log(0) == -Infinity}).
     */
    public enum IDFStrategy {
        /** Constant factor of one, i.e. no IDF weighting. */
        UNARY {
            @Override
            float calc(int termDocs, int numDocs, int maxTermCount) {
                return 1.0f;
            }
        },
        /** {@code log(numDocs / termDocs)}; yields 0 for a term with a corpus count of zero. */
        IDF {
            @Override
            float calc(int termDocs, int numDocs, int maxTermCount) {
                if (termDocs == 0) {
                    // A term unknown to the corpus has no defined IDF; weight it with zero instead of
                    // failing with a division by zero.
                    return 0.0f;
                }
                return (float) FastMath.log((double) numDocs / termDocs);
            }
        },
        /** {@code log(numDocs / (termDocs + 1))}. */
        IDF_SMOOTH {
            @Override
            float calc(int termDocs, int numDocs, int maxTermCount) {
                return (float) FastMath.log(numDocs / (termDocs + 1.0d));
            }
        },
        /**
         * {@code log((maxTermCount * termDocs) / (1 + termDocs))}.
         *
         * <p>NOTE(review): the commonly cited "max" IDF variant is {@code log(max / (1 + termDocs))}; the
         * extra {@code termDocs} factor in the numerator is kept unchanged here - confirm the intent.
         */
        IDF_MAX {
            @Override
            float calc(int termDocs, int numDocs, int maxTermCount) {
                // Multiply in double so that large counts cannot overflow the int range.
                return (float) FastMath.log(((double) maxTermCount * termDocs) / (1.0d + termDocs));
            }
        };

        /**
         * Computes the IDF factor.
         *
         * @param termDocs     count of the term in the term corpus
         * @param numDocs      number of documents in the term corpus
         * @param maxTermCount highest count of any term in the current document
         * @return the IDF factor; may be negative or infinite where the logarithm's argument is below one
         *         or zero
         */
        abstract float calc(int termDocs, int numDocs, int maxTermCount);
    }

    /* loaded from: input_file:ws/palladian/extraction/text/vector/TextVectorizer$TFStrategy.class */
    /**
     * Strategies for the term-frequency (TF) factor of a term weight.
     *
     * <p>{@code TextVectorizer.apply} invokes {@code calc} with, in this order: the term's count in the
     * document plus {@code alpha}, the size reported by the document's term bag, and the highest count of
     * any single term in the document.
     *
     * <p>Ratios are evaluated in floating point. Integer division would truncate every ratio below one to
     * zero and would throw {@link ArithmeticException} for a zero denominator (e.g. an empty document).
     */
    public enum TFStrategy {
        /** 1 if the term count is positive, otherwise 0. */
        BINARY {
            @Override
            float calc(int termCount, int numTerms, int maxTermCount) {
                return termCount > 0 ? 1.0f : 0.0f;
            }
        },
        /** The unmodified term count. */
        RAW_COUNT {
            @Override
            float calc(int termCount, int numTerms, int maxTermCount) {
                return termCount;
            }
        },
        /** {@code termCount / numTerms}; yields 0 when {@code numTerms} is zero. */
        TERM_FREQUENCY {
            @Override
            float calc(int termCount, int numTerms, int maxTermCount) {
                if (numTerms == 0) {
                    return 0.0f;
                }
                return (float) termCount / numTerms;
            }
        },
        /** {@code 1 + log(termCount)}; not finite for a term count of zero. */
        LOG_NORMALIZATION {
            @Override
            float calc(int termCount, int numTerms, int maxTermCount) {
                return (float) (1.0d + FastMath.log(termCount));
            }
        },
        /** {@code 0.5 + 0.5 * termCount / maxTermCount}; yields 0.5 when {@code maxTermCount} is zero. */
        DOUBLE_NORMALIZATION {
            @Override
            float calc(int termCount, int numTerms, int maxTermCount) {
                if (maxTermCount == 0) {
                    return 0.5f;
                }
                return 0.5f + (0.5f * ((float) termCount / maxTermCount));
            }
        };

        /**
         * Computes the TF factor.
         *
         * @param termCount    count of the term in the document (including any added {@code alpha})
         * @param numTerms     size of the document's term bag
         * @param maxTermCount highest count of any term in the document
         * @return the TF factor
         */
        abstract float calc(int termCount, int numTerms, int maxTermCount);
    }

    /**
     * Creates a vectorizer whose term corpus is built from the given dataset, using an alpha of 0.
     * Delegates to the seven-argument constructor.
     *
     * @param str            name of the feature that holds the text to vectorize
     * @param featureSetting setting used to create the preprocessor
     * @param dataset        dataset from which the term corpus is built
     * @param tFStrategy     strategy for the term-frequency factor
     * @param iDFStrategy    strategy for the inverse-document-frequency factor
     * @param i              value handed to {@code getReducedCorpus} when reducing the term corpus
     */
    public TextVectorizer(String str, FeatureSetting featureSetting, Dataset dataset, TFStrategy tFStrategy, IDFStrategy iDFStrategy, int i) {
        // The trailing 0 is the alpha value.
        this(str, featureSetting, dataset, tFStrategy, iDFStrategy, i, 0);
    }

    /**
     * Creates a vectorizer and builds its term corpus from the given dataset.
     *
     * <p>Every instance's text is preprocessed and its distinct terms are added to a corpus as one
     * document; the corpus is then reduced via {@code getReducedCorpus(i)}.
     *
     * @param str            name of the feature that holds the text to vectorize
     * @param featureSetting setting used to create the preprocessor
     * @param dataset        dataset from which the term corpus is built
     * @param tFStrategy     strategy for the term-frequency factor
     * @param iDFStrategy    strategy for the inverse-document-frequency factor
     * @param i              value handed to {@code getReducedCorpus} when reducing the term corpus
     * @param i2             alpha; added to every term's document count before the TF factor is computed
     * @throws IllegalArgumentException if an instance's input feature is neither a nominal nor a text value
     */
    public TextVectorizer(String str, FeatureSetting featureSetting, Dataset dataset, TFStrategy tFStrategy, IDFStrategy iDFStrategy, int i, int i2) {
        this.inputFeatureName = str;
        this.featureSetting = featureSetting;
        this.tfStrategy = tFStrategy;
        this.idfStrategy = iDFStrategy;
        this.alpha = i2;
        this.preprocessor = new Preprocessor(featureSetting);

        // Collect the distinct terms of each document into the unreduced corpus.
        MapTermCorpus fullCorpus = new MapTermCorpus();
        for (Iterator<Instance> instances = dataset.iterator2(); instances.hasNext();) {
            String text = getTextValue(instances.next().getVector());
            fullCorpus.addTermsFromDocument(CollectionHelper.newHashSet(this.preprocessor.apply(text)));
        }
        this.termCorpus = fullCorpus.getReducedCorpus(i);

        int originalSize = fullCorpus.getNumUniqueTerms();
        int reducedSize = this.termCorpus.getNumUniqueTerms();
        if (reducedSize < originalSize) {
            LOGGER.debug("Reduced term corpus from {} to {}", originalSize, reducedSize);
        }
    }

    /**
     * Creates a vectorizer around an already existing term corpus; nothing is recomputed.
     *
     * @param inputFeatureName name of the feature that holds the text to vectorize
     * @param featureSetting   setting used to create the preprocessor
     * @param termCorpus       term corpus to use as is
     * @param tfStrategy       strategy for the term-frequency factor
     * @param idfStrategy      strategy for the inverse-document-frequency factor
     * @param alpha            value added to every term's document count before the TF factor is computed
     */
    private TextVectorizer(String inputFeatureName, FeatureSetting featureSetting, TermCorpus termCorpus, TFStrategy tfStrategy, IDFStrategy idfStrategy, int alpha) {
        this.inputFeatureName = inputFeatureName;
        this.termCorpus = termCorpus;
        this.tfStrategy = tfStrategy;
        this.idfStrategy = idfStrategy;
        this.alpha = alpha;
        this.featureSetting = featureSetting;
        this.preprocessor = new Preprocessor(featureSetting);
    }

    /**
     * Describes the features produced by this transformer: one float-valued entry is registered for the
     * term corpus. The incoming feature information is not consulted.
     *
     * @param featureInformation feature information of the input (ignored)
     * @return feature information built from the term corpus with float values
     */
    @Override // ws.palladian.core.dataset.AbstractDatasetFeatureVectorTransformer, ws.palladian.core.dataset.DatasetTransformer
    public FeatureInformation getFeatureInformation(FeatureInformation featureInformation) {
        FeatureInformationBuilder builder = new FeatureInformationBuilder();
        return builder.set(this.termCorpus, ValueDefinitions.floatValue()).m85create();
    }

    /**
     * Converts the text of the input feature into a vector of term weights.
     *
     * <p>The text is preprocessed into terms which are counted in a bag. Each considered term is weighted
     * with the product of the TF strategy and the IDF strategy. When {@code alpha} is 0 the terms are taken
     * from a {@code MapTermCorpus} created from the document's bag; otherwise every term of the term corpus
     * is visited.
     *
     * @param featureVector vector containing the input text feature
     * @return a new feature vector with one float value per considered term
     * @throws IllegalArgumentException if the input feature is neither a nominal nor a text value
     */
    @Override // ws.palladian.core.dataset.AbstractDatasetFeatureVectorTransformer
    public FeatureVector apply(FeatureVector featureVector) {
        String text = getTextValue(featureVector);
        Bag termCounts = new Bag(CollectionHelper.newArrayList(this.preprocessor.apply(text)));

        // Highest count of any single term in this document; stays 0 when the bag reports no maximum.
        Map.Entry mostFrequent = termCounts.getMax();
        int maxTermCount = 0;
        if (mostFrequent != null) {
            maxTermCount = (Integer) mostFrequent.getValue();
        }

        Iterable<String> terms = this.alpha == 0 ? new MapTermCorpus(termCounts, 1) : this.termCorpus;
        InstanceBuilder builder = new InstanceBuilder();
        for (String term : terms) {
            float tf = this.tfStrategy.calc(termCounts.count(term) + this.alpha, termCounts.size(), maxTermCount);
            float idf = this.idfStrategy.calc(this.termCorpus.getCount(term), this.termCorpus.getNumDocs(), maxTermCount);
            builder.set(term, new ImmutableFloatValue(tf * idf));
        }
        return builder.create();
    }

    /**
     * Reads the text of the configured input feature from the given vector.
     *
     * @param featureVector vector to read the input feature from
     * @return the string of a {@code NominalValue} or the text of a {@code TextValue}
     * @throws IllegalArgumentException if the feature's value is {@code null} or of any other type
     */
    private String getTextValue(FeatureVector featureVector) {
        Value value = (Value) featureVector.get(this.inputFeatureName);
        if (value instanceof NominalValue) {
            return ((NominalValue) value).getString();
        }
        if (value instanceof TextValue) {
            return ((TextValue) value).getText();
        }
        if (value == null) {
            // Without this guard, building the message below would fail with a NullPointerException
            // and hide which feature was affected.
            throw new IllegalArgumentException("No value for feature \"" + this.inputFeatureName + "\"");
        }
        throw new IllegalArgumentException("Invalid type: " + value.getClass().getName());
    }

    /**
     * Creates a vectorizer that shares this instance's input feature name, feature setting and term
     * corpus, but uses the given strategies and alpha. The term corpus is passed on as is and not rebuilt.
     *
     * @param tFStrategy  strategy for the term-frequency factor of the copy
     * @param iDFStrategy strategy for the inverse-document-frequency factor of the copy
     * @param i           alpha value of the copy
     * @return the newly created vectorizer
     */
    public TextVectorizer copyWithDifferentStrategy(TFStrategy tFStrategy, IDFStrategy iDFStrategy, int i) {
        TextVectorizer copy = new TextVectorizer(
                this.inputFeatureName,
                this.featureSetting,
                this.termCorpus,
                tFStrategy,
                iDFStrategy,
                i);
        return copy;
    }

    /**
     * Returns the simple class name followed by the TF strategy, IDF strategy, alpha and feature setting.
     */
    @Override
    public String toString() {
        String className = getClass().getSimpleName();
        return String.format("%s [%s, %s, alpha=%s, %s]", className, this.tfStrategy, this.idfStrategy, this.alpha, this.featureSetting);
    }
}
