package ws.palladian.extraction.pos;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.classification.text.FeatureSettingBuilder;
import ws.palladian.classification.universal.UniversalClassifier;
import ws.palladian.classification.universal.UniversalClassifierModel;
import ws.palladian.classification.utils.ClassifierEvaluation;
import ws.palladian.core.FeatureVector;
import ws.palladian.core.Instance;
import ws.palladian.core.InstanceBuilder;
import ws.palladian.helper.ProgressMonitor;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.AbstractIterator;
import ws.palladian.helper.collection.ArrayIterator;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.math.ConfusionMatrix;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:ws/palladian/extraction/pos/PalladianPosTagger.class */
/**
 * Part-of-speech tagger which classifies every token on its own using a {@link UniversalClassifier}.
 *
 * <p>
 * Each token is converted into a feature vector by {@link #extractFeatures(String)} and the most likely category
 * returned by the classifier is used as the token's tag. Models are either deserialized from a file, handed in
 * directly, or created via {@link #trainModel(String)} from a folder of files containing whitespace separated
 * {@code word/tag} tokens.
 */
public class PalladianPosTagger extends AbstractPosTagger {
    private static final Logger LOGGER = LoggerFactory.getLogger(PalladianPosTagger.class);

    /** The name reported by {@link #getName()}. */
    private static final String TAGGER_NAME = "Palladian POS-Tagger";

    /** The classifier used for tagging; configured identically to the one used in {@link #trainModel(String)}. */
    private final UniversalClassifier tagger = getTagger();

    /** The trained model which is passed to the classifier on every classification. */
    private final UniversalClassifierModel model;

    /**
     * Iterates over all training instances found in the files of a folder. The files are read one at a time, so
     * only the instances of the current file are kept in memory. The file format expected here is a sequence of
     * whitespace separated {@code word/tag} tokens.
     */
    private static final class BrownCorpusIterator extends AbstractIterator<Instance> {
        final ProgressMonitor progressMonitor = new ProgressMonitor();
        final Iterator<File> trainingFiles;
        /** Instances of the file currently being consumed; {@code null} until the first file was read. */
        Iterator<Instance> currentInstances;

        /**
         * @param folderPath Path to the folder with the training files.
         */
        BrownCorpusIterator(String folderPath) {
            File[] files = FileHelper.getFiles(folderPath);
            this.trainingFiles = new ArrayIterator<File>(files);
            // one progress step per file, see getNext()
            this.progressMonitor.startTask((String) null, files.length);
        }

        /**
         * Delivers the next training instance, moving on to the next file when the current one is exhausted.
         *
         * @return The next instance.
         * @throws AbstractIterator.Finished When all files have been consumed.
         */
        @Override
        protected Instance getNext() throws AbstractIterator.Finished {
            // Loop instead of a single step: a file may contain no usable token at all, in which case its
            // iterator is empty and we must continue with the following file.
            while (this.currentInstances == null || !this.currentInstances.hasNext()) {
                if (!this.trainingFiles.hasNext()) {
                    throw FINISHED;
                }
                this.progressMonitor.increment();
                this.currentInstances = createInstances(this.trainingFiles.next());
            }
            return this.currentInstances.next();
        }

        /**
         * Reads the given file and creates one instance per valid {@code word/tag} token. Tokens without a tag,
         * with an empty word, or whose tag is empty after {@link AbstractPosTagger#normalizeTag(String)} are
         * skipped.
         *
         * @param file The training file to read.
         * @return An iterator over the instances created from the file, potentially empty.
         * @throws IllegalStateException When the file cannot be read.
         */
        private Iterator<Instance> createInstances(File file) {
            String content;
            try {
                content = FileHelper.readFileToString(file);
            } catch (IOException e) {
                throw new IllegalStateException("Could not read training file " + file, e);
            }
            List<Instance> instances = new ArrayList<Instance>();
            for (String token : content.split("\\s")) {
                String[] parts = token.split("/");
                // Check the length first: String#split drops trailing empty strings, so a token such as "/"
                // yields a zero-length array.
                if (parts.length < 2 || parts[0].isEmpty()) {
                    continue;
                }
                String tag = AbstractPosTagger.normalizeTag(parts[1]);
                if (!tag.isEmpty()) {
                    instances.add(new InstanceBuilder().add(PalladianPosTagger.extractFeatures(parts[0])).create(tag));
                }
            }
            return instances.iterator();
        }
    }

    /**
     * Creates a tagger using a serialized model.
     *
     * @param modelFilePath Path to the serialized {@link UniversalClassifierModel}.
     * @throws IllegalStateException When the model cannot be deserialized.
     */
    public PalladianPosTagger(String modelFilePath) {
        try {
            this.model = (UniversalClassifierModel) FileHelper.deserialize(modelFilePath);
        } catch (IOException e) {
            // keep the cause, otherwise the reason for the failure is lost
            throw new IllegalStateException("Could not deserialize model from \"" + modelFilePath + "\"", e);
        }
    }

    /**
     * Creates a tagger using the given model.
     *
     * @param model The model to use for tagging.
     */
    public PalladianPosTagger(UniversalClassifierModel model) {
        this.model = model;
    }

    /**
     * Tags each token independently of its neighbors.
     *
     * @param tokens The tokens to tag.
     * @return The most likely category for each token, in the order of the given tokens.
     */
    @Override
    protected List<String> getTags(List<String> tokens) {
        List<String> tags = new ArrayList<String>(tokens.size());
        for (String token : tokens) {
            tags.add(this.tagger.classify(extractFeatures(token), this.model).getMostLikelyCategory());
        }
        return tags;
    }

    /**
     * @return A new classifier with the configuration shared by training, tagging and evaluation.
     */
    private static UniversalClassifier getTagger() {
        // NOTE(review): "m40create" looks like a decompiler-generated name (presumably "create") -- confirm
        // against the actual FeatureSettingBuilder before building against the original library.
        return new UniversalClassifier(FeatureSettingBuilder.chars(1, 7).m40create(), UniversalClassifier.ClassifierSetting.TEXT, UniversalClassifier.ClassifierSetting.BAYES);
    }

    /**
     * Trains a new model from all files in the given folder.
     *
     * @param folderPath Path to the folder with the training files, not empty.
     * @return The trained model.
     */
    public static UniversalClassifierModel trainModel(final String folderPath) {
        Validate.notEmpty(folderPath, "folderPath must not be empty");
        StopWatch stopWatch = new StopWatch();
        LOGGER.info("start training the tagger");
        // A fresh iterator is created on every call to iterator(), so the classifier may pass over the data
        // more than once.
        Iterable<Instance> trainingInstances = new Iterable<Instance>() {
            @Override
            public Iterator<Instance> iterator() {
                return new BrownCorpusIterator(folderPath);
            }
        };
        UniversalClassifierModel trainedModel = getTagger().train(trainingInstances);
        LOGGER.info("finished training tagger in {}", stopWatch.getElapsedTimeString());
        return trainedModel;
    }

    /**
     * Builds the feature vector for a single word: casing, length, number, punctuation and prefix/suffix
     * features, plus the word itself both as a feature and as text.
     *
     * @param word The word for which to extract features, not <code>null</code>.
     * @return The feature vector for the word.
     */
    public static FeatureVector extractFeatures(String word) {
        int length = word.length();
        InstanceBuilder builder = new InstanceBuilder();
        builder.set("startsUppercase", StringHelper.startsUppercase(word));
        builder.set("length1", length == 1);
        builder.set("length2", length == 2);
        builder.set("length3", length == 3);
        builder.set("length", String.valueOf(length));
        builder.set("number", StringHelper.isNumberOrNumberWord(word));
        builder.set("completelyUppercase", StringHelper.isCompletelyUppercase(word));
        // every character which is not one of the listed punctuation characters gets replaced
        builder.set("normalizedLength", String.valueOf(word.replaceAll("[^`'\",.:;*\\(\\)]", Instance.NO_CATEGORY_DUMMY).length()));
        // Guard the substring calls so that an empty word does not cause a StringIndexOutOfBoundsException;
        // the fallback is the same as the one used for "lastTwoCharacters".
        builder.set("lastCharacter", length > 0 ? word.substring(length - 1) : Instance.NO_CATEGORY_DUMMY);
        builder.set("firstCharacter", length > 0 ? word.substring(0, 1) : Instance.NO_CATEGORY_DUMMY);
        builder.set("lastTwoCharacters", length > 1 ? word.substring(length - 2) : Instance.NO_CATEGORY_DUMMY);
        builder.set("word", word);
        builder.setText(word);
        return builder.create();
    }

    /**
     * Evaluates this tagger's model on all files in the given folder.
     *
     * @param folderPath Path to the folder with the test files, not empty.
     * @return The confusion matrix produced by {@link ClassifierEvaluation}.
     */
    public ConfusionMatrix evaluate(final String folderPath) {
        Validate.notEmpty(folderPath, "folderPath must not be empty");
        Iterable<Instance> testInstances = new Iterable<Instance>() {
            @Override
            public Iterator<Instance> iterator() {
                return new BrownCorpusIterator(folderPath);
            }
        };
        return ClassifierEvaluation.evaluate(this.tagger, testInstances, this.model);
    }

    @Override
    public String getName() {
        return TAGGER_NAME;
    }
}
