package edu.umass.cs.mallet.share.mccallum.ner;

import edu.umass.cs.mallet.base.fst.CRF3;
import edu.umass.cs.mallet.base.fst.MultiSegmentationEvaluator;
import edu.umass.cs.mallet.base.fst.Transducer;
import edu.umass.cs.mallet.base.pipe.Noop;
import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.pipe.PrintTokenSequenceFeatures;
import edu.umass.cs.mallet.base.pipe.SerialPipes;
import edu.umass.cs.mallet.base.pipe.TokenSequence2FeatureVectorSequence;
import edu.umass.cs.mallet.base.pipe.iterator.LineGroupIterator;
import edu.umass.cs.mallet.base.pipe.tsf.FeaturesInWindow;
import edu.umass.cs.mallet.base.pipe.tsf.FeaturesOfFirstMention;
import edu.umass.cs.mallet.base.pipe.tsf.OffsetConjunctions;
import edu.umass.cs.mallet.base.pipe.tsf.RegexMatches;
import edu.umass.cs.mallet.base.pipe.tsf.TokenText;
import edu.umass.cs.mallet.base.pipe.tsf.TokenTextCharNGrams;
import edu.umass.cs.mallet.base.pipe.tsf.TrieLexiconMembership;
import edu.umass.cs.mallet.base.types.Alphabet;
import edu.umass.cs.mallet.base.types.InstanceList;
import edu.umass.cs.mallet.base.util.CommandOption;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.Random;
import java.util.regex.Pattern;
import org.codehaus.groovy.tools.shell.util.ANSI;
import org.hsqldb.Tokens;

/* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/share/mccallum/ner/TUI.class */
/**
 * Command-line trainer for a CRF named-entity tagger over the CoNLL-2003 English data.
 *
 * <p>{@link #main(String[])} parses the command-line options declared below, builds a
 * token-feature pipeline (capitalisation/digit/punctuation regexes, optional lexicons,
 * offset conjunctions, ...), reads {@code eng.train} plus {@code eng.testa} or
 * {@code eng.testb}, and trains a {@link CRF3}, evaluating with a
 * {@link MultiSegmentationEvaluator}.
 *
 * <p>This source was recovered by decompilation; members whose names contain {@code $}
 * are compiler-synthesised leftovers that are retained only so the class keeps the same
 * set of members.
 */
public class TUI {
    // ---------------------------------------------------------------------------------
    // Command-line options. They are constructed in declaration order, which is the same
    // order the original static initializer used.
    // ---------------------------------------------------------------------------------

    /** Variance of the Gaussian prior (used when the hyperbolic prior is disabled). */
    static CommandOption.Double gaussianVarianceOption = new CommandOption.Double(
            TUI.class, "gaussian-variance", "DECIMAL", true, 10.0d,
            "The gaussian prior variance used for training.", null);

    /** Slope of the hyperbolic prior (used when {@code use-hyperbolic-prior} is true). */
    static CommandOption.Double hyperbolicSlopeOption = new CommandOption.Double(
            TUI.class, "hyperbolic-slope", "DECIMAL", true, 0.2d,
            "The hyperbolic prior slope used for training.", null);

    /** Sharpness of the hyperbolic prior (used when {@code use-hyperbolic-prior} is true). */
    static CommandOption.Double hyperbolicSharpnessOption = new CommandOption.Double(
            TUI.class, "hyperbolic-sharpness", "DECIMAL", true, 10.0d,
            "The hyperbolic prior sharpness used for training.", null);

    /** Declared but neither registered in {@link #commandOptions} nor read by {@link #main}. */
    static CommandOption.File crfInputFileOption = new CommandOption.File(
            TUI.class, "crf-input-file", "FILENAME", true, null,
            "The name of the file to write the CRF after training.", null);

    /** Registered, but not read anywhere in this class. */
    static CommandOption.Integer randomSeedOption = new CommandOption.Integer(
            TUI.class, "random-seed", "INTEGER", true, 0,
            "The random seed for randomly selecting a proportion of the instance list for training", null);

    /** Markov order of the label states; {@link #main} accepts only 1 or 2. */
    static CommandOption.Integer labelGramOption = new CommandOption.Integer(
            TUI.class, "label-gram", "INTEGER", true, 1,
            "Markov order of labels: 1 or 2", null);

    /** Half-width of the word window feature; values &lt;= 0 disable it. */
    static CommandOption.Integer wordWindowFeatureOption = new CommandOption.Integer(
            TUI.class, "word-window-size", "INTEGER", true, 0,
            "Size of window of words as features: 0=none, 10, 20...", null);

    /** When true the evaluation file is {@code eng.testb}; otherwise {@code eng.testa}. */
    static CommandOption.Boolean useTestbOption = new CommandOption.Boolean(
            TUI.class, "use-testb", "true|false", true, false,
            "Use testb, final test set", null);

    static CommandOption.Boolean useHyperbolicPriorOption = new CommandOption.Boolean(
            TUI.class, "use-hyperbolic-prior", "true|false", true, false,
            "Use hyperbolic prior", null);

    static CommandOption.Boolean useFeatureInductionOption = new CommandOption.Boolean(
            TUI.class, "use-feature-induction", "true|false", true, false,
            "Not use or use feature induction", null);

    static CommandOption.Boolean clusterFeatureInductionOption = new CommandOption.Boolean(
            TUI.class, "cluster-feature-induction", "true|false", true, false,
            "Cluster in feature induction", null);

    /** True enables the first-mention feature pipe (the old help text said the opposite). */
    static CommandOption.Boolean useFirstMentionFeatureOption = new CommandOption.Boolean(
            TUI.class, "use-firstmention-feature", "true|false", true, false,
            "Use first-mention feature", null);

    static CommandOption.Boolean useDocHeaderFeatureOption = new CommandOption.Boolean(
            TUI.class, "use-docheader-feature", "true|false", true, false, "", null);

    static CommandOption.Boolean includeConllLexiconsOption = new CommandOption.Boolean(
            TUI.class, "include-conll-lexicons", "true|false", true, false, "", null);

    static CommandOption.Boolean charNGramsOption = new CommandOption.Boolean(
            TUI.class, "char-ngrams", "true|false", true, false, "", null);

    /** Offset conjunctions, written with square brackets, e.g. {@code [[-1],[1]]}. */
    static CommandOption.String offsetsOption = new CommandOption.String(
            TUI.class, "offsets", "e.g. [[0,0],[1]]", true, "[[-2],[-1],[1],[2]]",
            "Offset conjunctions", null);

    /** Second set of offset conjunctions; the empty string disables it. */
    static CommandOption.String capOffsetsOption = new CommandOption.String(
            TUI.class, "cap-offsets", "e.g. [[0,0],[0,1]]", true, "",
            "Offset conjunctions applied to features that are [A-Z]*", null);

    static CommandOption.String viterbiFilePrefixOption = new CommandOption.String(
            TUI.class, "viterbi-file", "FILE", true, "TUI",
            "Filename in which to store most recent Viterbi output", null);

    /**
     * Options handed to {@link CommandOption.List#process} by {@link #main}.
     *
     * <p>{@code charNGramsOption} was missing from this list although {@code main} reads it;
     * it is appended at the end so the order of the existing entries is unchanged.
     * NOTE(review): this presumes the list is what makes an option settable from the command
     * line - confirm against {@code CommandOption.List}.
     */
    static final CommandOption.List commandOptions = new CommandOption.List(
            "Training and testing a CoNLL-2003 named-entity tagger.",
            new CommandOption[]{
                gaussianVarianceOption, hyperbolicSlopeOption, hyperbolicSharpnessOption,
                randomSeedOption, labelGramOption, wordWindowFeatureOption,
                useHyperbolicPriorOption, useFeatureInductionOption, clusterFeatureInductionOption,
                useFirstMentionFeatureOption, useDocHeaderFeatureOption, includeConllLexiconsOption,
                offsetsOption, capOffsetsOption, viterbiFilePrefixOption, useTestbOption,
                charNGramsOption});

    // ---------------------------------------------------------------------------------
    // Miscellaneous state kept from the original class (not read by main()).
    // ---------------------------------------------------------------------------------

    int numEvaluations = 0;
    static int iterationsBetweenEvals = 16;
    static boolean doingFeatureInduction = true;
    static boolean doingClusteredFeatureInduction = false;

    // ---------------------------------------------------------------------------------
    // Character classes used to build the token regexes.
    //
    // The decompiled source held 14 U+FFFD replacement characters where each run of
    // accented letters belongs, i.e. the letters were lost to an encoding mismatch.
    // They are spelled with \\u escapes here so the source file encoding no longer matters.
    // NOTE(review): the exact letter set (A/E/I/O/U with acute and grave, C-cedilla,
    // N-tilde, I/U-diaeresis) is a reconstruction - confirm against the upstream source.
    // ---------------------------------------------------------------------------------

    private static final String ACCENTED_CAPS =
            "\u00C1\u00C9\u00CD\u00D3\u00DA\u00C0\u00C8\u00CC\u00D2\u00D9\u00C7\u00D1\u00CF\u00DC";
    private static final String ACCENTED_LOW =
            "\u00E0\u00E8\u00EC\u00F2\u00F9\u00E1\u00E9\u00ED\u00F3\u00FA\u00E7\u00F1\u00EF\u00FC";

    private static String CAPS = "[A-Z" + ACCENTED_CAPS + "]";
    private static String LOW = "[a-z" + ACCENTED_LOW + "]";
    private static String CAPSNUM = "[A-Z" + ACCENTED_CAPS + "0-9]";
    private static String ALPHA = "[A-Z" + ACCENTED_CAPS + "a-z" + ACCENTED_LOW + "]";
    private static String ALPHANUM = "[A-Z" + ACCENTED_CAPS + "a-z" + ACCENTED_LOW + "0-9]";
    private static String PUNT = "[,\\.;:?!()]";
    private static String QUOTE = "[\"`']";

    /** Line pattern passed to {@link LineGroupIterator} for both data files. */
    private static final Pattern DOCUMENT_BOUNDARY = Pattern.compile("^.DOCSTART. .X. .X. .$");

    // Compiler-synthesised members recovered by the decompiler; nothing in this class
    // depends on them any more.
    static Class class$edu$umass$cs$mallet$share$mccallum$ner$TUI = TUI.class;
    static final boolean $assertionsDisabled = !TUI.class.desiredAssertionStatus();

    /**
     * Trains and evaluates the tagger.
     *
     * <p>Data is read from {@code <home>/research/data/ie/ner2003/} and, when
     * {@code include-conll-lexicons} is set, lexicons from
     * {@code <home>/research/data/resources/conll/}.
     *
     * @param strArr command-line arguments, processed by {@link #commandOptions}
     * @throws FileNotFoundException if a data or lexicon file cannot be opened
     * @throws IllegalStateException if {@code label-gram} is neither 1 nor 2
     * @throws Exception for any other failure raised by option parsing, offset evaluation
     *     or reading
     */
    public static void main(String[] strArr) throws FileNotFoundException, Exception {
        commandOptions.process(strArr);

        String home = resolveHomeDirectory();
        String lexiconDir = home + "/research/data/resources/";
        String dataDir = home + "/research/data/ie/ner2003/";

        int[][] offsets = parseOffsets(offsetsOption.value);
        int[][] capOffsets = null;
        if (capOffsetsOption.value.length() > 0) {
            capOffsets = parseOffsets(capOffsetsOption.value);
        }

        SerialPipes pipe = buildFeaturePipe(lexiconDir, offsets, capOffsets);

        InstanceList trainingData = readInstances(pipe, dataDir + "eng.train");
        System.out.println("Read " + trainingData.size() + " training instances");
        InstanceList testingData =
                readInstances(pipe, dataDir + (useTestbOption.value ? "eng.testb" : "eng.testa"));

        Alphabet targets = pipe.getTargetAlphabet();
        System.out.print("State labels:");
        for (int i = 0; i < targets.size(); i++) {
            System.out.print(" " + targets.lookupObject(i));
        }
        System.out.println("");
        System.out.println("Number of features = " + pipe.getDataAlphabet().size());

        CRF3 crf = createCrf(pipe, trainingData);

        System.out.println("Training on " + trainingData.size() + " training instances, "
                + testingData.size() + " testing instances...");
        MultiSegmentationEvaluator evaluator = new MultiSegmentationEvaluator(
                new String[]{"B-PER", "B-LOC", "B-ORG", "B-MISC"},
                new String[]{"I-PER", "I-LOC", "I-ORG", "I-MISC"});
        evaluator.setViterbiOutputFilePrefix(viterbiFilePrefixOption.value);

        train(crf, trainingData, testingData, evaluator);
    }

    /**
     * Returns the directory under which the data and lexicons are looked up.
     *
     * <p>The {@code HOME} system property is honoured first, as before. It is normally
     * unset (HOME is an environment variable, not a JVM property), which previously
     * produced paths starting with the text {@code "null"}; in that case the standard
     * {@code user.home} property is used instead.
     */
    private static String resolveHomeDirectory() {
        String home = System.getProperty("HOME");
        if (home == null) {
            home = System.getProperty("user.home");
        }
        return home;
    }

    /**
     * Turns a bracketed offset specification such as {@code [[-1],[1]]} into an
     * {@code int[][]} by rewriting the brackets as braces and evaluating
     * {@code new int[][] {...}} with the interpreter supplied by {@link CommandOption}.
     */
    private static int[][] parseOffsets(String spec) throws Exception {
        String braces = spec.replace('[', '{').replace(']', '}');
        return (int[][]) CommandOption.getInterpreter().eval("new int[][] " + braces);
    }

    /**
     * Builds the lexicon stage: the four CoNLL lexicon pipes when
     * {@code include-conll-lexicons} is set, otherwise a {@link Noop}.
     */
    private static Pipe buildLexiconPipe(String lexiconDir) throws Exception {
        if (!includeConllLexiconsOption.value) {
            return new Noop();
        }
        String[] lexiconNames = {"CONLLTWOPER", "CONLLTWOLOC", "CONLLTWOORG", "CONLLTWOMISC"};
        Pipe[] lexicons = new Pipe[lexiconNames.length];
        for (int i = 0; i < lexiconNames.length; i++) {
            lexicons[i] = new TrieLexiconMembership(new File(lexiconDir + "conll/" + lexiconNames[i]));
        }
        return new SerialPipes(lexicons);
    }

    /**
     * Assembles the full feature pipeline. Disabled optional stages are filled with
     * {@link Noop} so the pipeline always has the same number of stages.
     *
     * @param lexiconDir directory prefix (ending in '/') holding the {@code conll/} lexicons
     * @param offsets conjunction offsets, always applied
     * @param capOffsets second set of conjunction offsets, or {@code null} for none
     */
    private static SerialPipes buildFeaturePipe(String lexiconDir, int[][] offsets, int[][] capOffsets)
            throws Exception {
        Pipe lexiconPipe = buildLexiconPipe(lexiconDir);
        int window = wordWindowFeatureOption.value;

        Pipe[] stages = new Pipe[]{
            new ConllNer2003Sentence2TokenSequence(),
            new RegexMatches("INITCAP", Pattern.compile(CAPS + ".*")),
            new RegexMatches("CAPITALIZED", Pattern.compile(CAPS + LOW + "*")),
            new RegexMatches("ALLCAPS", Pattern.compile(CAPS + "+")),
            new RegexMatches("MIXEDCAPS", Pattern.compile("[A-Z][a-z]+[A-Z][A-Za-z]*")),
            new RegexMatches("CONTAINSDIGITS", Pattern.compile(".*[0-9].*")),
            new RegexMatches("ALLDIGITS", Pattern.compile("[0-9]+")),
            new RegexMatches("NUMERICAL", Pattern.compile("[-0-9]+[\\.,]+[0-9\\.,]+")),
            new RegexMatches("MULTIDOTS", Pattern.compile("\\.\\.+")),
            new RegexMatches("ENDSINDOT", Pattern.compile("[^\\.]+.*\\.")),
            new RegexMatches("CONTAINSDASH", Pattern.compile(ALPHANUM + "+-" + ALPHANUM + "*")),
            new RegexMatches("ACRO", Pattern.compile("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")),
            new RegexMatches("LONELYINITIAL", Pattern.compile(CAPS + "\\.")),
            new RegexMatches("SINGLECHAR", Pattern.compile(ALPHA)),
            new RegexMatches("CAPLETTER", Pattern.compile("[A-Z]")),
            new RegexMatches("PUNC", Pattern.compile(PUNT)),
            new RegexMatches("QUOTE", Pattern.compile(QUOTE)),
            lexiconPipe,
            new TokenText("W="),
            new OffsetConjunctions(offsets),
            capOffsets != null ? new OffsetConjunctions(capOffsets) : new Noop(),
            useFirstMentionFeatureOption.value
                    ? new FeaturesOfFirstMention("FIRSTMENTION=", Pattern.compile(CAPS + ".*"),
                            Pattern.compile("W=[^@&]+"), false)
                    : new Noop(),
            useDocHeaderFeatureOption.value ? new TokenSequenceDocHeader() : new Noop(),
            // NOTE(review): the token text stage above uses the prefix "W=", yet this
            // pattern selects "WORD=..." - possibly it never matches. Left unchanged
            // because fixing it would alter the feature set; confirm intent.
            window > 0
                    ? new FeaturesInWindow("WINDOW=", -window, window, Pattern.compile("WORD=.*"), true)
                    : new Noop(),
            charNGramsOption.value ? new TokenTextCharNGrams("CHARNGRAM=", new int[]{2, 3, 4}) : new Noop(),
            new PrintTokenSequenceFeatures(),
            new TokenSequence2FeatureVectorSequence(true, true)
        };
        return new SerialPipes(stages);
    }

    /**
     * Reads one data file through {@code pipe} into a new {@link InstanceList}.
     *
     * <p>The reader is closed once the list has been filled (it used to be leaked).
     * This assumes {@code InstanceList.add} consumes the iterator before returning, which
     * is consistent with the caller printing {@code size()} immediately afterwards.
     */
    private static InstanceList readInstances(SerialPipes pipe, String path) throws Exception {
        InstanceList instances = new InstanceList(pipe);
        FileReader reader = new FileReader(new File(path));
        try {
            instances.add(new LineGroupIterator(reader, DOCUMENT_BOUNDARY, true));
        } finally {
            reader.close();
        }
        return instances;
    }

    /**
     * Creates the CRF, adds its states according to {@code label-gram}, sets the prior,
     * and gives every state whose name starts with {@code 'I'} an infinite initial cost.
     *
     * @throws IllegalStateException if {@code label-gram} is neither 1 nor 2
     */
    private static CRF3 createCrf(SerialPipes pipe, InstanceList trainingData) {
        CRF3 crf = new CRF3(pipe, (Pipe) null);

        int labelGram = labelGramOption.value;
        if (labelGram == 1) {
            crf.addStatesForLabelsConnectedAsIn(trainingData);
        } else if (labelGram == 2) {
            crf.addStatesForBiLabelsConnectedAsIn(trainingData);
        } else {
            // The old message also listed 3, which this code has never accepted.
            throw new IllegalStateException("label-gram must be 1 or 2, not " + labelGram);
        }

        if (useHyperbolicPriorOption.value) {
            crf.setUseHyperbolicPrior(true);
            crf.setHyperbolicPriorSlope(hyperbolicSlopeOption.value);
            crf.setHyperbolicPriorSharpness(hyperbolicSharpnessOption.value);
        } else {
            crf.setGaussianPriorVariance(gaussianVarianceOption.value);
        }

        for (int i = 0; i < crf.numStates(); i++) {
            Transducer.State state = crf.getState(i);
            if (state.getName().charAt(0) == 'I') {
                state.setInitialCost(Double.POSITIVE_INFINITY);
            }
        }
        return crf;
    }

    /**
     * Runs plain training, or training with feature induction when
     * {@code use-feature-induction} is set. The clustered and unclustered induction runs
     * differ only in the eighth argument (200 vs 1000) and the clustering flag.
     */
    private static void train(CRF3 crf, InstanceList trainingData, InstanceList testingData,
            MultiSegmentationEvaluator evaluator) {
        if (!useFeatureInductionOption.value) {
            crf.train(trainingData, null, testingData, evaluator, 99999, 10,
                    new double[]{0.1d, 0.2d, 0.5d, 0.7d, 0.9d});
            return;
        }
        boolean clustered = clusterFeatureInductionOption.value;
        crf.trainWithFeatureInduction(trainingData, null, testingData, evaluator, 99999, 10, 99,
                clustered ? 200 : 1000, 0.5d, clustered, new double[]{0.1d, 0.2d, 0.5d, 0.7d});
    }

    /**
     * Compiler-synthesised class lookup recovered by the decompiler.
     *
     * @param str fully qualified class name
     * @return the loaded class
     * @throws NoClassDefFoundError if the class cannot be found; the
     *     {@link ClassNotFoundException} is attached as its cause
     */
    static Class class$(String str) {
        try {
            return Class.forName(str);
        } catch (ClassNotFoundException e) {
            // initCause returns Throwable, so the error has to be built first and thrown
            // through its own variable; throwing the initCause result does not compile.
            NoClassDefFoundError error = new NoClassDefFoundError(str);
            error.initCause(e);
            throw error;
        }
    }
}
