package edu.umass.cs.mallet.share.weili.ner.enron;

import edu.umass.cs.mallet.base.fst.CRF3;
import edu.umass.cs.mallet.base.fst.MultiSegmentationEvaluator;
import edu.umass.cs.mallet.base.fst.Transducer;
import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.pipe.PrintTokenSequenceFeatures;
import edu.umass.cs.mallet.base.pipe.SerialPipes;
import edu.umass.cs.mallet.base.pipe.TokenSequence2FeatureVectorSequence;
import edu.umass.cs.mallet.base.pipe.iterator.FileIterator;
import edu.umass.cs.mallet.base.pipe.tsf.LexiconMembership;
import edu.umass.cs.mallet.base.pipe.tsf.OffsetConjunctions;
import edu.umass.cs.mallet.base.pipe.tsf.RegexMatches;
import edu.umass.cs.mallet.base.pipe.tsf.TrieLexiconMembership;
import edu.umass.cs.mallet.base.types.Alphabet;
import edu.umass.cs.mallet.base.types.InstanceList;
import edu.umass.cs.mallet.share.upenn.ner.NEPipes;
import java.io.File;
import java.io.IOException;
import java.util.Random;
import java.util.regex.Pattern;
import org.codehaus.groovy.tools.shell.util.ANSI;
import org.postgresql.jdbc2.EscapedFunctions;

/* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/share/weili/ner/enron/TUI.class */
/**
 * Command-line trainer for a CRF tagger over the Enron message corpus.
 *
 * <p>Usage: {@code TUI <FeatureInduction|NoFeatureInduction> <model-output-file> <viterbi-output-prefix>}
 *
 * <p>The program builds a feature pipeline (message tokenisation, {@link NEPipes}, several
 * groups of lexicon-membership features and offset conjunctions), loads every file found under
 * {@link #DATA_DIR}, splits the instances 80/20 with a fixed random seed, trains a {@link CRF3}
 * on the larger part while evaluating on the smaller part, and finally writes the model to the
 * file named by the second argument.
 *
 * <p>All resource locations are hard-coded absolute paths.
 */
public class TUI {
    // NOTE(review): the runs of U+FFFD below are reproduced exactly as found. They look like
    // accented letters that were lost to an encoding mismatch before this class was compiled;
    // confirm against the upstream source before replacing them.
    private static final String CAPS = "[A-Z��������������]";
    private static final String LOW = "[a-z��������������]";
    private static final String CAPSNUM = "[A-Z��������������0-9]";
    private static final String ALPHA = "[A-Z��������������a-z��������������]";
    private static final String ALPHANUM = "[A-Z��������������a-z��������������0-9]";
    private static final String PUNT = "[,\\.;:?!()]";
    private static final String QUOTE = "[\"`']";

    /** Directory prefix of the CoNLL-style dictionaries (trailing slash included). */
    private static final String CONLL_DICT_DIR = "/usr/col/tmp1/weili/Resource/conllDict/";
    /** Directory prefix of the IDF dictionaries (trailing slash included). */
    private static final String IDF_DICT_DIR = "/usr/col/tmp1/weili/Resource/idfDict/";
    /** File handed to {@link NEPipes}. */
    private static final String PLACES_FILE = "/usr/col/tmp1/weili/Resource/places";
    /** Root directory scanned for training/testing messages. */
    private static final String DATA_DIR = "/usr/can/tmp3/weili/NER/Enron/data";

    private static final String MODE_FEATURE_INDUCTION = "FeatureInduction";
    private static final String MODE_NO_FEATURE_INDUCTION = "NoFeatureInduction";

    /** Lexicon files (relative to {@link #CONLL_DICT_DIR}) of the first lexicon group. */
    private static final String[] CONLL_LEXICONS = {
        "conll/CONLLTWOPER",
        "conll/CONLLTWOLOC",
        "conll/CONLLTWOORG",
        "conll/CONLLTWOMISC",
    };

    /** Lexicon files (relative to {@link #CONLL_DICT_DIR}) of the second lexicon group. */
    private static final String[] GOOGLESETS_LEXICONS = {
        "googlesets/ORGSOCCER",
        "googlesets/ORGGOVT",
        "googlesets/ORGNGO",
        "googlesets/ORGMILITARY",
        "googlesets/ORGCOMPANY",
        "googlesets/ORGBANK",
        "googlesets/ORGTRADE",
        "googlesets/ORGNEWS",
        "googlesets/ORGOPERATINGSYSTEM",
        "googlesets/ORGPOLITICALPARTY",
        "googlesets/ORGTRAVEL",
        "googlesets/ORGBASEBALLTEAMAUGF",
        "googlesets/ORGCARMODEL",
        "googlesets/ORGCARCOMPANY",
        "googlesets/ORGENGLISHCOUNTYAUG",
        "googlesets/ORGUNIVERSITY",
        "googlesets/MISCNATIONALITYAUGF",
        "googlesets/MISCDISEASEAUG",
        "googlesets/MISCTIME",
        "googlesets/MISCAWARDS",
        "googlesets/MISCMOVIESAUGF",
        "googlesets/MISCPOLITICALPARTY",
        "googlesets/MISCRELIGION",
        "googlesets/MISCGOVT",
        "googlesets/MISCWAR",
        "googlesets/MISCCURRENCY",
        "googlesets/LOC",
        "googlesets/PERFL",
        "googlesets/MISCF",
        "googlesets/ORGFRAWEDITEDSORTED",
    };

    /**
     * Program entry point.
     *
     * @param strArr {@code [0]} training mode, either {@code "FeatureInduction"} or
     *               {@code "NoFeatureInduction"}; {@code [1]} file the trained model is written
     *               to; {@code [2]} prefix handed to
     *               {@link MultiSegmentationEvaluator#setViterbiOutputFilePrefix}
     * @throws IOException declared by the original entry point; propagated from the lexicon
     *                     loading and pipeline construction code
     */
    public static void main(String[] strArr) throws IOException {
        // Validate the command line before any resource is loaded: previously a missing
        // argument or a misspelt mode was only detected after the whole corpus had been read.
        if (strArr == null || strArr.length < 3) {
            System.err.println("Usage: TUI <FeatureInduction|NoFeatureInduction>"
                    + " <model-output-file> <viterbi-output-prefix>");
            System.exit(1);
            return;
        }
        boolean induceFeatures = MODE_FEATURE_INDUCTION.equals(strArr[0]);
        if (!induceFeatures && !MODE_NO_FEATURE_INDUCTION.equals(strArr[0])) {
            System.err.println("Feature induction or not? Give me a choice.");
            System.exit(1);
            return;
        }

        SerialPipes pipeline = buildPipeline();

        InstanceList allInstances = new InstanceList(pipeline);
        allInstances.add(new FileIterator(DATA_DIR, FileIterator.STARTING_DIRECTORIES));
        // Fixed seed, so the 80/20 partition is the same on every run.
        InstanceList[] parts = allInstances.split(new Random(1L), new double[]{0.8d, 0.2d});
        InstanceList training = parts[0];
        InstanceList testing = parts[1];

        Alphabet targetAlphabet = pipeline.getTargetAlphabet();
        System.out.print("State labels:");
        for (int i = 0; i < targetAlphabet.size(); i++) {
            System.out.print(" " + targetAlphabet.lookupObject(i));
        }
        System.out.println("");
        System.out.println("Number of features = " + pipeline.getDataAlphabet().size());

        CRF3 crf = new CRF3(pipeline, (Pipe) null);
        crf.addStatesForThreeQuarterLabelsConnectedAsIn(training);
        crf.setGaussianPriorVariance(100.0d);
        // Give every state an infinite initial cost, then zero cost to "O" only
        // (presumably so that every label sequence has to start in "O").
        for (int s = 0; s < crf.numStates(); s++) {
            crf.getState(s).setInitialCost(Double.POSITIVE_INFINITY);
        }
        crf.getState("O").setInitialCost(Transducer.ZERO_COST);

        System.out.println("Training on " + training.size() + " training instances.");
        MultiSegmentationEvaluator evaluator = new MultiSegmentationEvaluator(
                new String[]{"B-DATE", "B-TIME", "B-LOCATION", "B-PERSON", "B-ORGANIZATION",
                    "B-ACRONYM", "B-PHONE", "B-MONEY", "B-PERCENT"},
                new String[]{"I-DATE", "I-TIME", "I-LOCATION", "I-PERSON", "I-ORGANIZATION",
                    "I-ACRONYM", "I-PHONE", "I-MONEY", "I-PERCENT"});
        evaluator.setViterbiOutputFilePrefix(strArr[2]);

        // The numeric training arguments are carried over unchanged from the original code.
        if (induceFeatures) {
            crf.trainWithFeatureInduction(training, null, testing, evaluator,
                    99999, 10, 60, 500, 0.5d, false, new double[]{0.1d, 0.2d, 0.5d, 0.7d});
        } else {
            crf.train(training, null, testing, evaluator,
                    99999, 10, new double[]{0.1d, 0.2d, 0.5d, 0.7d, 0.9d});
        }
        crf.write(new File(strArr[1]));
    }

    /**
     * Assembles the complete instance pipeline. Sub-pipes are constructed in the same order as
     * in the original code, so a missing resource file is reported in the same order as before.
     */
    private static SerialPipes buildPipeline() throws IOException {
        SerialPipes conllLexicons = new SerialPipes(unnamedTries(CONLL_DICT_DIR, CONLL_LEXICONS));
        SerialPipes googleLexicons =
                new SerialPipes(unnamedTries(CONLL_DICT_DIR, GOOGLESETS_LEXICONS));
        SerialPipes fixedLexicons = buildFixedLexicons();
        SerialPipes idfLexicons = buildIdfLexicons();

        // NOTE(review): the original code built this regex pipe and discarded it; it is not
        // part of the pipeline below. The construction is kept so runtime behaviour stays the
        // same -- confirm whether it was meant to be wired in or can be deleted.
        buildSpellingFeatures();

        return new SerialPipes(new Pipe[]{
            new EnronMessage2TokenSequence(),
            new NEPipes(new File(PLACES_FILE)),
            conllLexicons,
            googleLexicons,
            fixedLexicons,
            idfLexicons,
            // Two single-offset conjunctions (-1 and +1). The decompiled source spelled this
            // as an int[] holding int[] elements, which does not compile.
            new OffsetConjunctions(new int[][]{{-1}, {1}}),
            new PrintTokenSequenceFeatures(),
            new TokenSequence2FeatureVectorSequence(true, true),
        });
    }

    /** Builds the third lexicon group: named person/date/place/company lexicons. */
    private static SerialPipes buildFixedLexicons() throws IOException {
        return new SerialPipes(new Pipe[]{
            lexicon("FIRSTHIGHEST", "personname/ssdi.prfirsthighest"),
            lexicon("FIRSTHIGH", "personname/ssdi.prfirsthigh"),
            lexicon("FIRSTMED", "personname/ssdi.prfirstmed"),
            lexicon("FIRSTLOW", "personname/ssdi.prfirstlow"),
            lexicon("LASTHIGHEST", "personname/ssdi.prlasthighest"),
            lexicon("LASTHIGH", "personname/ssdi.prlasthigh"),
            lexicon("LASTMED", "personname/ssdi.prlastmed"),
            lexicon("LASTLOW", "personname/ssdi.prlastlow"),
            lexicon("HONORIFIC", "personname/honorifics"),
            lexicon("NAMESUFFIX", "personname/namesuffixes"),
            lexicon("NAMEPARTICLE", "personname/name-particles"),
            // "DAY" and "MONTH" were rendered by the decompiler as constants borrowed from an
            // unrelated JDBC class that happen to have the same values; plain literals are used.
            lexicon("DAY", "days"),
            lexicon("MONTH", "months"),
            lexicon("PLACESUFFIX", "place-suffixes"),
            namedTrie("COUNTRY", CONLL_DICT_DIR, "countries"),
            namedTrie("COUNTRYCAPITAL", CONLL_DICT_DIR, "country-capitals"),
            namedTrie("USSTATE", CONLL_DICT_DIR, "US-states"),
            namedTrie("COMPANYNAME", CONLL_DICT_DIR, "company-names"),
            namedTrie("COMPANYSUFFIX", CONLL_DICT_DIR, "company-suffixes"),
            namedTrie("CONTINENT", CONLL_DICT_DIR, "continents"),
            lexicon("STOPWORD", "stopwords"),
            new TrieLexiconMembership(new File(CONLL_DICT_DIR + "biz.yahoo/COMPANYNAME.ABBREV")),
            new TrieLexiconMembership(new File(CONLL_DICT_DIR + "utexas/UNIVERSITIES")),
        });
    }

    /** Builds the fourth lexicon group from the files under {@link #IDF_DICT_DIR}. */
    private static SerialPipes buildIdfLexicons() throws IOException {
        return new SerialPipes(new Pipe[]{
            namedTrie("IDF_DES", IDF_DICT_DIR, "designator.data"),
            namedTrie("IDF_FIR", IDF_DICT_DIR, "firstnames.data"),
            namedTrie("IDF_LOC", IDF_DICT_DIR, "locations.data"),
            namedTrie("IDF_NAT", IDF_DICT_DIR, "nations.data"),
            namedTrie("IDF_ABB", IDF_DICT_DIR, "non-final-abbrevs.data"),
            namedTrie("IDF_ORG", IDF_DICT_DIR, "organization.data"),
            namedTrie("IDF_PER", IDF_DICT_DIR, "person.data"),
        });
    }

    /** Builds the regex-based token-shape pipe (capitalisation, digits, punctuation, ...). */
    private static SerialPipes buildSpellingFeatures() {
        return new SerialPipes(new Pipe[]{
            new RegexMatches("INITCAP", Pattern.compile(CAPS + ".*")),
            new RegexMatches("CAPITALIZED", Pattern.compile(CAPS + LOW + "*")),
            new RegexMatches("ALLCAPS", Pattern.compile(CAPS + "+")),
            new RegexMatches("MIXEDCAPS", Pattern.compile("[A-Z][a-z]+[A-Z][A-Za-z]*")),
            new RegexMatches("CONTAINSDIGITS", Pattern.compile(".*[0-9].*")),
            new RegexMatches("ALLDIGITS", Pattern.compile("[0-9]+")),
            new RegexMatches("NUMERICAL", Pattern.compile("[-0-9]+[\\.,]+[0-9\\.,]+")),
            new RegexMatches("MULTIDOTS", Pattern.compile("\\.\\.+")),
            new RegexMatches("ENDSINDOT", Pattern.compile("[^\\.]+.*\\.")),
            new RegexMatches("CONTAINSDASH", Pattern.compile(ALPHANUM + "+-" + ALPHANUM + "*")),
            new RegexMatches("ACRO", Pattern.compile("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")),
            new RegexMatches("LONELYINITIAL", Pattern.compile(CAPS + "\\.")),
            new RegexMatches("SINGLECHAR", Pattern.compile(ALPHA)),
            new RegexMatches("CAPLETTER", Pattern.compile("[A-Z]")),
            new RegexMatches("PUNC", Pattern.compile(PUNT)),
            new RegexMatches("QUOTE", Pattern.compile(QUOTE)),
        });
    }

    /** Creates one single-argument {@link TrieLexiconMembership} per path, in array order. */
    private static Pipe[] unnamedTries(String dir, String[] relativePaths) throws IOException {
        Pipe[] pipes = new Pipe[relativePaths.length];
        for (int i = 0; i < relativePaths.length; i++) {
            pipes[i] = new TrieLexiconMembership(new File(dir + relativePaths[i]));
        }
        return pipes;
    }

    /**
     * Creates a named {@link LexiconMembership} for a file under {@link #CONLL_DICT_DIR}. The
     * third constructor argument is passed as {@code true}, as in the original code (presumably
     * a case-insensitivity flag -- confirm against the MALLET API).
     */
    private static Pipe lexicon(String featureName, String relativePath) throws IOException {
        return new LexiconMembership(featureName, new File(CONLL_DICT_DIR + relativePath), true);
    }

    /**
     * Creates a named {@link TrieLexiconMembership} for {@code dir + relativePath}, passing
     * {@code true} as the third constructor argument exactly as the original code did.
     */
    private static Pipe namedTrie(String featureName, String dir, String relativePath)
            throws IOException {
        return new TrieLexiconMembership(featureName, new File(dir + relativePath), true);
    }
}
