package edu.umass.cs.mallet.projects.seg_plus_coref.ie;

import bsh.EvalError;
import edu.stanford.nlp.pipeline.CleanXmlAnnotator;
import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;
import edu.umass.cs.mallet.base.fst.CRF4;
import edu.umass.cs.mallet.base.fst.InstanceAccuracyEvaluator;
import edu.umass.cs.mallet.base.fst.PerClassAccuracyEvaluator;
import edu.umass.cs.mallet.base.fst.TokenAccuracyEvaluator;
import edu.umass.cs.mallet.base.pipe.Input2CharSequence;
import edu.umass.cs.mallet.base.pipe.Noop;
import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.pipe.SGML2TokenSequence;
import edu.umass.cs.mallet.base.pipe.SerialPipes;
import edu.umass.cs.mallet.base.pipe.Target2LabelSequence;
import edu.umass.cs.mallet.base.pipe.TokenSequence2FeatureVectorSequence;
import edu.umass.cs.mallet.base.pipe.iterator.AbstractPipeInputIterator;
import edu.umass.cs.mallet.base.pipe.iterator.LineGroupIterator;
import edu.umass.cs.mallet.base.pipe.tsf.LexiconMembership;
import edu.umass.cs.mallet.base.pipe.tsf.OffsetConjunctions;
import edu.umass.cs.mallet.base.pipe.tsf.RegexMatches;
import edu.umass.cs.mallet.base.pipe.tsf.TokenText;
import edu.umass.cs.mallet.base.types.Alphabet;
import edu.umass.cs.mallet.base.types.Instance;
import edu.umass.cs.mallet.base.types.InstanceList;
import edu.umass.cs.mallet.base.types.Sequence;
import edu.umass.cs.mallet.base.types.Token;
import edu.umass.cs.mallet.base.types.TokenSequence;
import edu.umass.cs.mallet.base.util.CharSequenceLexer;
import edu.umass.cs.mallet.base.util.CommandOption;
import edu.umass.cs.mallet.base.util.MalletLogger;
import edu.umass.cs.mallet.projects.seg_plus_coref.BaseTUICRF;
import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.LineGroupIterator2;
import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.SGML2FieldsPipe;
import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.Citation;
import gnu.trove.TIntArrayList;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.commons.cli.HelpFormatter;
import pl.edu.icm.cermine.bibref.model.BibEntry;

/* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/ie/TUI_CorefIE.class */
public class TUI_CorefIE extends BaseTUICRF {
    // Record-separator regexes; main() picks one by indexing with headOrRefOption.value().
    private static String[] SEPARATORS = {"<NEW_HEADER>", "<NEWREFERENCE>"};
    // Declared final without an initializer, so it is assigned in the class's static initializer.
    private static final Logger logger;

    // Command-line options. main() registers this class with a CommandOption.List and then reads
    // the parsed values from these fields.
    // NOTE(review): their names, defaults and help text are presumably set up in the static
    // initializer, which is not fully visible here — confirm before relying on any default.
    static CommandOption.File crfInputFileOption;
    static CommandOption.File inputFileOption;
    static CommandOption.Integer headOrRefOption;
    static CommandOption.Integer nBestChoice;
    static CommandOption.Boolean includeBibtexLexicons;
    static CommandOption.Boolean excludingSingletons;
    static CommandOption.Boolean useClusterFeatures;
    static CommandOption.Boolean useNegativeClusterFeatures;
    static CommandOption.Boolean useNumClusterOccurences;
    static CommandOption.Boolean useBogusClusterFeatures;
    static CommandOption.Boolean useSparseWeights;
    static CommandOption.Integer clusterFeatureMinimum;
    static CommandOption.Integer clusterSizeLimit;
    static CommandOption.Integer methodChoice;
    static CommandOption.Integer markovOrder;
    static CommandOption.Integer numRepsOption;

    // Passed to SGML2FieldsPipe in getInstanceListClusters(); clusterNoMeta is also the
    // instance-property key used there to group instances into clusters.
    static String refNoMeta;
    static String clusterNoMeta;
    // Field (label) names; the cluster feature pipes index per-field arrays by position in this array.
    static String[] FIELD_NAMES;
    static String[] startTags;
    static String[] endTags;
    static double[] tagWeight;
    static int NumFields;
    // Separator regex chosen from SEPARATORS at the start of main().
    private static String separator;
    // Regex fragments combined into the token-shape patterns built by makeBasePipe().
    private static String CAPS;
    private static String ALPHA;
    private static String ALPHANUM;
    private static String PUNT;
    // Directory from which makeBasePipe() loads the lexicon_* files.
    private static String bibtexLexDir;
    // NOTE(review): not referenced in the visible code; the cluster pipes compare against a
    // literal 3 instead — presumably inlined by the compiler, confirm.
    private static final int MIN_CLUSTER_SIZE = 3;
    // Caches for the Class objects looked up through class$(String).
    static Class class$edu$umass$cs$mallet$projects$seg_plus_coref$ie$TUI_CorefIE;
    static Class class$edu$umass$cs$mallet$projects$seg_plus_coref$BaseTUICRF;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* renamed from: edu.umass.cs.mallet.projects.seg_plus_coref.ie.TUI_CorefIE$1, reason: invalid class name */
    /* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/ie/TUI_CorefIE$1.class */
    /**
     * Empty marker type with no members. In the visible code it is used only as the parameter
     * type of the non-private {@code AddClusterPropertyPipe} constructor.
     * NOTE(review): presumably a compiler-generated accessor class recovered by the decompiler — confirm.
     */
    public static class AnonymousClass1 {
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/ie/TUI_CorefIE$AddClusterPropertyPipe.class */
    /**
     * Pipe that records an instance's source object under the {@code "CLUSTER"} property so that
     * later stages can retrieve it with {@code getProperty("CLUSTER")}.
     */
    public static class AddClusterPropertyPipe extends Pipe {
        /**
         * Accessor constructor; the argument is ignored and only selects this overload.
         */
        AddClusterPropertyPipe(AnonymousClass1 anonymousClass1) {
            this();
        }

        private AddClusterPropertyPipe() {
        }

        /**
         * Copies {@code instance.getSource()} into the {@code "CLUSTER"} property.
         *
         * @param instance the instance being piped
         * @return the same instance, with the property set
         */
        @Override // edu.umass.cs.mallet.base.pipe.Pipe
        public Instance pipe(Instance instance) {
            Object source = instance.getSource();
            instance.setProperty("CLUSTER", source);
            return instance;
        }
    }

    /* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/ie/TUI_CorefIE$AllClusterSegmentation.class */
    /**
     * Lookup tables, keyed by instance name, giving each instance's {@link Segmentation}
     * (its piped data and target) and the cluster stored in its {@code "CLUSTER"} property.
     */
    public static class AllClusterSegmentation {
        Map inst2segmentation = new HashMap();
        Map inst2cluster = new HashMap();

        /**
         * Runs every member of every cluster through {@code pipe} and indexes the results.
         *
         * @param instanceList list whose instances each carry a cluster (an InstanceList) as data
         * @param pipe         pipe applied to each cluster member; it must yield a TokenSequence as
         *                     data, a Sequence as target and set the {@code "CLUSTER"} property
         */
        public AllClusterSegmentation(InstanceList instanceList, Pipe pipe) {
            InstanceList piped = new InstanceList(pipe);
            piped.add(new ClusterListIterator(instanceList));
            for (InstanceList.Iterator cursor = piped.iterator(); cursor.hasNext(); ) {
                Instance member = (Instance) cursor.next();
                TokenSequence tokens = (TokenSequence) member.getData();
                Sequence labels = (Sequence) member.getTarget();
                this.inst2segmentation.put(member.getName(), new Segmentation(tokens, labels));
                InstanceList owner = (InstanceList) member.getProperty("CLUSTER");
                this.inst2cluster.put(member.getName(), owner);
            }
        }

        /** Returns the cluster recorded for the instance with the same name, or null if unknown. */
        public InstanceList getCluster(Instance instance) {
            Object cluster = this.inst2cluster.get(instance.getName());
            return (InstanceList) cluster;
        }

        /** Returns the segmentation recorded for the instance with the same name, or null if unknown. */
        public Segmentation getSegmentation(Instance instance) {
            Object found = this.inst2segmentation.get(instance.getName());
            return (Segmentation) found;
        }

        /** Dumps every indexed instance with its segmentation and cluster to standard output. */
        public void print() {
            Iterator names = this.inst2cluster.keySet().iterator();
            while (names.hasNext()) {
                Object name = names.next();
                Segmentation seg = (Segmentation) this.inst2segmentation.get(name);
                InstanceList cluster = (InstanceList) this.inst2cluster.get(name);
                System.out.println("Instance " + name + "\n  " + seg + "\n  " + cluster);
            }
        }
    }

    /* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/ie/TUI_CorefIE$BogusClusterPipe.class */
    /**
     * Adds binary features naming the labels each token's text received anywhere in the
     * instance's cluster, including in the instance itself (prefixed {@code I_AM_TAGGED_AS_}).
     */
    public static class BogusClusterPipe extends Pipe {
        private static Pattern EXCLUDE = Pattern.compile("\\p{Punct}");
        private AllClusterSegmentation segmentation;

        BogusClusterPipe(AllClusterSegmentation allClusterSegmentation) {
            this.segmentation = allClusterSegmentation;
        }

        /**
         * For every cluster member and every token whose text is not a single punctuation
         * character, sets feature {@code <prefix><label>} to 1.0 if it is currently 0.0.
         *
         * @param instance instance whose data is a TokenSequence
         * @return the same instance with features added to its tokens
         */
        @Override // edu.umass.cs.mallet.base.pipe.Pipe
        public Instance pipe(Instance instance) {
            TokenSequence tokens = (TokenSequence) instance.getData();
            InstanceList cluster = this.segmentation.getCluster(instance);
            for (InstanceList.Iterator members = cluster.iterator(); members.hasNext(); ) {
                Instance member = (Instance) members.next();
                String prefix;
                if (member.getName().equals(instance.getName())) {
                    prefix = "I_AM_TAGGED_AS_";
                } else {
                    prefix = "TAGGED_AS_";
                }
                Segmentation memberSegmentation = this.segmentation.getSegmentation(member);
                for (int pos = 0; pos < tokens.size(); pos++) {
                    Token token = tokens.getToken(pos);
                    String word = token.getText();
                    if (EXCLUDE.matcher(word).matches()) {
                        continue;
                    }
                    String[] labels = memberSegmentation.fieldNamesForWord(word);
                    for (int k = 0; k < labels.length; k++) {
                        String feature = (prefix + labels[k]).intern();
                        if (token.getFeatureValue(feature) == 0.0d) {
                            token.setFeatureValue(feature, 1.0d);
                        }
                    }
                }
            }
            return instance;
        }
    }

    /* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/ie/TUI_CorefIE$ClusterListIterator.class */
    /**
     * Flattens a list of clusters into a stream of their member instances. Each element of the
     * wrapped list is an Instance whose data is an InstanceList (one cluster); every member
     * returned has its source set to the cluster it came from.
     *
     * <p>Empty clusters, and an entirely empty cluster list, are skipped rather than causing a
     * {@code NoSuchElementException} during construction or after {@code hasNext()} returned true.
     */
    public static class ClusterListIterator extends AbstractPipeInputIterator {
        private Iterator perClusterIterator;
        private Iterator withinClusterIterator;
        private InstanceList currentCluster;

        /**
         * @param instanceList list of cluster instances; may be empty
         */
        public ClusterListIterator(InstanceList instanceList) {
            this.perClusterIterator = instanceList.iterator();
            skipExhaustedClusters();
        }

        /**
         * Returns the next cluster member, tagging it with its cluster as source.
         *
         * @throws java.util.NoSuchElementException if no members remain in any cluster
         */
        @Override // edu.umass.cs.mallet.base.pipe.iterator.AbstractPipeInputIterator, edu.umass.cs.mallet.base.pipe.iterator.PipeInputIterator
        public Instance nextInstance() {
            if (!hasNext()) {
                throw new java.util.NoSuchElementException("No more instances in any cluster");
            }
            Instance instance = (Instance) this.withinClusterIterator.next();
            instance.setSource(this.currentCluster);
            return instance;
        }

        /** Moves to the next cluster; the caller must ensure one exists. */
        private void nextCluster() {
            this.currentCluster = (InstanceList) ((Instance) this.perClusterIterator.next()).getData();
            this.withinClusterIterator = this.currentCluster.iterator();
        }

        /** Advances past exhausted or empty clusters until a member is available or clusters run out. */
        private void skipExhaustedClusters() {
            while ((this.withinClusterIterator == null || !this.withinClusterIterator.hasNext())
                    && this.perClusterIterator.hasNext()) {
                nextCluster();
            }
        }

        /**
         * @return true only if a call to {@link #nextInstance()} will succeed
         */
        @Override // edu.umass.cs.mallet.base.pipe.iterator.AbstractPipeInputIterator, java.util.Iterator
        public boolean hasNext() {
            skipExhaustedClusters();
            return this.withinClusterIterator != null && this.withinClusterIterator.hasNext();
        }
    }

    /* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/ie/TUI_CorefIE$NegativeClusterFeaturePipe.class */
    /**
     * Adds {@code NEVER_TAGGED_AS_<field>} features: for each token, one feature per field that
     * no other member of the instance's cluster assigned to that token's text.
     * Instances whose cluster has fewer than 3 members are left untouched.
     */
    public static class NegativeClusterFeaturePipe extends Pipe {
        private static Pattern EXCLUDE = Pattern.compile("\\p{Punct}");
        private AllClusterSegmentation segmentation;

        NegativeClusterFeaturePipe(AllClusterSegmentation allClusterSegmentation) {
            this.segmentation = allClusterSegmentation;
        }

        /**
         * @param instance instance whose data is a TokenSequence
         * @return the same instance, possibly with features added to its tokens
         */
        @Override // edu.umass.cs.mallet.base.pipe.Pipe
        public Instance pipe(Instance instance) {
            InstanceList cluster = this.segmentation.getCluster(instance);
            TokenSequence tokens = (TokenSequence) instance.getData();
            if (cluster.size() >= 3) {
                for (int pos = 0; pos < tokens.size(); pos++) {
                    Token token = tokens.getToken(pos);
                    String word = token.getText();
                    if (EXCLUDE.matcher(word).matches()) {
                        continue;
                    }
                    boolean[] seen = fieldsUsedByOthers(cluster, instance, word);
                    for (int field = 0; field < seen.length; field++) {
                        if (seen[field]) {
                            continue;
                        }
                        String feature = ("NEVER_TAGGED_AS_" + TUI_CorefIE.FIELD_NAMES[field]).intern();
                        if (token.getFeatureValue(feature) == 0.0d) {
                            token.setFeatureValue(feature, 1.0d);
                        }
                    }
                }
            }
            return instance;
        }

        /** Flags, per field index, whether any other cluster member tagged {@code word} with that field. */
        private boolean[] fieldsUsedByOthers(InstanceList cluster, Instance self, String word) {
            boolean[] seen = new boolean[TUI_CorefIE.FIELD_NAMES.length];
            for (InstanceList.Iterator members = cluster.iterator(); members.hasNext(); ) {
                Instance member = (Instance) members.next();
                if (member.getName().equals(self.getName())) {
                    continue;
                }
                int[] ids = this.segmentation.getSegmentation(member).fieldIdsForWord(word);
                for (int k = 0; k < ids.length; k++) {
                    seen[ids[k]] = true;
                }
            }
            return seen;
        }
    }

    /* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/ie/TUI_CorefIE$NumAppearancesInClusterPipe.class */
    /**
     * Sets {@code TAGGED_AS_<label>} to 1.0 on every token whose text was given that label by
     * some other member of the instance's cluster. The value is assigned unconditionally, so
     * repeated matches do not accumulate.
     */
    public static class NumAppearancesInClusterPipe extends Pipe {
        private static Pattern EXCLUDE = Pattern.compile("\\p{Punct}");
        private AllClusterSegmentation segmentation;

        NumAppearancesInClusterPipe(AllClusterSegmentation allClusterSegmentation) {
            this.segmentation = allClusterSegmentation;
        }

        /**
         * @param instance instance whose data is a TokenSequence
         * @return the same instance with features added to its tokens
         */
        @Override // edu.umass.cs.mallet.base.pipe.Pipe
        public Instance pipe(Instance instance) {
            TokenSequence tokens = (TokenSequence) instance.getData();
            InstanceList cluster = this.segmentation.getCluster(instance);
            for (InstanceList.Iterator members = cluster.iterator(); members.hasNext(); ) {
                Instance member = (Instance) members.next();
                if (member.getName().equals(instance.getName())) {
                    continue; // only other cluster members contribute
                }
                Segmentation memberSegmentation = this.segmentation.getSegmentation(member);
                for (int pos = 0; pos < tokens.size(); pos++) {
                    Token token = tokens.getToken(pos);
                    String word = token.getText();
                    if (EXCLUDE.matcher(word).matches()) {
                        continue;
                    }
                    String[] labels = memberSegmentation.fieldNamesForWord(word);
                    for (int k = 0; k < labels.length; k++) {
                        token.setFeatureValue(("TAGGED_AS_" + labels[k]).intern(), 1.0d);
                    }
                }
            }
            return instance;
        }
    }

    /* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/ie/TUI_CorefIE$Segmentation.class */
    /**
     * A token sequence paired with its label sequence, with lookups from a word's text to the
     * labels it carries in this segmentation.
     */
    public static class Segmentation {
        private Sequence output;
        private TokenSequence input;
        // Label that fieldIdsForWord() ignores; set to "O" in the static initializer below.
        static String background;
        static final boolean $assertionsDisabled;

        /**
         * @param tokenSequence the tokens
         * @param sequence      the labels, position-aligned with {@code tokenSequence}
         */
        public Segmentation(TokenSequence tokenSequence, Sequence sequence) {
            this.input = tokenSequence;
            this.output = sequence;
        }

        /** Returns the shared {@code TUI_CorefIE.FIELD_NAMES} array (not a copy). */
        public String[] getFieldNames() {
            return TUI_CorefIE.FIELD_NAMES;
        }

        /**
         * Returns the distinct labels (via {@code toString()}) attached to tokens whose text
         * equals {@code str}. Order is unspecified; the array is empty if the word does not occur.
         */
        public String[] fieldNamesForWord(String str) {
            HashSet hashSet = new HashSet();
            if (!$assertionsDisabled && this.input.size() != this.output.size()) {
                throw new AssertionError();
            }
            for (int i = 0; i < this.input.size(); i++) {
                if (this.input.getToken(i).getText().equals(str)) {
                    hashSet.add(this.output.get(i).toString());
                }
            }
            return (String[]) hashSet.toArray(new String[hashSet.size()]);
        }

        public String toString() {
            return new StringBuffer().append("SEGMENTATION\n  input:\n").append(this.input).append("\n  output:\n").append(this.output).toString();
        }

        /**
         * Returns the indices into {@code TUI_CorefIE.FIELD_NAMES} of the non-background labels
         * attached to {@code str}.
         *
         * <p>A label missing from {@code FIELD_NAMES} is reported on standard error and left out
         * of the result. Callers use the returned values as array subscripts, so returning -1 for
         * such a label would make them fail with {@code ArrayIndexOutOfBoundsException}.
         */
        public int[] fieldIdsForWord(String str) {
            TIntArrayList tIntArrayList = new TIntArrayList();
            String[] fieldNamesForWord = fieldNamesForWord(str);
            List asList = Arrays.asList(TUI_CorefIE.FIELD_NAMES);
            for (String str2 : fieldNamesForWord) {
                if (!str2.equals(background)) {
                    int indexOf = asList.indexOf(str2);
                    if (indexOf == -1) {
                        System.err.println(new StringBuffer().append("ERROR: Couldn't find ").append(str2).append("\n").toString());
                        continue; // never hand an invalid index to callers
                    }
                    tIntArrayList.add(indexOf);
                }
            }
            return tIntArrayList.toNativeArray();
        }

        static {
            // Assertion status is taken from the enclosing TUI_CorefIE class.
            Class cls;
            if (TUI_CorefIE.class$edu$umass$cs$mallet$projects$seg_plus_coref$ie$TUI_CorefIE == null) {
                cls = TUI_CorefIE.class$("edu.umass.cs.mallet.projects.seg_plus_coref.ie.TUI_CorefIE");
                TUI_CorefIE.class$edu$umass$cs$mallet$projects$seg_plus_coref$ie$TUI_CorefIE = cls;
            } else {
                cls = TUI_CorefIE.class$edu$umass$cs$mallet$projects$seg_plus_coref$ie$TUI_CorefIE;
            }
            $assertionsDisabled = !cls.desiredAssertionStatus();
            background = "O";
        }
    }

    /* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/ie/TUI_CorefIE$WordAppearsInAnyClusterPipe.class */
    /**
     * Adds a binary {@code TAGGED_AS_<label>} feature to every token whose text was given that
     * label by some other member of the instance's cluster.
     */
    public static class WordAppearsInAnyClusterPipe extends Pipe {
        private static Pattern EXCLUDE = Pattern.compile("\\p{Punct}");
        private AllClusterSegmentation segmentation;

        WordAppearsInAnyClusterPipe(AllClusterSegmentation allClusterSegmentation) {
            this.segmentation = allClusterSegmentation;
        }

        /**
         * @param instance instance whose data is a TokenSequence
         * @return the same instance with features added to its tokens
         */
        @Override // edu.umass.cs.mallet.base.pipe.Pipe
        public Instance pipe(Instance instance) {
            TokenSequence tokens = (TokenSequence) instance.getData();
            InstanceList cluster = this.segmentation.getCluster(instance);
            for (InstanceList.Iterator members = cluster.iterator(); members.hasNext(); ) {
                Instance member = (Instance) members.next();
                if (member.getName().equals(instance.getName())) {
                    continue; // the instance's own labels are not used
                }
                Segmentation memberSegmentation = this.segmentation.getSegmentation(member);
                for (int pos = 0; pos < tokens.size(); pos++) {
                    Token token = tokens.getToken(pos);
                    String word = token.getText();
                    if (EXCLUDE.matcher(word).matches()) {
                        continue;
                    }
                    String[] labels = memberSegmentation.fieldNamesForWord(word);
                    for (int k = 0; k < labels.length; k++) {
                        String feature = ("TAGGED_AS_" + labels[k]).intern();
                        if (token.getFeatureValue(feature) == 0.0d) {
                            token.setFeatureValue(feature, 1.0d);
                        }
                    }
                }
            }
            return instance;
        }
    }

    /* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/ie/TUI_CorefIE$WordOftenAppearsAsPipe.class */
    /**
     * Adds {@code OFTEN_TAGGED_AS_<field>} features: for each token, one feature per field that
     * at least {@code clusterFeatureMinimum} other cluster members assigned to that token's text.
     * Instances whose cluster has fewer than 3 members are left untouched.
     */
    public static class WordOftenAppearsAsPipe extends Pipe {
        private static Pattern EXCLUDE = Pattern.compile("\\p{Punct}");
        private AllClusterSegmentation segmentation;

        WordOftenAppearsAsPipe(AllClusterSegmentation allClusterSegmentation) {
            this.segmentation = allClusterSegmentation;
        }

        /**
         * @param instance instance whose data is a TokenSequence
         * @return the same instance, possibly with features added to its tokens
         */
        @Override // edu.umass.cs.mallet.base.pipe.Pipe
        public Instance pipe(Instance instance) {
            InstanceList cluster = this.segmentation.getCluster(instance);
            TokenSequence tokens = (TokenSequence) instance.getData();
            if (cluster.size() >= 3) {
                for (int pos = 0; pos < tokens.size(); pos++) {
                    Token token = tokens.getToken(pos);
                    String word = token.getText();
                    if (EXCLUDE.matcher(word).matches()) {
                        continue;
                    }
                    int[] votes = countFieldUsesByOthers(cluster, instance, word);
                    for (int field = 0; field < votes.length; field++) {
                        if (votes[field] < TUI_CorefIE.clusterFeatureMinimum.value) {
                            continue;
                        }
                        String feature = ("OFTEN_TAGGED_AS_" + TUI_CorefIE.FIELD_NAMES[field]).intern();
                        if (token.getFeatureValue(feature) == 0.0d) {
                            token.setFeatureValue(feature, 1.0d);
                        }
                    }
                }
            }
            return instance;
        }

        /** Counts, per field index, how many other cluster members tagged {@code word} with that field. */
        private int[] countFieldUsesByOthers(InstanceList cluster, Instance self, String word) {
            int[] votes = new int[TUI_CorefIE.FIELD_NAMES.length];
            for (InstanceList.Iterator members = cluster.iterator(); members.hasNext(); ) {
                Instance member = (Instance) members.next();
                if (member.getName().equals(self.getName())) {
                    continue;
                }
                int[] ids = this.segmentation.getSegmentation(member).fieldIdsForWord(word);
                for (int k = 0; k < ids.length; k++) {
                    votes[ids[k]]++;
                }
            }
            return votes;
        }
    }

    /**
     * Command-line entry point. For each of {@code numRepsOption} repetitions it reads the input
     * file, groups the records into clusters, splits the clusters into training and testing
     * sets, trains a CRF on the flattened training instances, evaluates it and writes the
     * outputs and the model.
     *
     * <p>The input reader is closed once its records have been loaded, so repeated runs do not
     * accumulate open file handles.
     *
     * @param strArr command-line arguments, parsed through the registered CommandOption fields
     * @throws Exception on I/O, option-processing or pipe-construction failures
     */
    public static void main(String[] strArr) throws Exception {
        Class cls;
        Class cls2;
        CommandOption.List list = new CommandOption.List("Segmenting references based on coreference information.", new CommandOption[0]);
        // Register the options declared on this class and on its superclass.
        if (class$edu$umass$cs$mallet$projects$seg_plus_coref$ie$TUI_CorefIE == null) {
            cls = class$("edu.umass.cs.mallet.projects.seg_plus_coref.ie.TUI_CorefIE");
            class$edu$umass$cs$mallet$projects$seg_plus_coref$ie$TUI_CorefIE = cls;
        } else {
            cls = class$edu$umass$cs$mallet$projects$seg_plus_coref$ie$TUI_CorefIE;
        }
        list.add(cls);
        if (class$edu$umass$cs$mallet$projects$seg_plus_coref$BaseTUICRF == null) {
            cls2 = class$("edu.umass.cs.mallet.projects.seg_plus_coref.BaseTUICRF");
            class$edu$umass$cs$mallet$projects$seg_plus_coref$BaseTUICRF = cls2;
        } else {
            cls2 = class$edu$umass$cs$mallet$projects$seg_plus_coref$BaseTUICRF;
        }
        list.add(cls2);
        list.process(strArr);
        initOutputDirectory();
        list.logOptions(logger);
        long currentTimeMillis = System.currentTimeMillis();
        separator = SEPARATORS[headOrRefOption.value()];
        // One generator for all repetitions, so each repetition gets a different split.
        Random random = new Random(randomSeedOption.value);
        for (int i = 0; i < numRepsOption.value; i++) {
            logger.info(new StringBuffer().append("REPETITION ").append(i).toString());
            InstanceList instanceList = new InstanceList(new Alphabet(), new Alphabet());
            FileReader inputReader = new FileReader(inputFileOption.value);
            try {
                instanceList.add(new LineGroupIterator(inputReader, Pattern.compile(separator), true));
            } finally {
                // add() has consumed the iterator; release the file handle before training starts.
                inputReader.close();
            }
            InstanceList instanceListClusters = getInstanceListClusters(instanceList, inputFileOption.value);
            InstanceList[] split = instanceListClusters.split(random, new double[]{trainingPct.value, 1.0d - trainingPct.value});
            InstanceList instanceList2 = split[0];
            InstanceList instanceList3 = split[1];
            logger.info(new StringBuffer().append("Num train clusters = ").append(instanceList2.size()).toString());
            logger.info(new StringBuffer().append("Num test clusters = ").append(instanceList3.size()).toString());
            System.out.println("Creating allclustersegmentation");
            // The cluster-feature stage consults the labels of ALL clusters (train and test).
            SerialPipes serialPipes = new SerialPipes(new Pipe[]{makeBasePipe(), makeSegmentationsPipe(new AllClusterSegmentation(instanceListClusters, makeBasePipe()), useClusterFeatures.value, useNumClusterOccurences.value, useBogusClusterFeatures.value), new TokenSequence2FeatureVectorSequence()});
            InstanceList instanceList4 = new InstanceList(serialPipes);
            instanceList4.add(new ClusterListIterator(instanceList2));
            InstanceList instanceList5 = new InstanceList(serialPipes);
            instanceList5.add(new ClusterListIterator(instanceList3));
            logger.info(new StringBuffer().append("Number of training instances = ").append(instanceList4.size()).toString());
            logger.info(new StringBuffer().append("Number of testing instances = ").append(instanceList5.size()).toString());
            CRF4 crf4 = new CRF4(serialPipes, (Pipe) null);
            crf4.setUseSparseWeights(useSparseWeights.value);
            switch (markovOrder.value) {
                case 0:
                    crf4.addStatesForLabelsConnectedAsIn(instanceList4);
                    break;
                case 1:
                    crf4.addStatesForHalfLabelsConnectedAsIn(instanceList4);
                    break;
                case 2:
                    crf4.addStatesForThreeQuarterLabelsConnectedAsIn(instanceList4);
                    break;
                default:
                    System.err.println(new StringBuffer().append("Unknown markov-order ").append(markovOrder.value).toString());
                    System.exit(1);
                    break;
            }
            // Test instances drawn only from clusters with more than two members.
            InstanceList nonTrivialTesting = getNonTrivialTesting(serialPipes, instanceList3);
            TokenAccuracyEvaluator tokenAccuracyEvaluator = new TokenAccuracyEvaluator();
            tokenAccuracyEvaluator.setNumIterationsToWait(10);
            tokenAccuracyEvaluator.setNumIterationsToSkip(5);
            crf4.train(instanceList4, null, instanceList5, tokenAccuracyEvaluator);
            FieldF1Evaluator fieldF1Evaluator = new FieldF1Evaluator(FIELD_NAMES);
            fieldF1Evaluator.test(crf4, instanceList4, "Training", null);
            fieldF1Evaluator.test(crf4, instanceList5, "Testing", null);
            fieldF1Evaluator.test(crf4, nonTrivialTesting, "Clusters>=3", null);
            PerClassAccuracyEvaluator perClassAccuracyEvaluator = new PerClassAccuracyEvaluator();
            perClassAccuracyEvaluator.test(crf4, instanceList4, "Training", null);
            perClassAccuracyEvaluator.test(crf4, instanceList5, "Testing", null);
            perClassAccuracyEvaluator.test(crf4, nonTrivialTesting, "Clusters>=3", null);
            InstanceAccuracyEvaluator instanceAccuracyEvaluator = new InstanceAccuracyEvaluator();
            instanceAccuracyEvaluator.test(crf4, instanceList4, "Training", null);
            instanceAccuracyEvaluator.test(crf4, instanceList5, "Testing", null);
            instanceAccuracyEvaluator.test(crf4, nonTrivialTesting, "Clusters>=3", null);
            writeOutput(crf4, instanceList4, new StringBuffer().append("-train-").append(i).toString());
            writeOutput(crf4, instanceList5, new StringBuffer().append("-test-").append(i).toString());
            writeOutput(crf4, nonTrivialTesting, new StringBuffer().append("test-gt-3-").append(i).toString());
            writeCrf(crf4, new StringBuffer().append(HelpFormatter.DEFAULT_OPT_PREFIX).append(i).toString());
        }
        System.out.println(new StringBuffer().append("Time elapses ").append((System.currentTimeMillis() - currentTimeMillis) / 1000.0d).append(" seconds for testing.").toString());
    }

    /**
     * Builds the list of piped instances drawn only from clusters with more than two members.
     *
     * @param pipe         pipe applied to every selected cluster member
     * @param instanceList list of cluster instances (each carrying an InstanceList as data)
     * @return the piped members of the selected clusters
     */
    private static InstanceList getNonTrivialTesting(Pipe pipe, InstanceList instanceList) {
        InstanceList largeClusters = new InstanceList(new Alphabet(), new Alphabet());
        for (InstanceList.Iterator clusters = instanceList.iterator(); clusters.hasNext(); ) {
            Instance clusterInstance = (Instance) clusters.next();
            InstanceList members = (InstanceList) clusterInstance.getData();
            if (members.size() <= 2) {
                continue;
            }
            largeClusters.add(clusterInstance);
        }
        InstanceList flattened = new InstanceList(pipe);
        flattened.add(new ClusterListIterator(largeClusters));
        return flattened;
    }

    /**
     * Builds the base feature pipeline: records the cluster property, tokenizes the SGML input,
     * adds the token-text and regex shape features, offset conjunctions, optional lexicon
     * membership features, and finally converts the target to a label sequence.
     *
     * <p>NOTE(review): {@code CleanXmlAnnotator.DEFAULT_XML_TAGS} and {@code ATBTreeUtils.puncTag}
     * come from unrelated libraries; presumably the decompiler substituted them for plain string
     * literals — confirm their values before inlining them.
     * NOTE(review): the feature name "ALLDIGITS" is registered twice with different patterns.
     *
     * @return the assembled pipe
     * @throws EvalError             declared by the original signature
     * @throws FileNotFoundException declared by the original signature; presumably raised when a
     *                               lexicon file is missing — confirm
     */
    private static Pipe makeBasePipe() throws EvalError, FileNotFoundException {
        // Patterns assembled from the class-level regex fragments.
        String initCapRegex = CAPS + CleanXmlAnnotator.DEFAULT_XML_TAGS;
        String allCapsRegex = CAPS + "+";
        String containsDashRegex = ALPHANUM + "+-" + ALPHANUM + "*";
        String lonelyInitialRegex = CAPS + "\\.";

        return new SerialPipes(new Pipe[]{
            new AddClusterPropertyPipe(null),
            new Input2CharSequence(),
            new SGML2TokenSequence(new CharSequenceLexer(CharSequenceLexer.LEX_NONWHITESPACE_CLASSES), "O"),
            new TokenText("W="),
            new RegexMatches("INITCAP", Pattern.compile(initCapRegex)),
            new RegexMatches("ALLDIGITS", Pattern.compile("[0-9]*")),
            new RegexMatches("ALLCAPS", Pattern.compile(allCapsRegex)),
            new RegexMatches("CONTAINSDIGITS", Pattern.compile(".*[0-9].*")),
            new RegexMatches("ALLDIGITS", Pattern.compile("[0-9]+")),
            new RegexMatches("PHONEORZIP", Pattern.compile("[0-9]+-[0-9]+")),
            new RegexMatches("CONTAINSDOTS", Pattern.compile("[^\\.]*\\..*")),
            new RegexMatches("CONTAINSDASH", Pattern.compile(containsDashRegex)),
            new RegexMatches("ACRO", Pattern.compile("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")),
            new RegexMatches("LONELYINITIAL", Pattern.compile(lonelyInitialRegex)),
            new RegexMatches("SINGLECHAR", Pattern.compile(ALPHA)),
            new RegexMatches("CAPLETTER", Pattern.compile(CAPS)),
            new RegexMatches(ATBTreeUtils.puncTag, Pattern.compile(PUNT)),
            new RegexMatches("URL", Pattern.compile("www\\..*|http://.*|ftp\\..*")),
            new RegexMatches("EMAIL", Pattern.compile("\\S+@\\S+|e-mail.*|email.*|Email.*")),
            new OffsetConjunctions(true, getOffsets()),
            // Lexicon features are optional; a no-op keeps the stage count fixed.
            includeBibtexLexicons.value
                ? new SerialPipes(new Pipe[]{
                    new LexiconMembership("BIBTEX_AUTHOR", new File(bibtexLexDir, "lexicon_author"), true),
                    new LexiconMembership("BIBTEX_DATE", new File(bibtexLexDir, "lexicon_date"), true),
                    new LexiconMembership("NOTES", new File(bibtexLexDir, "lexicon_note"), true),
                    new LexiconMembership("DEGREE", new File(bibtexLexDir, "lexicon_degree"), true),
                    new LexiconMembership("AFFILIATION", new File(bibtexLexDir, "lexicon_affiliation"), true)})
                : new Noop(),
            new Target2LabelSequence()});
    }

    /**
     * Assembles the five cluster-feature stages, substituting a no-op for each disabled stage.
     *
     * @param allClusterSegmentation per-instance segmentations and clusters shared by all stages
     * @param z  enables WordAppearsInAnyClusterPipe (main passes useClusterFeatures)
     * @param z2 enables NumAppearancesInClusterPipe (main passes useNumClusterOccurences)
     * @param z3 enables BogusClusterPipe (main passes useBogusClusterFeatures)
     * @return a SerialPipes over the five stages, in fixed order
     */
    private static Pipe makeSegmentationsPipe(AllClusterSegmentation allClusterSegmentation, boolean z, boolean z2, boolean z3) {
        Pipe[] stages = new Pipe[5];
        if (z) {
            stages[0] = new WordAppearsInAnyClusterPipe(allClusterSegmentation);
        } else {
            stages[0] = new Noop();
        }
        if (z2) {
            stages[1] = new NumAppearancesInClusterPipe(allClusterSegmentation);
        } else {
            stages[1] = new Noop();
        }
        if (z3) {
            stages[2] = new BogusClusterPipe(allClusterSegmentation);
        } else {
            stages[2] = new Noop();
        }
        // The "often tagged as" stage is active only when its threshold option was given explicitly.
        if (clusterFeatureMinimum.wasInvoked()) {
            stages[3] = new WordOftenAppearsAsPipe(allClusterSegmentation);
        } else {
            stages[3] = new Noop();
        }
        if (useNegativeClusterFeatures.value) {
            stages[4] = new NegativeClusterFeaturePipe(allClusterSegmentation);
        } else {
            stages[4] = new Noop();
        }
        return new SerialPipes(stages);
    }

    /**
     * Debug helper: prints every cluster's name followed by the name, data and target of each
     * of its members to standard output. A null target is shown as {@code <null>}.
     *
     * @param instanceList list of cluster instances (each carrying an InstanceList as data)
     */
    private static void printClusterList(InstanceList instanceList) {
        for (InstanceList.Iterator clusters = instanceList.iterator(); clusters.hasNext(); ) {
            Instance clusterInstance = (Instance) clusters.next();
            InstanceList members = (InstanceList) clusterInstance.getData();
            System.out.println("\n\nCLUSTER *** " + clusterInstance.getName());
            for (InstanceList.Iterator cursor = members.iterator(); cursor.hasNext(); ) {
                Instance member = (Instance) cursor.next();
                String targetText = member.getTarget() != null ? member.getTarget().toString() : "<null>";
                System.out.println("name: " + member.getName()
                        + "\ninput: " + member.getData().toString()
                        + "\ntarget: " + targetText);
            }
        }
    }

    /**
     * Groups the given instances into clusters. The file is re-read through SGML2FieldsPipe to
     * obtain each record's property list and name; these are copied onto the corresponding
     * instance of {@code instanceList}, which is then bucketed by its {@code clusterNoMeta}
     * property in first-seen order.
     *
     * <p>The file reader is closed after loading, and every failure is rethrown with its
     * original exception attached as the cause, so a record-count mismatch can be told apart
     * from a genuine I/O error.
     *
     * @param instanceList instances read from {@code file}; modified in place (name and property list)
     * @param file         the input file the instances were read from
     * @return a list with one instance per cluster, named {@code "Cluster <id>"}, whose data is
     *         the InstanceList of that cluster's members
     * @throws IllegalArgumentException if the file cannot be read, the record counts differ, or
     *                                  any other failure occurs while building the clusters
     */
    private static InstanceList getInstanceListClusters(InstanceList instanceList, File file) {
        InstanceList fieldInstances = new InstanceList(new SerialPipes(new Pipe[]{new Input2CharSequence(), new SGML2FieldsPipe(refNoMeta, clusterNoMeta, startTags, endTags, tagWeight)}));
        try {
            FileReader reader = new FileReader(file);
            try {
                fieldInstances.add(new LineGroupIterator2(reader, Pattern.compile(separator), true));
            } finally {
                // add() has consumed the iterator, so the handle is no longer needed.
                reader.close();
            }
            if (instanceList.size() != fieldInstances.size()) {
                throw new UnsupportedOperationException("size not equal");
            }
            int size = instanceList.size();
            // Copy the parsed metadata onto the caller's instances, position by position.
            for (int i = 0; i < size; i++) {
                Instance raw = instanceList.getInstance(i);
                Instance parsed = fieldInstances.getInstance(i);
                raw.setPropertyList(parsed.getPropertyList());
                raw.setName(parsed.getName());
                instanceList.setInstance(i, raw);
            }
            // LinkedHashMap keeps clusters in the order their first member appears.
            LinkedHashMap clusters = new LinkedHashMap();
            for (int i2 = 0; i2 < size; i2++) {
                Instance member = instanceList.getInstance(i2);
                Object clusterId = member.getProperty(clusterNoMeta);
                InstanceList members = (InstanceList) clusters.get(clusterId);
                if (members == null) {
                    members = new InstanceList();
                    clusters.put(clusterId, members);
                }
                members.add(member);
            }
            InstanceList result = new InstanceList(null);
            for (Map.Entry entry : (java.util.Set<Map.Entry>) clusters.entrySet()) {
                result.add(new Instance((InstanceList) entry.getValue(), null, new StringBuffer().append("Cluster ").append((String) entry.getKey()).toString(), null));
            }
            return result;
        } catch (Exception e) {
            // Same exception type and message as before; the cause is what makes it diagnosable.
            throw new IllegalArgumentException(new StringBuffer().append("Can't read file ").append(file).toString(), e);
        }
    }

    /**
     * Resolves a class by its fully qualified name (compiler-generated helper backing class
     * literals in pre-Java-5 bytecode).
     *
     * @param str fully qualified class name
     * @return the class object returned by {@link Class#forName(String)}
     * @throws NoClassDefFoundError if the class cannot be found; the
     *         {@link ClassNotFoundException} is attached as the cause
     */
    static Class class$(String str) {
        try {
            return Class.forName(str);
        } catch (ClassNotFoundException e) {
            // initCause() returns Throwable, which cannot be thrown from this method directly;
            // attach the cause first and throw the error itself.
            NoClassDefFoundError error = new NoClassDefFoundError();
            error.initCause(e);
            throw error;
        }
    }

    // Static initialisation: logger, command-line options and tagging constants.
    // The options are created in the same order as before.
    static {
        // Resolve the owning class once (through the shared class$ cache) instead of repeating
        // the null-check/lookup sequence in front of every option.
        Class owner = class$edu$umass$cs$mallet$projects$seg_plus_coref$ie$TUI_CorefIE;
        if (owner == null) {
            owner = class$("edu.umass.cs.mallet.projects.seg_plus_coref.ie.TUI_CorefIE");
            class$edu$umass$cs$mallet$projects$seg_plus_coref$ie$TUI_CorefIE = owner;
        }

        logger = MalletLogger.getLogger(owner.getName());

        // --- input files -----------------------------------------------------------------
        crfInputFileOption = new CommandOption.File(owner, "crf-input-file", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null);
        inputFileOption = new CommandOption.File(owner, "input-file", "FILENAME", true, null, "The name of the file containing the testing data.", null);

        // --- task selection --------------------------------------------------------------
        headOrRefOption = new CommandOption.Integer(owner, "head-or-ref", "INTEGER", true, 0, "0 for header, 1 for reference", null);
        nBestChoice = new CommandOption.Integer(owner, "nbestchoice", "INTEGER", true, 1, "N for N-best", null);

        // --- feature switches ------------------------------------------------------------
        // Argument label fixed from "INTEGER" to "boolean": this is a boolean option like the
        // ones below.
        includeBibtexLexicons = new CommandOption.Boolean(owner, "include-bibtex-lexicons", "boolean", true, false, "Whether to use BibTeX lexicons from Fuchun.", null);
        excludingSingletons = new CommandOption.Boolean(owner, "exclude-singletons", "boolean", true, true, "excluding singletons.", null);
        // Help text fixed: it used to be a copy of the "exclude-singletons" description.
        useClusterFeatures = new CommandOption.Boolean(owner, "use-cluster-features", "boolean", true, true, "Whether to use cluster features.", null);
        useNegativeClusterFeatures = new CommandOption.Boolean(owner, "use-negative-cluster-features", "boolean", true, false, "Whether to use features that say words AREN'T tagged in cluster.", null);
        useNumClusterOccurences = new CommandOption.Boolean(owner, "use-cluster-occurrences", "boolean", true, false, "Whether to use number of tagged word occurrences in cluster as features.", null);
        useBogusClusterFeatures = new CommandOption.Boolean(owner, "use-bogus-cluster-features", "boolean", true, false, "If true, use features from the instance's true segmentation.", null);
        useSparseWeights = new CommandOption.Boolean(owner, "use-sparse-weights", "boolean", true, false, "If true, use only input features that appear in training set.", null);

        // --- numeric tuning parameters ---------------------------------------------------
        clusterFeatureMinimum = new CommandOption.Integer(owner, "cluster-feature-minimum", "INTEGER", true, 2, "Minimum number of coreferent citations that need to agree to create a cluster feature.", null);
        clusterSizeLimit = new CommandOption.Integer(owner, "cluster-size-limit", "INTEGER", true, 10000, "cluster Size Limit", null);
        methodChoice = new CommandOption.Integer(owner, "methodchoice", "INTEGER", true, 1, "method for canonical citation creation", null);
        markovOrder = new CommandOption.Integer(owner, "markov-order", "INTEGER", true, 0, "0 = states for all transitions, 1 = half labels, 2 = three-quarter labels", null);
        numRepsOption = new CommandOption.Integer(owner, "num-reps", "INTEGER", true, 5, "Number of random test-training splits to try.", null);

        // --- meta-information markers and field tags ---------------------------------------
        refNoMeta = "reference_no=";
        clusterNoMeta = "cluster_no=";
        // FIELD_NAMES, startTags, endTags and tagWeight are parallel arrays with 13 entries each.
        FIELD_NAMES = new String[]{"author", "title", "date", "publisher", BibEntry.FIELD_LOCATION, "pages", "institution", "editor", "volume", "note", "booktitle", Citation.tech, "journal"};
        startTags = new String[]{"<author>", "<title>", "<date>", "<publisher>", "<location>", "<pages>", "<institution>", "<editor>", "<volume>", "<note>", "<booktitle>", "<tech>", "<journal>"};
        endTags = new String[]{"</author>", "</title>", "</date>", "</publisher>", "</location>", "</pages>", "</institution>", "</editor>", "</volume>", "</note>", "</booktitle>", "</tech>", "</journal>"};
        tagWeight = new double[]{1.0d, 1.0d, 0.5d, 1.0d, 1.0d, 1.0d, 1.0d, 1.0d, 1.0d, 1.0d, 1.0d, 1.0d, 1.0d};
        NumFields = 10;

        // --- regular-expression character classes ----------------------------------------
        // NOTE(review): the non-ASCII characters in the three classes below show up as U+FFFD
        // replacement characters, which suggests accented letters were lost to an encoding
        // problem. They are kept unchanged here - restore them from the original source.
        CAPS = "[A-Z������]";
        ALPHA = "[A-Z������a-z�������]";
        ALPHANUM = "[A-Z������a-z�������0-9]";
        PUNT = "[,\\.;:?!()]";

        // Hard-coded absolute directory for the BibTeX lexicons.
        bibtexLexDir = "/usr/col/tmp1/casutton/resources/fuchun/";
    }
}
