package edu.umass.cs.mallet.projects.seg_plus_coref.coreference;

import com.wcohen.secondstring.tokens.SimpleTokenizer;
import com.wcohen.secondstring.tokens.Token;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.codehaus.groovy.tools.shell.util.ANSI;
import org.hibernate.criterion.CriteriaSpecification;
import pl.edu.icm.yadda.analysis.bibref.BibEntry;
import pl.edu.icm.yadda.imports.zentralblatt.ZentralBlattSeFieldParser;

/* loaded from: input_file:WEB-INF/lib/mallet-0.1.3.jar:edu/umass/cs/mallet/projects/seg_plus_coref/coreference/CitationNormalizer.class */
/**
 * Normalizes citation strings for coreference comparison: tokenizes text,
 * drops stop words, expands a fixed set of abbreviations, and offers a few
 * token-filtering helpers (numeric-only, alphabetic-only, four-digit token).
 *
 * <p>NOTE(review): several string constants below are referenced through
 * unrelated third-party classes ({@code CriteriaSpecification.ROOT_ALIAS},
 * {@code ANSI.Renderer.CODE_TEXT_SEPARATOR}, {@code BibEntry.TYPE_PROCEEDINGS},
 * {@code ZentralBlattSeFieldParser.ANY_NUMBER_PATTERN}). This looks like a
 * decompiler artifact (inlined literals re-attributed to a class holding an
 * equal constant); their values are not visible in this file, so they are
 * kept as-is. Confirm the values and consider replacing them with literals.
 */
public class CitationNormalizer {
    public static final String[] STOP_WORDS = {"a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", CriteriaSpecification.ROOT_ALIAS, "to", "was", "will", "with"};

    /** Stop-word lookup, built once from {@link #STOP_WORDS} at class initialization. */
    private static final Set<String> STOP_SET = buildStopSet(STOP_WORDS);

    /** Shared tokenizer; previously re-created by every instance constructor. */
    private static final SimpleTokenizer TOKENIZER = new SimpleTokenizer(true, true);

    /** Abbreviation -> expansion table, built once instead of on every call. */
    private static final Map<String, String> SUBSTITUTIONS = buildSubstitutionMap();

    /** Separator placed between tokens when they are joined back into one string. */
    private static final String TOKEN_SEPARATOR = ANSI.Renderer.CODE_TEXT_SEPARATOR;

    /** Copies the given words into a set for constant-time membership tests. */
    private static Set<String> buildStopSet(String[] strArr) {
        Set<String> words = new HashSet<String>(strArr.length * 2);
        for (int i = 0; i < strArr.length; i++) {
            words.add(strArr[i]);
        }
        return words;
    }

    /**
     * Creates a normalizer. All lookup tables and the tokenizer are static and
     * initialized once with the class, so construction does no work.
     */
    public CitationNormalizer() {
    }

    /**
     * Tokenizes {@code str} with the shared tokenizer.
     *
     * @param str text to tokenize
     * @return a new list holding the value of each token, in tokenizer order
     */
    public List getTokens(String str) {
        return makeList(TOKENIZER.tokenize(str));
    }

    /**
     * Tokenizes {@code str} and returns the distinct token values.
     *
     * @param str text to tokenize
     * @return a new set of the token values
     */
    public Set getTokensAsSet(String str) {
        return new HashSet(getTokens(str));
    }

    /**
     * Tokenizes {@code str}, removes stop words, expands known abbreviations,
     * and joins the remaining tokens with the token separator.
     *
     * @param str text to normalize
     * @return the joined, normalized tokens
     */
    public String norm1(String str) {
        List tokens = makeList(TOKENIZER.tokenize(str));
        return tokensToString(makeSubstitutions(removeStopWords(tokens)));
    }

    /**
     * Normalizes an author string: removes the standalone word "and" (when
     * preceded by a space), replaces punctuation with the token separator,
     * deletes tabs/newlines/form feeds/carriage returns/hyphens, and collapses
     * runs of spaces into a single separator.
     *
     * @param str author text
     * @return the normalized author text
     */
    public String authorNorm(String str) {
        // "\\b" keeps the match from eating the start of words such as
        // "anderson" or "andrew", which the unanchored " and" pattern did.
        return str.replaceAll(" and\\b", "")
                .replaceAll("\\p{Punct}", TOKEN_SEPARATOR)
                .replaceAll("[\t\n\f\r-]", "")
                .replaceAll(" +", TOKEN_SEPARATOR);
    }

    /**
     * Applies {@link #norm1(String)} followed by {@link #norm2(String)}.
     *
     * @param str text to normalize
     * @return the fully normalized text
     */
    public String norm(String str) {
        return norm2(norm1(str));
    }

    /**
     * Removes every space, tab, newline, form feed, carriage return and hyphen
     * from {@code str}.
     *
     * @param str text to strip
     * @return {@code str} without whitespace or hyphens
     */
    public String norm2(String str) {
        // Strings are immutable: the result of replaceAll must be returned.
        // The previous code discarded it and returned the input unchanged.
        return str.replaceAll("[ \t\n\f\r-]", "");
    }

    /**
     * Finds the first token of {@code str} that consists of exactly four
     * decimal digits.
     *
     * @param str text to scan
     * @return the first four-digit token, or the empty string if there is none
     */
    public String getFourDigitString(String str) {
        List tokens = makeList(TOKENIZER.tokenize(str));
        for (int i = 0; i < tokens.size(); i++) {
            String token = (String) tokens.get(i);
            if (token.matches("[0-9][0-9][0-9][0-9]")) {
                return token;
            }
        }
        return "";
    }

    /** Collects the value of each token into a new mutable list. */
    private List makeList(Token[] tokenArr) {
        ArrayList values = new ArrayList(tokenArr.length);
        for (Token token : tokenArr) {
            values.add(token.getValue());
        }
        return values;
    }

    /**
     * Expands abbreviations in {@code list} using the built-in table.
     *
     * @param list tokens to rewrite; modified in place
     * @return the same {@code list} instance
     */
    public List makeSubstitutions(List list) {
        return makeSubstitutions(list, SUBSTITUTIONS);
    }

    /**
     * Replaces every element of {@code list} that is a key of {@code map} with
     * the mapped value. A replacement value is stored as a single element even
     * when it contains several words.
     *
     * @param list tokens to rewrite; modified in place
     * @param map  token -> replacement
     * @return the same {@code list} instance
     */
    public List makeSubstitutions(List list, Map map) {
        for (int i = 0; i < list.size(); i++) {
            Object token = list.get(i);
            // Direct lookup instead of scanning every key for every token.
            if (token != null && map.containsKey(token)) {
                list.set(i, map.get(token));
            }
        }
        return list;
    }

    /** Joins the elements of {@code list} with the token separator between them. */
    private String tokensToString(List list) {
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < list.size(); i++) {
            // Decide by position, not by comparing the accumulated string to ""
            // with ==, which tests reference identity rather than content.
            if (i > 0) {
                joined.append(TOKEN_SEPARATOR);
            }
            joined.append((String) list.get(i));
        }
        return joined.toString();
    }

    /**
     * Returns the elements of {@code list} that are not in {@link #STOP_WORDS}.
     *
     * @param list tokens to filter; not modified
     * @return a new list with stop words removed, original order preserved
     */
    public List removeStopWords(List list) {
        ArrayList kept = new ArrayList();
        for (int i = 0; i < list.size(); i++) {
            String token = (String) list.get(i);
            if (!STOP_SET.contains(token)) {
                kept.add(token);
            }
        }
        return kept;
    }

    /** Builds the abbreviation -> expansion table used by {@link #makeSubstitutions(List)}. */
    private static Map<String, String> buildSubstitutionMap() {
        Map<String, String> table = new HashMap<String, String>();
        table.put("proc", BibEntry.TYPE_PROCEEDINGS);
        table.put("conf", "conference");
        table.put("intl", "international");
        table.put("int", "international");
        table.put("trans", "transactions");
        table.put("assoc", "associates");
        table.put("jair", "journal artificial intelligence research");
        table.put("nips", "advances neural information processing systems");
        table.put("nrl", "naval research laboratory");
        table.put("colt", "computational learning theory");
        table.put("sigir", "international conference research development information retrieval");
        table.put("cacm", "communications association computing machinery");
        table.put("cmu", "carnegie mellon university");
        table.put("cs", "computer science");
        table.put("ijcai", "international joint conference artificial intelligence");
        table.put("ai", "artificial intelligence");
        table.put("mit", "massachusetts institute technology");
        table.put("icml", "international conference machine learning");
        table.put("ieee", "institute electrical electronics engineers");
        table.put("aaai", "national conference american association artificial intelligence");
        table.put("mlc", "international machine learning conference");
        table.put("ml", "international machine learning");
        table.put("acm", "association computing machinery");
        return table;
    }

    /** Tokenizes {@code str} and joins the tokens that fully match {@code regex}. */
    private String joinTokensMatching(String str, String regex) {
        ArrayList matching = new ArrayList();
        List tokens = makeList(TOKENIZER.tokenize(str));
        for (int i = 0; i < tokens.size(); i++) {
            String token = (String) tokens.get(i);
            if (token.matches(regex)) {
                matching.add(token);
            }
        }
        return tokensToString(matching);
    }

    /**
     * Keeps only the tokens of {@code str} that match
     * {@code ZentralBlattSeFieldParser.ANY_NUMBER_PATTERN} (presumably a
     * digits-only pattern — confirm) and joins them with the token separator.
     *
     * @param str text to filter
     * @return the joined matching tokens, or the empty string if none match
     */
    public String getNumericOnly(String str) {
        return joinTokensMatching(str, ZentralBlattSeFieldParser.ANY_NUMBER_PATTERN);
    }

    /**
     * Keeps only the tokens of {@code str} made up entirely of ASCII letters
     * and joins them with the token separator.
     *
     * @param str text to filter
     * @return the joined alphabetic tokens, or the empty string if none match
     */
    public String getAlphaOnly(String str) {
        return joinTokensMatching(str, "[a-zA-Z]+");
    }
}
