package uk.ac.cam.ch.wwmm.acpgeo;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import uk.ac.cam.ch.wwmm.chemicaltagger.ChemistryPOSTagger;
import uk.ac.cam.ch.wwmm.chemicaltagger.OpenNLPTagger;
import uk.ac.cam.ch.wwmm.chemicaltagger.OscarTagger;
import uk.ac.cam.ch.wwmm.chemicaltagger.POSContainer;
import uk.ac.cam.ch.wwmm.oscar.Oscar;
import uk.ac.cam.ch.wwmm.oscar.document.Token;

/* loaded from: input_file:uk/ac/cam/ch/wwmm/acpgeo/ACPTagger.class */
/**
 * Singleton that wires an {@link ACPRegexTagger}, an {@link OscarTagger} and the
 * {@link OpenNLPTagger} into one {@link ChemistryPOSTagger}, and post-processes the
 * resulting tags so that tokens found in the palaeo-times glossary are marked with a
 * {@code -TIMEPERIOD} suffix.
 *
 * <p>Obtain the shared instance through {@link #getInstance()}.
 */
public class ACPTagger {
    private static final Logger LOG = Logger.getLogger(ACPTagger.class.getName());

    /** Capture group holding the character(s) that precede a citation. */
    private static final int CONTEXT_GROUP = 1;

    /** Capture group holding the citation itself (author part plus year part). */
    private static final int CITATION_GROUP = 5;

    /**
     * Matches an author/year citation together with the text immediately before it.
     * The literal is split at group boundaries purely for readability; the concatenation
     * is a compile-time constant identical to the single-string form.
     */
    private static final Pattern PRESERVE_CITATION_PATTERN = Pattern.compile(
            // Group 1: leading context - ".x. ", any non-dot char plus whitespace,
            // or a single char that is neither a space nor an ASCII letter.
            "(([.][a-z][.]\\s+)|([^.]\\s+)|([^ A-Za-z]))"
            // Group 5 opens: optional two-letter capitalised word, then a capitalised
            // name (optionally hyphenated / with an inner capital) followed by whitespace.
            + "((\\p{Lu}\\p{M}*(\\p{Ll}\\p{M}*)\\s+)?(\\p{Lu}\\p{M}*(\\p{Ll}\\p{M}*)+[-]?(\\p{Lu}\\p{M}*)?(\\p{Ll}\\p{M}*)*\\s+)"
            // Optional "et al." or "and", optionally followed by a second name.
            + "(((et\\s+al[.])|(and))\\s*((\\p{Lu}\\p{M}*(\\p{Ll}\\p{M}*)\\s+)?\\p{Lu}\\p{M}*(\\p{Ll}\\p{M}*)+[-]?(\\p{Lu}\\p{M}*)?(\\p{Ll}\\p{M}*)*\\s*){0,1})?"
            // Year list, either "(2001a, 2002)" or ", 2001a, 2002"; group 5 closes.
            + "(([(]\\d{4,4}[a-z]?(([,;]|(\\s*and))\\s*(\\d{4,4})?[a-z]?)*[)])|(,\\s*\\d{4,4}[a-z]?(([,;]|\\s*(and))\\s*(\\d{4,4})[a-z]?)*)))",
            Pattern.CANON_EQ);

    /** One or more whitespace characters, as understood by {@code \s}. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /** The no-break space character, U+00A0. */
    private static final char NO_BREAK_SPACE = '\u00A0';

    private static final String PALAEO_DICTIONARY = "dictionaries/palaeoTimesGlossary.txt";

    /** Combined tagger; left public and non-final because external code may access it. */
    public ChemistryPOSTagger posTagger;

    /** Glossary loaded from {@link #PALAEO_DICTIONARY}; only its keys are consulted here. */
    private final HashMap<String, String> palaeoGlossaryMap;

    private final ACPRegexTagger acpRegexTagger;

    /** Initialization-on-demand holder: the instance is built when the holder is first used. */
    private static class TaggerHolder {
        private static final ACPTagger INSTANCE = new ACPTagger();

        private TaggerHolder() {
        }
    }

    /**
     * Loads the palaeo glossary and builds the combined tagger. The taggers are added in
     * the order regex, OSCAR, OpenNLP.
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private ACPTagger() {
        this.acpRegexTagger = new ACPRegexTagger();
        this.palaeoGlossaryMap = new DictionaryLoader().loadDictionary(PALAEO_DICTIONARY, false);
        // NOTE(review): the element type ChemistryPOSTagger expects is not visible from this
        // file, so the list stays raw - parameterise it once the type is confirmed.
        ArrayList taggers = new ArrayList();
        taggers.add(this.acpRegexTagger);
        taggers.add(new OscarTagger(new Oscar()));
        taggers.add(OpenNLPTagger.getInstance());
        this.posTagger = new ChemistryPOSTagger(new ACPTokeniser(), taggers);
    }

    /**
     * Returns the shared tagger instance.
     *
     * @return the singleton {@code ACPTagger}
     */
    public static ACPTagger getInstance() {
        return TaggerHolder.INSTANCE;
    }

    /**
     * Normalises whitespace inside author/year citations: every run of whitespace within a
     * matched citation is replaced by a single space. Text outside citations is returned
     * unchanged.
     *
     * <p>The pattern also consumes the character(s) directly before the citation (group 1).
     * Those are written back verbatim; replacing the whole match with the citation alone
     * would silently delete them (for example turning "by Smith (2005)" into
     * "bSmith (2005)").
     *
     * @param charSequence the text to scan; must not be {@code null}
     * @return the text with citation-internal whitespace collapsed
     */
    public String WordPhrases(CharSequence charSequence) {
        Matcher matcher = PRESERVE_CITATION_PATTERN.matcher(charSequence);
        // StringBuffer (not StringBuilder) keeps this compatible with pre-Java-9 Matcher APIs.
        StringBuffer rewritten = new StringBuffer(charSequence.length());
        while (matcher.find()) {
            String context = matcher.group(CONTEXT_GROUP);
            String citation = matcher.group(CITATION_GROUP);
            if (LOG.isLoggable(Level.FINE)) {
                LOG.fine("found CITATION PHRASE" + matcher.group());
                LOG.fine("found CITATION PHRASE GROUP 5" + citation);
            }
            String normalised = context + WHITESPACE_RUN.matcher(citation).replaceAll(" ");
            // quoteReplacement: the citation text may contain '$' or '\'.
            matcher.appendReplacement(rewritten, Matcher.quoteReplacement(normalised));
        }
        matcher.appendTail(rewritten);
        return rewritten.toString();
    }

    /**
     * Runs the combined tagger over {@code str} and then appends {@code -TIMEPERIOD} to the
     * tag of every token whose glossary key is present in the palaeo glossary, unless the
     * (reduced) tag contains {@code VB}.
     *
     * <p>The token list and the combined tag list are walked in parallel by index.
     *
     * @param str the text to tag
     * @return the container produced by the underlying tagger, with its combined tag list
     *         updated in place
     */
    public POSContainer runTaggers(String str) {
        POSContainer container = this.posTagger.runTaggers(str, false);
        int index = 0;
        for (Token token : container.getWordTokenList()) {
            String key = glossaryKey(token.getSurface());
            if (this.palaeoGlossaryMap.containsKey(key)) {
                String tag = timePeriodBaseTag((String) container.getCombinedTagsList().get(index));
                if (!tag.contains("VB")) {
                    container.getCombinedTagsList().set(index, tag + "-TIMEPERIOD");
                }
            }
            index++;
        }
        return container;
    }

    /**
     * Replaces every no-break space (U+00A0) in {@code str} with an ordinary space.
     *
     * <p>NOTE(review): the previous body replaced a plain space with a plain space, which
     * cannot change the text; the method name indicates no-break spaces were meant.
     * Ordinary spaces are left exactly as they were.
     *
     * @param str the text to clean; must not be {@code null}
     * @return {@code str} with no-break spaces converted to spaces
     */
    public String removeNBS(String str) {
        String cleaned = str.replace(NO_BREAK_SPACE, ' ');
        if (LOG.isLoggable(Level.FINE) && !cleaned.equals(str)) {
            LOG.fine("found NBS" + cleaned);
        }
        return cleaned;
    }

    /**
     * Derives the glossary lookup key for a token surface: the second "-"-separated
     * segment if the surface has an inner hyphen, otherwise the second "/"-separated
     * segment if it has an inner slash, otherwise the surface itself.
     */
    private static String glossaryKey(String surface) {
        String segment = secondSegment(surface, "-");
        if (segment == null) {
            segment = secondSegment(surface, "/");
        }
        return segment != null ? segment : surface;
    }

    /**
     * Returns the second {@code separator}-delimited segment of {@code surface}, or
     * {@code null} when the separator is absent or sits at either end of the surface.
     */
    private static String secondSegment(String surface, String separator) {
        if (!surface.contains(separator)
                || surface.startsWith(separator)
                || surface.endsWith(separator)) {
            return null;
        }
        // Safe: the separator is strictly inside the string, so there are at least two parts.
        return surface.split(separator)[1];
    }

    /**
     * Reduces a combined tag to the base used for time-period tagging: keeps only the part
     * before the first inner hyphen, then maps any tag containing JJ, NN, DT or OSCAR to NNP.
     */
    private static String timePeriodBaseTag(String tag) {
        int hyphen = tag.indexOf('-');
        if (hyphen > 0) {
            tag = tag.substring(0, hyphen);
        }
        // "NN" also covers NNS, NNP and NNPS.
        if (tag.contains("JJ") || tag.contains("NN") || tag.contains("DT") || tag.contains("OSCAR")) {
            return "NNP";
        }
        return tag;
    }
}
