package uk.ac.cam.ch.wwmm.acpgeo;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import uk.ac.cam.ch.wwmm.chemicaltagger.ChemicalTaggerTokeniser;
import uk.ac.cam.ch.wwmm.chemicaltagger.Utils;
import uk.ac.cam.ch.wwmm.chemicaltagger.WhiteSpaceTokeniser;
import uk.ac.cam.ch.wwmm.oscar.document.Token;

/* loaded from: input_file:uk/ac/cam/ch/wwmm/acpgeo/ACPTokeniser.class */
/**
 * Tokeniser that starts from whitespace-delimited tokens and then repeatedly
 * splits each token according to a fixed, ordered list of rules (brackets,
 * punctuation characters, slashes, equals signs, trailing full stops and
 * commas, hyphenated directions and colons).
 *
 * <p>Each rule performs a single split; the resulting pieces are re-examined
 * until no rule applies any more.
 */
public class ACPTokeniser implements ChemicalTaggerTokeniser {
    /** Characters that are always split off into their own token. */
    private static final Pattern SPLIT_CHARACTER_PATTERN = Pattern.compile("[;<>‱‰%?]");

    /** Tokens containing a bracketed ratio; these are not split at the slash. */
    private static final Pattern PRESERVE_RATIO_WITHIN_BRACKETS_PATTERN = Pattern.compile("[^/]+[(][^/]+[/]\\S+[)]|[(][^/]+[/]\\S+[)][^/]+");

    /** Tokens in which '=' sits between element symbols; these are not split at the equals sign. */
    private static final Pattern PRESERVE_HYDROCARBON_PATTERN = Pattern.compile("[^=]*[CNHOP]+[0-9]*=[CNOP].*");

    /**
     * Known abbreviations whose trailing full stop must stay attached.
     * Entries are stored in lower case because lookups lower-case the token first.
     */
    private static final Set<String> ABV_LIST = new HashSet<String>(Arrays.asList("et.", "al.", "etc.", "e.g.", "i.e.", "vol.", "ca.", "wt.", "aq.", "mt.", "st.", "e.g.:", "eq.", "equiv.", "mp.", "m.p.", "b.p.", "(bp)", "conc.", "approx.", "anh.", "sat.", "lit.", "dil.", "sol.", "liq.", "cal.", "prof."));

    /** Generic shape of an abbreviation: capitals, at most two lower-case letters, then a full stop. */
    private static final Pattern ABBREVIATION_PATTERN = Pattern.compile("-?[A-Z]+[a-z]{0,2}\\.");

    /** A capital letter hyphenated to a number of degrees, e.g. {@code N-30°}. */
    private static final Pattern CONCAT_HYPHENED_DIRECTION_PATTERN = Pattern.compile("^[A-Z]\\-\\d+[°º]");

    /**
     * One or two clock times (optionally hyphen-joined) with an optional am/pm
     * suffix. The hour part accepts every hour from 0 to 23, including 0, 10 and 20.
     */
    private static final Pattern TIME_EXPRESSION = Pattern.compile("^(([01]?\\d|2[0-3]):[0-5]\\d\\-?){1,2}([ap]m)?$", Pattern.CASE_INSENSITIVE);

    /** Fully bracketed tokens that must not have their brackets stripped. */
    private static final Pattern PRESERVE_CERTAIN_BRACKETS = Pattern.compile("[(]BP[)]", Pattern.CASE_INSENSITIVE);

    /** Short list-style identifiers: one letter, one or two digits (optionally plus a letter), or roman numerals. */
    private static final Pattern IDENTIFIERS = Pattern.compile("^([A-Za-z]|[0-9]{1,2}|[0-9]{1,2}[A-Za-z]|[ivx]+|[IVX]+)$");

    /**
     * Tokenises the given text.
     *
     * <p>The text is first split by {@link WhiteSpaceTokeniser}; every token is
     * then split further until {@code subTokenise} reports that no rule applies.
     * Replacement tokens inherit the document, bio type and named-entity element
     * of the token they came from and receive contiguous offsets starting at the
     * original token's start. Finally every token is given its list position as
     * index.
     *
     * @param str the text to tokenise
     * @return the resulting tokens, in order
     */
    public List<Token> tokenise(String str) {
        List<Token> tokens = new WhiteSpaceTokeniser().tokenise(str);
        int position = 0;
        while (position < tokens.size()) {
            Token current = tokens.get(position);
            String previousSurface = position > 0 ? tokens.get(position - 1).getSurface() : null;
            String[] pieces = subTokenise(current.getSurface(), previousSurface);
            if (pieces == null) {
                // Nothing left to split here; move on to the next token.
                position++;
                continue;
            }
            List<Token> replacements = new ArrayList<Token>(pieces.length);
            int offset = current.getStart();
            for (String piece : pieces) {
                int end = offset + piece.length();
                replacements.add(new Token(piece, offset, end, current.getDoc(), current.getBioType(), current.getNeElem()));
                offset = end;
            }
            // Swap the token for its pieces and stay at the same position so that
            // the first piece is examined again on the next pass.
            tokens.remove(position);
            tokens.addAll(position, replacements);
        }
        int index = 0;
        for (Token token : tokens) {
            token.setIndex(index++);
        }
        return tokens;
    }

    /**
     * Applies the first matching split rule to a token.
     *
     * @param surface  the token text to examine
     * @param previous the text of the preceding token, or {@code null} if there is none
     * @return the pieces produced by one split, or {@code null} if no rule applies
     */
    private String[] subTokenise(String surface, String previous) {
        int length = surface.length();
        if (length <= 1) {
            return null;
        }
        boolean opensWithBracket = surface.startsWith("(");
        boolean closesWithBracket = surface.endsWith(")");

        if (opensWithBracket) {
            // Leading '(' whose nesting never closes inside this token: detach it.
            if (indexOfBalancedRoundBracket(surface) < 0) {
                return splitAtRegion(surface, 1, 1);
            }
        } else if (closesWithBracket && surface.substring(0, length - 1).indexOf('(') < 0) {
            // Trailing ')' with no '(' anywhere before it: detach it.
            return splitAtRegion(surface, length - 1, length - 1);
        }
        if (opensWithBracket && closesWithBracket && !PRESERVE_CERTAIN_BRACKETS.matcher(surface).matches()) {
            return splitAtRegion(surface, 1, length - 1);
        }

        Matcher splitCharacter = SPLIT_CHARACTER_PATTERN.matcher(surface);
        if (splitCharacter.find()) {
            return splitAtRegion(surface, splitCharacter.start(), splitCharacter.end());
        }

        int slash = surface.indexOf("/");
        if (slash != -1 && !PRESERVE_RATIO_WITHIN_BRACKETS_PATTERN.matcher(surface).find()) {
            return splitAtRegion(surface, slash, slash + 1);
        }

        int equalsSign = surface.indexOf("=");
        if (equalsSign != -1 && !PRESERVE_HYDROCARBON_PATTERN.matcher(surface).matches()) {
            return splitAtRegion(surface, equalsSign, equalsSign + 1);
        }

        if (surface.endsWith(".")) {
            if (notAnAbbreviation(surface) && !IDENTIFIERS.matcher(surface).matches()) {
                return splitAtRegion(surface, length - 1, length - 1);
            }
            if (surface.contains("°") || surface.contains("º")) {
                return splitAtRegion(surface, length - 1, length - 1);
            }
        }

        if (surface.equals("K.") && previous != null) {
            String digits = previous.replace(".", "");
            // Commons Lang 2.x reports the empty string as numeric, so a preceding
            // "." token must be excluded explicitly.
            if (digits.length() > 0 && StringUtils.isNumeric(digits)) {
                return splitAtRegion(surface, 1, 1);
            }
        }

        if (surface.endsWith(",")) {
            return splitAtRegion(surface, length - 1, length - 1);
        }

        if (CONCAT_HYPHENED_DIRECTION_PATTERN.matcher(surface).find()) {
            int hyphen = surface.indexOf("-");
            return splitAtRegion(surface, hyphen, hyphen + 1);
        }

        int colon = surface.indexOf(":");
        if (colon >= 0 && !TIME_EXPRESSION.matcher(surface).find()) {
            return splitAtRegion(surface, colon, colon + 1);
        }
        return null;
    }

    /**
     * Decides whether a token ending in a full stop should be treated as an
     * ordinary word rather than an abbreviation.
     *
     * @param surface the token text
     * @return {@code true} if the token is not in the abbreviation list and either
     *         contains a number or does not contain the generic abbreviation shape
     */
    private boolean notAnAbbreviation(String surface) {
        // Locale.ROOT keeps the lookup independent of the platform's default locale.
        if (ABV_LIST.contains(surface.toLowerCase(Locale.ROOT))) {
            return false;
        }
        return Utils.containsNumber(surface) || !ABBREVIATION_PATTERN.matcher(surface).find();
    }

    /**
     * Cuts a string into the text before a region, the region itself and the text
     * after it, leaving out any piece that is empty.
     *
     * @param text  the string to cut
     * @param begin start of the region (inclusive)
     * @param end   end of the region (exclusive)
     * @return the non-empty pieces in their original order
     */
    private static String[] splitAtRegion(String text, int begin, int end) {
        String[] candidates = {text.substring(0, begin), text.substring(begin, end), text.substring(end)};
        List<String> pieces = new ArrayList<String>(candidates.length);
        for (String candidate : candidates) {
            if (candidate.length() > 0) {
                pieces.add(candidate);
            }
        }
        return pieces.toArray(new String[pieces.size()]);
    }

    /**
     * Scans a string while counting '(' as +1 and ')' as -1.
     *
     * @param text the string to scan
     * @return the index of the first ')' at which the running count reaches zero,
     *         or -1 if that never happens
     */
    private static int indexOfBalancedRoundBracket(String text) {
        int depth = 0;
        for (int pos = 0; pos < text.length(); pos++) {
            char c = text.charAt(pos);
            if (c == '(') {
                depth++;
            } else if (c == ')' && --depth == 0) {
                return pos;
            }
        }
        return -1;
    }
}
