package uk.ac.open.crc.intt;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* JADX INFO: Access modifiers changed from: package-private */
/* loaded from: input_file:uk/ac/open/crc/intt/BasicTokeniser.class */
/**
 * Splits a string into tokens using a configurable set of separator
 * characters, lower-case to upper-case transitions, a numeric tokeniser and
 * dictionary lookups for upper-case to lower-case transitions.
 *
 * <p>All scanning loops step through the string by UTF-16 index and read the
 * code point at that index; the separator set is populated the same way, so
 * lookups stay consistent with how the set was built.
 */
public class BasicTokeniser {
    private static final Logger LOGGER = LoggerFactory.getLogger(BasicTokeniser.class);

    /**
     * Matches (as a whole-string match) any token containing at least one
     * ASCII digit. Compiled once instead of on every loop iteration.
     */
    private static final Pattern CONTAINS_DIGIT = Pattern.compile("^.*[0-9]+.*$");

    private final AggregatedDictionary aggregatedDictionary;
    private final DigitAbbreviationDictionary digitAbbreviationDictionary;
    private final AbbreviationDictionary abbreviationDictionary;
    private final NumericTokeniser numericTokeniser;

    /** Not referenced anywhere in this class. */
    private final ArrayList<String> words = new ArrayList<>();

    /** Not referenced anywhere in this class. */
    @Deprecated
    private boolean needsOracle = false;

    /** Only written to by the deprecated {@link #findBoundaries(String)}. */
    private final ArrayList<Integer> boundaries = new ArrayList<>();

    /** Code points that are treated as token separators. */
    private final Set<Integer> separatorCharactersSet = new HashSet<>();

    /**
     * Creates a tokeniser backed by the dictionaries of the given set.
     *
     * @param dictionarySet source of the aggregated, digit-abbreviation and
     *                      abbreviation dictionaries; also handed to the
     *                      {@code NumericTokeniser}
     * @param str           string whose characters are the separator characters
     */
    public BasicTokeniser(DictionarySet dictionarySet, String str) {
        this.aggregatedDictionary = dictionarySet.getAggregatedDictionary();
        this.numericTokeniser = new NumericTokeniser(dictionarySet);
        this.digitAbbreviationDictionary = dictionarySet.getDigitAbbreviationDictionary();
        this.abbreviationDictionary = dictionarySet.getAbbreviationDictionary();
        // Deliberately indexed by UTF-16 unit (not by code point) to mirror
        // the lookups performed by the scanning loops below.
        for (int i = 0; i < str.length(); i++) {
            this.separatorCharactersSet.add(str.codePointAt(i));
        }
    }

    /**
     * Splits on separators and then on lower-case to upper-case transitions.
     *
     * @param str the string to split
     * @return the resulting tokens, in order of appearance
     * @deprecated performs exactly the same work as {@link #naiveTokensation(String)}
     */
    @Deprecated
    List<String> naiveTokensationRetainCase(String str) {
        return tokeniseOnLowercaseToUppercase(tokeniseOnSeparators(str));
    }

    /**
     * Splits on separators and then on lower-case to upper-case transitions.
     *
     * @param str the string to split
     * @return the resulting tokens, in order of appearance
     */
    public List<String> naiveTokensation(String str) {
        return tokeniseOnLowercaseToUppercase(tokeniseOnSeparators(str));
    }

    /**
     * Fully tokenises a string: separators first, then lower-case to
     * upper-case transitions; tokens containing a digit are additionally
     * passed through the numeric tokeniser, and every resulting token is
     * finally examined for an upper-case to lower-case boundary.
     *
     * @param str the string to split
     * @return the resulting tokens, in order of appearance
     */
    public List<String> tokenise(String str) {
        List<String> result = new ArrayList<>();
        for (String token : tokeniseOnLowercaseToUppercase(tokeniseOnSeparators(str))) {
            if (CONTAINS_DIGIT.matcher(token).matches()) {
                // The flag tells the numeric tokeniser whether the input ends with this token.
                result.addAll(tokeniseOnUppercaseToLowercase(
                        this.numericTokeniser.tokenise(token, str.endsWith(token))));
            } else {
                result.addAll(tokeniseOnUppercaseToLowercase(token));
            }
        }
        return result;
    }

    /**
     * Records token boundary indices for {@code str} in {@link #boundaries}.
     *
     * @param str the string to scan
     * @deprecated not called anywhere in this class
     */
    @Deprecated
    private void findBoundaries(String str) {
        boolean beforeFirstToken = true;
        final int lastIndex = str.length() - 1;
        for (int i = 0; i <= lastIndex; i++) {
            final int current = str.codePointAt(i);
            if (isSeparator(current)) {
                // A separator directly after a token character closes that token.
                if (!beforeFirstToken && i > 0 && !isSeparator(str.codePointAt(i - 1))) {
                    this.boundaries.add(i - 1);
                }
            } else if (beforeFirstToken) {
                beforeFirstToken = false;
                this.boundaries.add(i);
                if (i == lastIndex) {
                    this.boundaries.add(i);
                }
            } else {
                // i >= 1 here: an earlier non-separator character has been seen.
                final int previous = str.codePointAt(i - 1);
                if (Character.isLowerCase(current)) {
                    if (Character.isUpperCase(previous) && i > 2
                            && Character.isUpperCase(str.codePointAt(i - 2))) {
                        this.boundaries.add(i - 2);
                        this.boundaries.add(i - 1);
                    } else if (isSeparator(previous)) {
                        this.boundaries.add(i);
                    }
                } else if (Character.isUpperCase(current)) {
                    if (Character.isLowerCase(previous)) {
                        this.boundaries.add(i - 1);
                        this.boundaries.add(i);
                    } else if (isSeparator(previous)) {
                        this.boundaries.add(i);
                    }
                } else if (!Character.isDigit(current)) {
                    LOGGER.warn("Unexpected unicode character in: \"{}\"", str);
                } else if (isSeparator(previous)) {
                    this.boundaries.add(i);
                }
                if (i == lastIndex) {
                    this.boundaries.add(i);
                }
            }
        }
    }

    /** Returns whether the given code point is one of the configured separators. */
    private boolean isSeparator(int codePoint) {
        return this.separatorCharactersSet.contains(codePoint);
    }

    /**
     * Splits {@code str} into the maximal runs of non-separator characters.
     *
     * @param str the string to split
     * @return the runs, in order of appearance; empty if there are none
     */
    private List<String> tokeniseOnSeparators(String str) {
        List<Integer> bounds = new ArrayList<>();
        final int lastIndex = str.length() - 1;
        for (int i = 0; i <= lastIndex; i++) {
            if (isSeparator(str.codePointAt(i))) {
                // End of the run that finishes just before this separator.
                if (i > 0 && !isSeparator(str.codePointAt(i - 1))) {
                    bounds.add(i - 1);
                }
                // Start of the run that begins just after this separator.
                if (i < lastIndex && !isSeparator(str.codePointAt(i + 1))) {
                    bounds.add(i + 1);
                }
            } else {
                if (i == 0) {
                    bounds.add(i);
                }
                if (i == lastIndex) {
                    bounds.add(i);
                }
            }
        }
        return sliceByBoundaries(str, bounds);
    }

    /**
     * Splits {@code str} wherever an upper-case character directly follows a
     * lower-case character.
     *
     * @param str the string to split
     * @return the pieces, in order of appearance; empty for an empty string
     */
    private List<String> tokeniseOnLowercaseToUppercase(String str) {
        List<Integer> bounds = new ArrayList<>();
        final int lastIndex = str.length() - 1;
        for (int i = 0; i <= lastIndex; i++) {
            if (i == 0) {
                bounds.add(i);
            } else if (Character.isUpperCase(str.codePointAt(i))
                    && Character.isLowerCase(str.codePointAt(i - 1))) {
                bounds.add(i - 1);
                bounds.add(i);
            }
            if (i == lastIndex) {
                bounds.add(i);
            }
        }
        return sliceByBoundaries(str, bounds);
    }

    /**
     * Applies {@link #tokeniseOnLowercaseToUppercase(String)} to every element
     * and concatenates the results.
     */
    private List<String> tokeniseOnLowercaseToUppercase(List<String> list) {
        List<String> tokens = new ArrayList<>();
        for (String element : list) {
            tokens.addAll(tokeniseOnLowercaseToUppercase(element));
        }
        return tokens;
    }

    /**
     * Cuts {@code str} into substrings described by consecutive
     * (inclusive start, inclusive end) index pairs.
     *
     * <p>An odd number of indices is logged; the unpaired trailing index is
     * then ignored rather than read past the end of the list.
     */
    private static List<String> sliceByBoundaries(String str, List<Integer> bounds) {
        if (bounds.size() % 2 == 1) {
            LOGGER.warn("Odd number of boundaries found for: \"{}\"", str);
        }
        List<String> tokens = new ArrayList<>(bounds.size() / 2);
        for (int i = 0; i + 1 < bounds.size(); i += 2) {
            tokens.add(str.substring(bounds.get(i), bounds.get(i + 1) + 1));
        }
        return tokens;
    }

    /**
     * Splits {@code str} at its first upper-case to lower-case boundary,
     * unless it is a known (digit) abbreviation or has no such boundary
     * after index 0. Of the two candidate splits (before or after the
     * upper-case character) the one with the higher
     * {@code percentageKnown} score is kept; ties favour splitting before.
     *
     * @param str the token to examine
     * @return one or two tokens
     */
    private ArrayList<String> tokeniseOnUppercaseToLowercase(String str) {
        ArrayList<String> tokens = new ArrayList<>();
        if (this.digitAbbreviationDictionary.isWord(str) || this.abbreviationDictionary.isWord(str)) {
            tokens.add(str);
            return tokens;
        }

        final int boundary = getUcLcBoundary(str);
        if (boundary <= 0) {
            // No boundary (-1), or the boundary is the first character: keep whole.
            tokens.add(str);
            return tokens;
        }

        // Declared as ArrayList because the parameter type of percentageKnown
        // is not visible from this file.
        ArrayList<String> splitBefore = new ArrayList<>();
        splitBefore.add(str.substring(0, boundary));
        splitBefore.add(str.substring(boundary));

        ArrayList<String> splitAfter = new ArrayList<>();
        splitAfter.add(str.substring(0, boundary + 1));
        splitAfter.add(str.substring(boundary + 1));

        if (this.aggregatedDictionary.percentageKnown(splitBefore)
                >= this.aggregatedDictionary.percentageKnown(splitAfter)) {
            tokens.addAll(splitBefore);
        } else {
            tokens.addAll(splitAfter);
        }
        return tokens;
    }

    /**
     * Applies {@link #tokeniseOnUppercaseToLowercase(String)} to every element
     * and concatenates the results.
     *
     * @param arrayList the tokens to examine
     * @return the concatenated results, in order
     */
    ArrayList<String> tokeniseOnUppercaseToLowercase(ArrayList<String> arrayList) {
        ArrayList<String> tokens = new ArrayList<>();
        for (String element : arrayList) {
            tokens.addAll(tokeniseOnUppercaseToLowercase(element));
        }
        return tokens;
    }

    /**
     * Finds the first index at which an upper-case character is directly
     * followed by a lower-case character.
     *
     * @param str the string to scan
     * @return the index of the upper-case character, or {@code -1} if none
     */
    private static int getUcLcBoundary(String str) {
        for (int i = 0; i < str.length() - 1; i++) {
            if (Character.isUpperCase(str.codePointAt(i))
                    && Character.isLowerCase(str.codePointAt(i + 1))) {
                return i;
            }
        }
        return -1;
    }
}
