package uk.ac.open.crc.intt;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:uk/ac/open/crc/intt/BasicTokeniser.class */
/**
 * Splits an identifier name into tokens using separator characters and
 * letter-case boundaries.
 *
 * <p>Tokenisation runs in three passes:
 * <ol>
 *   <li>split on runs of the configured separator characters;</li>
 *   <li>split every lower-case to upper-case transition (e.g. {@code fooBar}
 *       becomes {@code foo}, {@code Bar});</li>
 *   <li>for each resulting token, split at most once around the first
 *       upper-case to lower-case transition, choosing between the two
 *       candidate split points with the aggregated dictionary. Tokens that
 *       contain an ASCII digit are first handed to the
 *       {@code NumericTokeniser}.</li>
 * </ol>
 *
 * <p>The {@code public} modifiers on the constructor and on the two entry
 * points were introduced by decompilation (the members were originally
 * package-private); they are kept so the visible interface does not change.
 */
class BasicTokeniser {
    /** Separator characters used when the caller supplies none. */
    private static final String DEFAULT_SEPARATORS = "_$";

    /** Characters that are backslash-escaped before being placed in the separator character class. */
    private static final String ESCAPABLE_CHARACTERS = "<([{\\^-=$!|]})?*+.>";

    /** A lower-case letter immediately followed by an upper-case letter. */
    private static final Pattern LC_UC_PATTERN = Pattern.compile("\\p{Ll}\\p{Lu}");

    /** An upper-case letter immediately followed by a lower-case letter. */
    private static final Pattern UC_LC_PATTERN = Pattern.compile("\\p{Lu}\\p{Ll}");

    /**
     * Whole-string test for "contains an ASCII digit". Same regular expression
     * the tokeniser has always used, compiled once instead of once per token.
     */
    private static final Pattern CONTAINS_DIGIT_PATTERN = Pattern.compile("^.*[0-9]+.*$");

    private static final Logger LOGGER = LoggerFactory.getLogger(BasicTokeniser.class);

    private final AggregatedDictionary aggregatedDictionary;
    private final DigitAbbreviationDictionary digitAbbreviationDictionary;
    private final AbbreviationDictionary abbreviationDictionary;
    private final NumericTokeniser numericTokeniser;
    private final Pattern separatorPattern;

    /**
     * Creates a tokeniser backed by the given dictionaries.
     *
     * @param dictionarySet source of the aggregated, digit-abbreviation and
     *                      abbreviation dictionaries; also passed to the
     *                      {@code NumericTokeniser} created here
     * @param str           the characters to treat as separators; when
     *                      {@code null} or empty, {@code "_$"} is used
     */
    public BasicTokeniser(DictionarySet dictionarySet, String str) {
        this.aggregatedDictionary = dictionarySet.getAggregatedDictionary();
        this.numericTokeniser = new NumericTokeniser(dictionarySet);
        this.digitAbbreviationDictionary = dictionarySet.getDigitAbbreviationDictionary();
        this.abbreviationDictionary = dictionarySet.getAbbreviationDictionary();
        String separators = (str == null || str.isEmpty()) ? DEFAULT_SEPARATORS : str;
        this.separatorPattern = buildSeparatorPattern(separators);
    }

    /**
     * Builds a pattern matching one or more consecutive separator characters.
     *
     * <p>The separators are visited by code point, de-duplicated, and emitted
     * in input order so the resulting pattern text is deterministic and a
     * supplementary character is never broken into lone surrogates.
     */
    private static Pattern buildSeparatorPattern(String separators) {
        StringBuilder characterClass = new StringBuilder("[");
        separators.codePoints().distinct().forEachOrdered(codePoint -> {
            if (ESCAPABLE_CHARACTERS.indexOf(codePoint) >= 0) {
                characterClass.append('\\');
            }
            characterClass.appendCodePoint(codePoint);
        });
        characterClass.append("]+");
        return Pattern.compile(characterClass.toString());
    }

    /**
     * Tokenises using only the separator and lower-to-upper-case passes; no
     * dictionary is consulted.
     *
     * @param str the identifier name to split
     * @return the tokens in order of appearance
     */
    public List<String> naiveTokensation(String str) {
        return tokeniseOnLowercaseToUppercase(tokeniseOnSeparators(str));
    }

    /**
     * Tokenises an identifier name using all three passes.
     *
     * @param str the identifier name to split
     * @return the tokens in order of appearance
     */
    public List<String> tokenise(String str) {
        List<String> tokens = new ArrayList<>();
        for (String token : tokeniseOnLowercaseToUppercase(tokeniseOnSeparators(str))) {
            if (CONTAINS_DIGIT_PATTERN.matcher(token).matches()) {
                // NOTE(review): str.endsWith(token) is also true for an earlier token that
                // merely equals the identifier's suffix (e.g. both tokens of "a1_a1");
                // confirm against NumericTokeniser whether that matters for the flag.
                tokens.addAll(tokeniseOnUppercaseToLowercase(
                        this.numericTokeniser.tokenise(token, str.endsWith(token))));
            } else {
                tokens.addAll(tokeniseOnUppercaseToLowercase(token));
            }
        }
        return tokens;
    }

    /**
     * Splits on runs of separator characters. Empty strings (produced, for
     * example, when the input starts with a separator) are dropped.
     */
    private List<String> tokeniseOnSeparators(String str) {
        return this.separatorPattern.splitAsStream(str)
                .filter(token -> !token.isEmpty())
                .collect(Collectors.toList());
    }

    /**
     * Splits a single token immediately before every upper-case letter that
     * directly follows a lower-case letter.
     */
    private List<String> tokeniseOnLowercaseToUppercase(String str) {
        List<String> tokens = new ArrayList<>();
        Matcher matcher = LC_UC_PATTERN.matcher(str);
        int tokenStart = 0;
        while (matcher.find()) {
            // The match ends just after the upper-case letter; step back one code
            // point (not one char) so a supplementary letter is not cut in half.
            int boundary = str.offsetByCodePoints(matcher.end(), -1);
            tokens.add(str.substring(tokenStart, boundary));
            tokenStart = boundary;
        }
        tokens.add(str.substring(tokenStart));
        return tokens;
    }

    /** Applies the lower-to-upper-case split to every token, preserving order. */
    private List<String> tokeniseOnLowercaseToUppercase(List<String> list) {
        List<String> tokens = new ArrayList<>();
        for (String token : list) {
            tokens.addAll(tokeniseOnLowercaseToUppercase(token));
        }
        return tokens;
    }

    /**
     * Splits a token at most once around its first upper-case to lower-case
     * transition.
     *
     * <p>The token is returned whole when either abbreviation dictionary
     * reports it as a word, when it has no such transition, or when the
     * transition is at index 0 (e.g. {@code Editor}). Otherwise two candidate
     * splits are formed, one before and one after the upper-case letter (for
     * {@code HTMLEditor}: {@code HTML|Editor} and {@code HTMLE|ditor}), and
     * the one the aggregated dictionary scores higher via
     * {@code percentageKnown} is returned; ties go to the split before the
     * upper-case letter.
     */
    private List<String> tokeniseOnUppercaseToLowercase(String str) {
        List<String> tokens = new ArrayList<>();
        if (this.digitAbbreviationDictionary.isWord(str) || this.abbreviationDictionary.isWord(str)) {
            tokens.add(str);
            return tokens;
        }

        int boundary = getUcLcBoundary(str);
        if (boundary == -1 || boundary == 0) {
            tokens.add(str);
            return tokens;
        }

        // Declared as ArrayList because the exact parameter type of percentageKnown
        // is not visible from this class.
        ArrayList<String> splitBeforeUpper = new ArrayList<>();
        splitBeforeUpper.add(str.substring(0, boundary));
        splitBeforeUpper.add(str.substring(boundary));

        // Advance by the width of the upper-case code point so that a
        // supplementary letter is not split between its surrogates.
        int afterUpper = boundary + Character.charCount(str.codePointAt(boundary));
        ArrayList<String> splitAfterUpper = new ArrayList<>();
        splitAfterUpper.add(str.substring(0, afterUpper));
        splitAfterUpper.add(str.substring(afterUpper));

        if (this.aggregatedDictionary.percentageKnown(splitBeforeUpper)
                >= this.aggregatedDictionary.percentageKnown(splitAfterUpper)) {
            tokens.addAll(splitBeforeUpper);
        } else {
            tokens.addAll(splitAfterUpper);
        }
        return tokens;
    }

    /**
     * Applies the upper-to-lower-case split to every token, preserving order.
     *
     * @param arrayList the tokens to process
     * @return a new list holding the resulting tokens
     */
    List<String> tokeniseOnUppercaseToLowercase(ArrayList<String> arrayList) {
        List<String> tokens = new ArrayList<>();
        for (String token : arrayList) {
            tokens.addAll(tokeniseOnUppercaseToLowercase(token));
        }
        return tokens;
    }

    /**
     * Returns the index of the first upper-case letter that is directly
     * followed by a lower-case letter, or {@code -1} when there is none.
     */
    private int getUcLcBoundary(String str) {
        Matcher matcher = UC_LC_PATTERN.matcher(str);
        return matcher.find() ? matcher.start() : -1;
    }
}
