package ws.palladian.extraction.token;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang3.Validate;
import ws.palladian.classification.utils.CsvDatasetReaderConfig;
import ws.palladian.core.Instance;
import ws.palladian.core.Token;
import ws.palladian.extraction.sentence.PalladianSentenceDetector;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.constants.Language;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:ws/palladian/extraction/token/Tokenizer.class */
public final class Tokenizer {
    /**
     * Regular expression describing a single token. Its alternatives are, in order: one or more
     * groups of an upper-case letter, an optional lower-case letter and a period; runs of
     * letter/word characters optionally chained by a hyphen, period or comma; a period directly
     * followed by letter/word characters; a simple opening or closing tag in angle brackets; a
     * dollar sign followed by digits, a period and digits; and runs of characters that are neither
     * word characters, whitespace nor an opening angle bracket.
     * <p>
     * The constant is not referenced by any method of this class.
     */
    public static final String TOKEN_SPLIT_REGEX = "(?:[A-Z][a-z]?\\.)+|[\\p{L}\\w+]+(?:[-\\.,][\\p{L}\\w]+)*|\\.[\\p{L}\\w]+|</?[\\p{L}\\w]+>|\\$\\d+\\.\\d+|[^\\w\\s<]+";
    /**
     * Regular expression for English sentence boundaries: a negative look-behind listing
     * abbreviations and similar prefixes (for example St, Mr, Dr, Prof, etc), followed by the
     * terminating punctuation (period, question marks or exclamation marks, optionally with closing
     * quotes), followed by a negative look-ahead that rejects, among others, digits, quotes,
     * closing brackets and common domain suffixes.
     * <p>
     * The constant is not referenced by any method of this class.
     */
    public static final String SENTENCE_SPLIT_REGEX_EN = "(?<!(\\.|\\()|([A-Z]\\.[A-Z]){1,10}|St|Mr|mr|Vers|Dr|dr|Prof|Nr|Rev|Mrs|mrs|Jr|jr|vs| eg|e\\.g|ca|max|Min|etc| sq| ft)((\\.|\\?|\\!)(’|”|\")+(?=\\s+[A-Z])|\\.|\\?+|\\!+)(?!(\\.|[0-9]|\"|”|'|\\)|[!?]|(com|de|fr|uk|au|ca|cn|org|net)/?\\s|\\()|[A-Za-z]{1,15}\\.|[A-Za-z]{1,15}\\(\\))";
    /**
     * German counterpart of {@link #SENTENCE_SPLIT_REGEX_EN}: same overall structure (negative
     * look-behind, terminating punctuation, negative look-ahead) with a longer list of German
     * abbreviations in the look-behind.
     * <p>
     * NOTE(review): the look-behind alternative {@code mind.} contains an unescaped dot, so it
     * matches any character after "mind" - confirm whether a literal period was intended.
     * <p>
     * The constant is not referenced by any method of this class.
     */
    public static final String SENTENCE_SPLIT_REGEX_DE = "(?<!(\\.|\\()|([A-Z]\\.[A-Z]){1,10}|St|[mM]r|[dD]r|Ca|Mio|Mind|u\\.A|Inkl|Vers|Prof|[mM]s|zusätzl|äquiv|komp|quiet|elektr\\.|[jJ]r|vs|ca|engl|evtl|max|mind.|etc|Nr|Rev| sog| ident|bzw|i\\.d\\.R|v\\.a|u\\.v\\.m|o\\.k|zzgl|Min|Keyb|Elec|bspw|bsp|m\\.E|bezügl|bzgl|inkl|exkl|ggf|z\\.\\s?[bB]| max| min|\\s[a-z]|u\\.s\\.w|u\\.\\s?a|d\\.h)((\\.|\\?|\\!)(”|\")\\s[A-Z]|\\.|\\?+|\\!+)(?!(\\.|[0-9]|\"|”|'|\\)| B\\.|[!?]|(com|de|fr|uk|au|ca|cn|org|net)/?\\s|\\()|[A-Za-z]{1,15}\\.|[A-Za-z]{1,15}\\(\\))";

    /** Private so the class cannot be instantiated from outside; every method here is static. */
    private Tokenizer() {
    }

    /**
     * Tokenizes a text with a {@link WordTokenizer} and returns the string value of every token.
     *
     * @param str the text to tokenize
     * @return a new list holding the values of the tokens the tokenizer iterates over
     */
    public static List<String> tokenize(String str) {
        WordTokenizer wordTokenizer = new WordTokenizer();
        return CollectionHelper.newArrayList(
                CollectionHelper.convert(wordTokenizer.iterateTokens(str), Token.VALUE_CONVERTER));
    }

    /**
     * Computes the distinct character n-grams of a text for a single n-gram size.
     *
     * @param str the text to split into character n-grams
     * @param i the n-gram size; passed as both constructor arguments of the
     *        {@link CharacterNGramTokenizer} (presumably its minimum and maximum length)
     * @return a new set holding the values of the tokens the tokenizer iterates over
     */
    public static Set<String> calculateCharNGrams(String str, int i) {
        CharacterNGramTokenizer nGramTokenizer = new CharacterNGramTokenizer(i, i);
        return CollectionHelper.newHashSet(
                CollectionHelper.convert(nGramTokenizer.iterateTokens(str), Token.VALUE_CONVERTER));
    }

    /**
     * Collects the edge character n-grams of every space-separated word of a text for all sizes
     * between {@code i} and {@code i2}.
     * <p>
     * Shorthand for the four-argument overload with its boolean flag set to {@code false}, i.e.
     * both the leading and the trailing n-gram of each word are collected.
     *
     * @param str the text whose words are processed
     * @param i the smallest n-gram size (inclusive)
     * @param i2 the largest n-gram size (inclusive)
     * @return the set of edge n-grams
     */
    public static Set<String> calculateAllCharEdgeNGrams(String str, int i, int i2) {
        return calculateAllCharEdgeNGrams(str, i, i2, false);
    }

    /**
     * Collects the edge character n-grams of every space-separated word of a text for all sizes
     * between {@code i} and {@code i2}.
     *
     * @param str the text; it is split at single space characters
     * @param i the smallest n-gram size (inclusive)
     * @param i2 the largest n-gram size (inclusive); nothing is collected when it is below {@code i}
     * @param z forwarded to {@link #calculateCharEdgeNGrams(String, int, boolean)}; when
     *        {@code true} only the leading n-gram of each word is collected
     * @return the union of the edge n-grams of all words and sizes
     */
    public static Set<String> calculateAllCharEdgeNGrams(String str, int i, int i2, boolean z) {
        Set<String> edgeNGrams = new HashSet<String>();
        String[] words = str.split(" ");
        for (int w = 0; w < words.length; w++) {
            int size = i;
            while (size <= i2) {
                edgeNGrams.addAll(calculateCharEdgeNGrams(words[w], size, z));
                size++;
            }
        }
        return edgeNGrams;
    }

    /**
     * Returns the character n-grams located at the edges of a string: the first {@code i}
     * characters and, unless {@code z} is set, the last {@code i} characters.
     *
     * @param str the string to take the n-grams from
     * @param i the n-gram size in characters; a non-positive size yields the empty string as the
     *        only n-gram
     * @param z when {@code true} only the leading n-gram is returned
     * @return a new set with one or two n-grams, or an empty set when {@code str} is shorter than
     *         {@code i}
     */
    public static Set<String> calculateCharEdgeNGrams(String str, int i, boolean z) {
        Set<String> edgeNGrams = new HashSet<String>();
        int length = str.length();
        if (length < i) {
            return edgeNGrams;
        }
        // Only the two edges are ever emitted, so take them directly instead of visiting every offset.
        // A non-positive size degenerates to the empty n-gram.
        int size = Math.max(i, 0);
        edgeNGrams.add(str.substring(0, size));
        if (!z) {
            edgeNGrams.add(str.substring(length - size));
        }
        return edgeNGrams;
    }

    /**
     * Computes the distinct word n-grams of a text for a single n-gram size.
     * <p>
     * Delegates to {@link #calculateAllWordNGrams(String, int, int)} with {@code i} as both the
     * lower and the upper size.
     *
     * @param str the text to process
     * @param i the n-gram size
     * @return the set of word n-grams
     */
    public static Set<String> calculateWordNGrams(String str, int i) {
        return calculateAllWordNGrams(str, i, i);
    }

    /**
     * Computes the word n-grams of a text for a single n-gram size and returns them as a list, so
     * repeated n-grams are kept.
     *
     * @param str the text to process
     * @param i the n-gram size; passed as both size arguments of the {@link NGramWrapperIterator}
     * @return a new list holding the values of the tokens produced by the n-gram iterator
     */
    public static List<String> calculateWordNGramsAsList(String str, int i) {
        WordTokenizer wordTokenizer = new WordTokenizer();
        return CollectionHelper.newArrayList(
                CollectionHelper.convert(new NGramWrapperIterator(wordTokenizer.iterateTokens(str), i, i), Token.VALUE_CONVERTER));
    }

    /**
     * Computes the distinct character n-grams of a text for a range of n-gram sizes.
     *
     * @param str the text to split into character n-grams
     * @param i first constructor argument of the {@link CharacterNGramTokenizer} (presumably the
     *        minimum n-gram length)
     * @param i2 second constructor argument of the {@link CharacterNGramTokenizer} (presumably the
     *        maximum n-gram length)
     * @return a new set holding the values of the tokens the tokenizer iterates over
     */
    public static Set<String> calculateAllCharNGrams(String str, int i, int i2) {
        CharacterNGramTokenizer nGramTokenizer = new CharacterNGramTokenizer(i, i2);
        return CollectionHelper.newHashSet(
                CollectionHelper.convert(nGramTokenizer.iterateTokens(str), Token.VALUE_CONVERTER));
    }

    /**
     * Computes the distinct word n-grams of a text for a range of n-gram sizes.
     *
     * @param str the text to process
     * @param i first size argument of the {@link NGramWrapperIterator} (presumably the minimum
     *        number of words per n-gram)
     * @param i2 second size argument of the {@link NGramWrapperIterator} (presumably the maximum
     *        number of words per n-gram)
     * @return a new set holding the values of the tokens produced by the n-gram iterator
     */
    public static Set<String> calculateAllWordNGrams(String str, int i, int i2) {
        WordTokenizer wordTokenizer = new WordTokenizer();
        return CollectionHelper.newHashSet(
                CollectionHelper.convert(new NGramWrapperIterator(wordTokenizer.iterateTokens(str), i, i2), Token.VALUE_CONVERTER));
    }

    /**
     * Computes the ways in which a text can be cut into consecutive word n-grams.
     * <p>
     * Each split is a list of n-grams that, read in order, cover the text from start to end. The
     * text is trimmed first: the recursive worker requires every n-gram to be a prefix of the
     * remaining text, and leading whitespace would otherwise keep it from ever making progress.
     *
     * @param str the text to split; must not be {@code null} or empty
     * @param i the minimum number of words per n-gram
     * @param i2 the maximum number of words per n-gram
     * @param i3 the maximum number of splits to compute
     * @return the set of splits found; empty when the text contains only whitespace or no split
     *         exists
     * @throws NullPointerException if {@code str} is {@code null}
     * @throws IllegalArgumentException if {@code str} is empty
     */
    public static Set<List<String>> computeSplits(String str, int i, int i2, int i3) {
        // notEmpty already rejects null, so no separate null check is needed
        Validate.notEmpty(str);
        Set<List<String>> splits = new HashSet<List<String>>();
        String trimmed = str.trim();
        if (!trimmed.isEmpty()) {
            computeSplits(splits, new ArrayList<String>(), trimmed, i, i2, i3);
        }
        return splits;
    }

    /**
     * Recursive worker behind the public {@code computeSplits}: extends the partial split in
     * {@code list} with every n-gram that can start {@code str} and records a copy of the list
     * whenever the text has been consumed completely.
     * <p>
     * Each level removes exactly the element it added, so {@code list} is back in its original
     * state when the method returns, also when the limit stops the search early.
     *
     * @param set receives the completed splits
     * @param list the n-grams chosen so far; used as a stack while recursing
     * @param str the text that still has to be split
     * @param i the minimum number of words per n-gram
     * @param i2 the maximum number of words per n-gram
     * @param i3 the maximum number of splits to collect
     */
    private static void computeSplits(Set<List<String>> set, List<String> list, String str, int i, int i2, int i3) {
        if (set.size() >= i3) {
            return;
        }
        if (str.isEmpty()) {
            set.add(new ArrayList<String>(list));
            return;
        }
        for (String nGram : computeStartingWordNGrams(str, i, i2)) {
            if (set.size() >= i3) {
                break;
            }
            if (!str.startsWith(nGram)) {
                // Recursing on the unchanged text would never terminate, so skip n-grams that make no progress
                continue;
            }
            list.add(nGram);
            computeSplits(set, list, str.substring(nGram.length()).trim(), i, i2, i3);
            list.remove(list.size() - 1);
        }
    }

    /**
     * Builds the word n-grams that begin at the first word of a text, longest first.
     * <p>
     * The text is split at single spaces. The longest n-gram has {@code min(i2, number of words)}
     * words and each following one has one word less, down to {@code i} words. N-grams that are
     * empty after trimming are left out.
     *
     * @param str the text whose leading n-grams are built
     * @param i the minimum number of words per n-gram
     * @param i2 the maximum number of words per n-gram
     * @return the leading n-grams ordered from longest to shortest
     */
    public static List<String> computeStartingWordNGrams(String str, int i, int i2) {
        String[] words = str.split(" ");
        int longest = Math.min(i2, words.length);
        int variants = Math.min(words.length, (longest - i) + 1);
        List<String> nGrams = new ArrayList<String>();
        for (int dropped = 0; dropped < variants; dropped++) {
            int wordCount = longest - dropped;
            // NO_CATEGORY_DUMMY is the seed of every n-gram (presumably the empty string - TODO confirm)
            StringBuilder joined = new StringBuilder().append(Instance.NO_CATEGORY_DUMMY);
            for (int w = 0; w < wordCount; w++) {
                joined.append(words[w]).append(" ");
            }
            String nGram = joined.toString().trim();
            if (!nGram.isEmpty()) {
                nGrams.add(nGram);
            }
        }
        return nGrams;
    }

    /**
     * Returns the sentence of a text that is associated with a character position, using
     * {@link Language#ENGLISH} for sentence detection.
     * <p>
     * Delegates to the private language-aware overload.
     *
     * @param str the text to search
     * @param i the character position inside {@code str}; for a negative position the whole text
     *        is returned
     * @return the sentence selected for the position
     */
    public static String getSentence(String str, int i) {
        return getSentence(str, i, Language.ENGLISH);
    }

    /**
     * Returns the last sentence of a text that starts at or before a character position.
     * <p>
     * Sentences are located in order with a running search offset, so a sentence whose wording
     * also occurs earlier in the text is matched at its own position and not at the first
     * occurrence.
     *
     * @param str the text to search
     * @param i the character position inside {@code str}; for a negative position the whole text
     *        is returned
     * @param language the language handed to the sentence detection
     * @return the last detected sentence starting at or before {@code i}, or
     *         {@code Instance.NO_CATEGORY_DUMMY} (presumably the empty string - TODO confirm) when
     *         the first sentence already starts after the position
     */
    private static String getSentence(String str, int i, Language language) {
        if (i < 0) {
            return str;
        }
        String selected = Instance.NO_CATEGORY_DUMMY;
        int searchFrom = 0;
        for (String sentence : getSentences(str, language)) {
            int start = str.indexOf(sentence, searchFrom);
            if (start < 0) {
                // Not found behind the previous sentence: fall back to a search over the whole text
                start = str.indexOf(sentence);
            }
            if (start > i) {
                break;
            }
            if (start >= 0) {
                searchFrom = Math.max(searchFrom, start + sentence.length());
            }
            selected = sentence;
        }
        return selected;
    }

    /**
     * Splits a text into sentences using {@link Language#ENGLISH}.
     * <p>
     * Delegates to {@link #getSentences(String, boolean, Language)}.
     *
     * @param str the text to split
     * @param z when {@code true} the detected sentences are filtered as described on the
     *        three-argument overload
     * @return the sentences
     */
    public static List<String> getSentences(String str, boolean z) {
        return getSentences(str, z, Language.ENGLISH);
    }

    /**
     * Splits a text into sentences with a {@link PalladianSentenceDetector}.
     * <p>
     * When {@code z} is set, each detected sentence is reduced to its last line and only kept if
     * that line ends with a period, an exclamation mark, the
     * {@code CsvDatasetReaderConfig.Builder.DEFAULT_NULL_VALUE} marker (presumably a question mark
     * - TODO confirm) or a period followed by a closing quote, and if, after removing surrounding
     * quotes, it is longer than 8 characters and contains more than two whitespace-separated parts.
     *
     * @param str the text to split
     * @param z whether to apply the filtering described above
     * @param language the language used to create the sentence detector
     * @return all detected sentences, or the trimmed last lines that passed the filter
     */
    public static List<String> getSentences(String str, boolean z, Language language) {
        List<String> sentences = CollectionHelper.convertList(CollectionHelper.newArrayList(new PalladianSentenceDetector(language).iterateTokens(str)), Token.VALUE_CONVERTER);
        if (!z) {
            return sentences;
        }
        List<String> filtered = new ArrayList<String>();
        for (String sentence : sentences) {
            String[] lines = sentence.split("\n");
            if (lines.length == 0) {
                // split drops trailing empty strings, so a sentence made only of line breaks has no line to inspect
                continue;
            }
            String lastLine = lines[lines.length - 1];
            boolean terminated = lastLine.endsWith(".") || lastLine.endsWith(CsvDatasetReaderConfig.Builder.DEFAULT_NULL_VALUE) || lastLine.endsWith("!") || lastLine.endsWith(".”") || lastLine.endsWith(".\"");
            if (!terminated) {
                continue;
            }
            String unquoted = StringHelper.trim(lastLine, "“”\"");
            int partCount = StringHelper.countWhitespaces(unquoted) + 1;
            if (unquoted.length() > 8 && partCount > 2) {
                filtered.add(lastLine.trim());
            }
        }
        return filtered;
    }

    /**
     * Splits a text into sentences using {@link Language#ENGLISH} and without filtering.
     * <p>
     * Delegates to {@link #getSentences(String, Language)}.
     *
     * @param str the text to split
     * @return the detected sentences
     */
    public static List<String> getSentences(String str) {
        return getSentences(str, Language.ENGLISH);
    }

    /**
     * Splits a text into sentences for the given language without filtering.
     * <p>
     * Delegates to {@link #getSentences(String, boolean, Language)} with the flag set to
     * {@code false}.
     *
     * @param str the text to split
     * @param language the language used for sentence detection
     * @return the detected sentences
     */
    public static List<String> getSentences(String str, Language language) {
        return getSentences(str, false, language);
    }

    /**
     * Returns the tail of a text that follows its last sentence boundary, i.e. the beginning of the
     * sentence the text ends in.
     * <p>
     * The text is first passed through {@code StringHelper.removeDoubleWhitespaces}. Starting at
     * the last period or line break, periods are visited from right to left until one is accepted
     * as a boundary: it is not preceded by a number character and is directly followed by an
     * upper-case letter, or it is followed by a space and then an upper-case letter, a hyphen or an
     * equals sign, or a line break is involved. A later exclamation mark, a later
     * {@code CsvDatasetReaderConfig.Builder.DEFAULT_NULL_VALUE} marker (presumably a question mark
     * - TODO confirm) or a later colon takes precedence over the position found.
     *
     * @param str the text that ends somewhere inside a sentence
     * @return the text after the selected boundary without one leading space; the whole normalised
     *         text when no boundary is found
     */
    public static String getPhraseFromBeginningOfSentence(String str) {
        String text = StringHelper.removeDoubleWhitespaces(str);
        int lastIndex = text.length() - 1;
        int boundary = Math.max(text.lastIndexOf("."), text.lastIndexOf("\n"));
        // A candidate in the very last position has nothing behind it to inspect and is accepted as is
        while (boundary > -1 && boundary < lastIndex) {
            char next = text.charAt(boundary + 1);
            boolean delimiter = false;
            if (boundary > 0) {
                delimiter = !StringHelper.isNumber(text.charAt(boundary - 1)) && Character.isUpperCase(next);
            }
            if (!delimiter && boundary < lastIndex - 1) {
                char afterNext = text.charAt(boundary + 2);
                delimiter = (Character.isUpperCase(afterNext) || afterNext == '-' || afterNext == '=') && next == ' ';
            }
            if (!delimiter && (next == '\n' || text.charAt(boundary) == '\n')) {
                delimiter = true;
            }
            if (delimiter) {
                break;
            }
            // Continue with the closest period to the left; -1 ends the search
            boundary = text.lastIndexOf(".", boundary - 1);
        }
        // Later terminators win over the period found above; a missing one (-1) never exceeds the boundary
        boundary = Math.max(boundary, text.lastIndexOf("!"));
        boundary = Math.max(boundary, text.lastIndexOf(CsvDatasetReaderConfig.Builder.DEFAULT_NULL_VALUE));
        boundary = Math.max(boundary, text.lastIndexOf(":"));
        String phrase = text.substring(boundary + 1);
        return phrase.startsWith(" ") ? phrase.substring(1) : phrase;
    }

    /**
     * Returns the head of a text up to and including its first sentence terminator, i.e. the rest
     * of the sentence the text starts in.
     * <p>
     * Periods are visited from left to right until one is accepted as a sentence delimiter by the
     * checks below. An exclamation mark or a
     * {@code CsvDatasetReaderConfig.Builder.DEFAULT_NULL_VALUE} marker (presumably a question mark
     * - TODO confirm) that occurs before the accepted period, or when no period was accepted, is
     * used instead.
     *
     * @param str the text that starts somewhere inside a sentence
     * @return the text up to and including the selected terminator, or the whole text when no
     *         terminator is found
     */
    public static String getPhraseToEndOfSentence(String str) {
        int indexOf = str.indexOf(".");
        boolean z = false;
        while (!z && indexOf > -1) {
            // NOTE(review): this value is overwritten by the next assignment whenever the period is not
            // the last character, and a period in the last position is accepted further below anyway,
            // so this check never changes the outcome - confirm whether it was meant to be combined.
            if (indexOf > 0) {
                z = !StringHelper.isNumber(str.charAt(indexOf - 1));
            }
            // Directly followed by an upper-case character or a bracket, or preceded by a double quote
            if (indexOf < str.length() - 1) {
                z = (!StringHelper.isNumber(str.charAt(indexOf + 1)) && Character.isUpperCase(str.charAt(indexOf + 1))) || StringHelper.isBracket(str.charAt(indexOf + 1)) || (indexOf > 0 && str.charAt(indexOf - 1) == '\"');
            }
            // Followed by a space and then an upper-case character or a bracket
            if (!z && indexOf < str.length() - 2) {
                z = !StringHelper.isNumber(str.charAt(indexOf + 2)) && (Character.isUpperCase(str.charAt(indexOf + 2)) || StringHelper.isBracket(str.charAt(indexOf + 2))) && str.charAt(indexOf + 1) == ' ';
            }
            // A period at the very end of the text or right before a line break always counts
            if (!z && (str.length() == indexOf + 1 || str.charAt(indexOf + 1) == '\n')) {
                z = true;
            }
            if (z) {
                break;
            }
            // Rejected: move on to the next period, -1 ends the loop
            indexOf = indexOf < str.length() - 1 ? str.indexOf(".", indexOf + 1) : -1;
        }
        // An earlier exclamation mark (or any, if no period was accepted) ends the sentence instead
        if (str.contains("!") && (str.indexOf("!") < indexOf || indexOf == -1)) {
            indexOf = str.indexOf("!");
        }
        // Same rule for the DEFAULT_NULL_VALUE marker
        if (str.contains(CsvDatasetReaderConfig.Builder.DEFAULT_NULL_VALUE) && (str.indexOf(CsvDatasetReaderConfig.Builder.DEFAULT_NULL_VALUE) < indexOf || indexOf == -1)) {
            indexOf = str.indexOf(CsvDatasetReaderConfig.Builder.DEFAULT_NULL_VALUE);
        }
        return str.substring(0, indexOf == -1 ? str.length() : indexOf + 1);
    }

    /**
     * Ad-hoc driver: runs sentence detection 1000 times on a fixed German sample, prints the
     * elapsed time and exits the JVM.
     * <p>
     * The statements after {@code System.exit(0)} are never executed. They print the sentences of a
     * short sample, count the token "Number" in a test file once over the whole text and once
     * sentence by sentence, and append every sentence to {@code sentences.txt}.
     *
     * @param strArr command line arguments; not used
     * @throws IOException declared for the file access in the second part
     */
    public static void main(String[] strArr) throws IOException {
        StopWatch timer = new StopWatch();
        int run = 0;
        while (run < 1000) {
            getSentences("Zum Einen ist das Ding ein bisschen groß und es sieht sehr merkwürdig aus, wenn man damit durch die Stadt läuft und es am Ohr hat und zum Anderen ein bisschen unhandlich.\nNun möchte ich noch etwas über die Akkulaufzeit sagen.");
            run++;
        }
        System.out.println(timer.getElapsedTimeString());
        System.exit(0);

        // Not reached at runtime because of the exit above
        System.out.println(getSentences("the quick brown fox"));
        String content = FileHelper.readFileToString("data/test/tokenizerProblem.txt");
        int hitsInText = 0;
        for (String token : tokenize(content)) {
            if (token.equals("Number")) {
                hitsInText++;
            }
        }
        System.out.println("# occurences 1 : " + hitsInText);
        int hitsInSentences = 0;
        for (String sentence : getSentences(content)) {
            FileHelper.appendFile("sentences.txt", sentence + "\n");
            for (String token : tokenize(sentence)) {
                if (token.equals("Number")) {
                    hitsInSentences++;
                }
            }
        }
        System.out.println("# occurences 2 : " + hitsInSentences);
    }
}
