package com.kotlinnlp.languagedetector.utils;

import com.kotlinnlp.neuraltokenizer.NeuralTokenizer;
import com.kotlinnlp.neuraltokenizer.NeuralTokenizerModel;
import com.kotlinnlp.neuraltokenizer.Sentence;
import com.kotlinnlp.neuraltokenizer.Token;
import com.kotlinnlp.utils.IOKt;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import kotlin.Metadata;
import kotlin.TypeCastException;
import kotlin.collections.CollectionsKt;
import kotlin.collections.SetsKt;
import kotlin.io.TextStreamsKt;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.text.Charsets;
import kotlin.text.StringsKt;
import org.jetbrains.annotations.NotNull;

/* compiled from: TextTokenizer.kt */
@Metadata(mv = {1, 1, 15}, bv = {1, 0, 3}, k = 1, d1 = {"��R\n\u0002\u0018\u0002\n\u0002\u0010��\n��\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0010\"\n\u0002\u0010\f\n��\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n��\n\u0002\u0010!\n\u0002\u0010\u000e\n��\n\u0002\u0010\u0002\n\u0002\b\u0003\n\u0002\u0010\b\n\u0002\b\u0002\n\u0002\u0010 \n\u0002\b\u0003\n\u0002\u0010\u000b\n\u0002\b\u0002\u0018�� \u001c2\u00020\u0001:\u0001\u001cB\r\u0012\u0006\u0010\u0002\u001a\u00020\u0003¢\u0006\u0002\u0010\u0004J\b\u0010\u000f\u001a\u00020\u0010H\u0002J\u0018\u0010\u0011\u001a\u00020\u00102\u0006\u0010\u0012\u001a\u00020\u00072\u0006\u0010\u0013\u001a\u00020\u0014H\u0002J\b\u0010\u0015\u001a\u00020\u0010H\u0002J\u001c\u0010\u0016\u001a\b\u0012\u0004\u0012\u00020\u000e0\u00172\u0006\u0010\u0018\u001a\u00020\u000e2\u0006\u0010\u0013\u001a\u00020\u0014J\u000e\u0010\u0019\u001a\b\u0012\u0004\u0012\u00020\u000e0\u0017H\u0002J\f\u0010\u001a\u001a\u00020\u001b*\u00020\u000eH\u0002R\u0014\u0010\u0005\u001a\b\u0012\u0004\u0012\u00020\u00070\u0006X\u0082\u0004¢\u0006\u0002\n��R\u000e\u0010\b\u001a\u00020\tX\u0082\u0004¢\u0006\u0002\n��R\u000e\u0010\n\u001a\u00020\u000bX\u0082\u0004¢\u0006\u0002\n��R\u0014\u0010\f\u001a\b\u0012\u0004\u0012\u00020\u000e0\rX\u0082\u0004¢\u0006\u0002\n��¨\u0006\u001d"}, d2 = {"Lcom/kotlinnlp/languagedetector/utils/TextTokenizer;", "", "cjkModel", "Lcom/kotlinnlp/neuraltokenizer/NeuralTokenizerModel;", "(Lcom/kotlinnlp/neuraltokenizer/NeuralTokenizerModel;)V", "cjkChars", "", "", "cjkNeuralTokenizer", "Lcom/kotlinnlp/neuraltokenizer/NeuralTokenizer;", "tokenBuffer", "Ljava/lang/StringBuffer;", "tokens", "", "", "addToken", "", "processChar", "char", "maxTokensLength", "", "resetBuffers", "tokenize", "", "text", "tokenizeCJKTokens", "isCJK", "", "Companion", "languagedetector"})
/* loaded from: input_file:com/kotlinnlp/languagedetector/utils/TextTokenizer.class */
public final class TextTokenizer {
    /** Accumulates the letters of the token currently being read; emptied each time a token is emitted. */
    private final StringBuffer tokenBuffer;
    /** Tokens collected by the current {@code tokenize} call; cleared at the start of every call. */
    private final List<String> tokens;
    /** Characters treated as CJK: the first character of every non-empty line of the "CJKChars.txt" resource. */
    private final Set<Character> cjkChars;
    /** Neural tokenizer, built from the model given to the constructor, used to split tokens classified as CJK. */
    private final NeuralTokenizer cjkNeuralTokenizer;
    /** Minimum fraction (0.4 = 40%) of a token's characters that must be in {@code cjkChars} for it to count as CJK. */
    private static final double MIN_CJK_CHARS_PERCENTAGE = 0.4d;
    /** Singleton instance of the nested {@code Companion} class. */
    public static final Companion Companion = new Companion(null);

    /* compiled from: TextTokenizer.kt */
    @Metadata(mv = {1, 1, 15}, bv = {1, 0, 3}, k = 1, d1 = {"��\u0012\n\u0002\u0018\u0002\n\u0002\u0010��\n\u0002\b\u0002\n\u0002\u0010\u0006\n��\b\u0086\u0003\u0018��2\u00020\u0001B\u0007\b\u0002¢\u0006\u0002\u0010\u0002R\u000e\u0010\u0003\u001a\u00020\u0004X\u0082T¢\u0006\u0002\n��¨\u0006\u0005"}, d2 = {"Lcom/kotlinnlp/languagedetector/utils/TextTokenizer$Companion;", "", "()V", "MIN_CJK_CHARS_PERCENTAGE", "", "languagedetector"})
    /* loaded from: input_file:com/kotlinnlp/languagedetector/utils/TextTokenizer$Companion.class */
    // Companion object of TextTokenizer. It declares no methods or fields of its own here:
    // the MIN_CJK_CHARS_PERCENTAGE constant is stored as a static field of the outer class.
    public static final class Companion {
        // Private so that the only instance is the one held by TextTokenizer.Companion.
        private Companion() {
        }

        // Synthetic accessor constructor: the marker argument is not used, it only
        // distinguishes this signature from the private no-arg constructor it delegates to.
        public /* synthetic */ Companion(DefaultConstructorMarker defaultConstructorMarker) {
            this();
        }
    }

    /**
     * Splits a text into tokens.
     *
     * <p>The text is scanned one {@code char} at a time: runs of letters become tokens,
     * any non-letter char ends the current token, and a token is also cut as soon as it
     * reaches {@code i} chars. Tokens classified as CJK are then split further by the
     * neural tokenizer.
     *
     * <p>The internal buffers of this instance are reset and reused on every call.
     *
     * @param str the text to tokenize (must not be null)
     * @param i the maximum length of a token, in chars (must be greater than zero)
     * @return the list of tokens, in the order in which they appear in the text
     * @throws IllegalArgumentException if {@code i} is not positive
     */
    @NotNull
    public final List<String> tokenize(@NotNull String str, int i) {
        Intrinsics.checkParameterIsNotNull(str, "text");
        if (i <= 0) {
            throw new IllegalArgumentException("Failed requirement.");
        }

        resetBuffers();

        for (char current : str.toCharArray()) {
            processChar(current, i);
        }

        // Flush the trailing token, if the text did not end with a separator.
        if (this.tokenBuffer.length() != 0) {
            addToken();
        }

        return tokenizeCJKTokens();
    }

    /**
     * Consumes one char of the input text.
     *
     * <p>A letter is appended to the token buffer; the buffered token is emitted when the
     * buffer reaches {@code i} chars. Any other char acts as a separator and emits the
     * buffered token (if any) without being added to it.
     *
     * @param c the char to process
     * @param i the maximum length of a token, in chars
     */
    private final void processChar(char c, int i) {
        if (Character.isLetter(c)) {
            this.tokenBuffer.append(c);
            if (this.tokenBuffer.length() < i) {
                // Still room in the current token: keep accumulating.
                return;
            }
        }
        // Reached on a separator or when the token hit the maximum length.
        addToken();
    }

    /**
     * Moves the content of the token buffer to the tokens list and empties the buffer.
     * Nothing is added to the list when the buffer is empty.
     */
    private final void addToken() {
        final String token = this.tokenBuffer.toString();
        this.tokenBuffer.setLength(0);
        if (!token.isEmpty()) {
            this.tokens.add(token);
        }
    }

    /** Empties both the token buffer and the tokens list, discarding any previous state. */
    private final void resetBuffers() {
        this.tokenBuffer.setLength(0);
        this.tokens.clear();
    }

    /**
     * Builds the final tokens list from the collected tokens.
     *
     * <p>Each token classified as CJK is replaced by the forms of the tokens of every
     * sentence returned by the neural tokenizer for it; every other token is kept as is.
     * The original order is preserved.
     *
     * @return a copy of the resulting list of tokens
     */
    private final List<String> tokenizeCJKTokens() {
        final List<String> result = new ArrayList<String>();
        for (String candidate : this.tokens) {
            if (!isCJK(candidate)) {
                result.add(candidate);
                continue;
            }
            for (Object sentence : this.cjkNeuralTokenizer.tokenize(candidate)) {
                for (Object token : ((Sentence) sentence).getTokens()) {
                    result.add(((Token) token).getForm());
                }
            }
        }
        return CollectionsKt.toList(result);
    }

    /**
     * Checks whether a token has to be treated as CJK text.
     *
     * @param str the token to check
     * @return true if the fraction of chars of {@code str} contained in the CJK chars set
     *         is at least {@code MIN_CJK_CHARS_PERCENTAGE}; false otherwise (an empty
     *         string yields a NaN ratio, which compares as false)
     */
    private final boolean isCJK(@NotNull String str) {
        final int length = str.length();
        int matches = 0;
        for (int index = 0; index < length; index++) {
            if (this.cjkChars.contains(Character.valueOf(str.charAt(index)))) {
                matches++;
            }
        }
        return (double) matches / (double) length >= MIN_CJK_CHARS_PERCENTAGE;
    }

    /**
     * Creates a tokenizer that delegates the splitting of CJK tokens to a neural tokenizer.
     *
     * <p>The set of CJK chars is loaded from the "CJKChars.txt" resource.
     *
     * @param neuralTokenizerModel the model of the neural tokenizer used for CJK tokens
     *        (must not be null)
     */
    public TextTokenizer(@NotNull NeuralTokenizerModel neuralTokenizerModel) {
        Intrinsics.checkParameterIsNotNull(neuralTokenizerModel, "cjkModel");
        this.tokenBuffer = new StringBuffer();
        this.tokens = new ArrayList<String>();
        this.cjkChars = loadCjkChars();
        // The bit mask 6 asks for the default values of the 2nd and 3rd constructor
        // parameters, so the two 0.0d arguments are only placeholders.
        this.cjkNeuralTokenizer = new NeuralTokenizer(neuralTokenizerModel, 0.0d, 0.0d, 6, (DefaultConstructorMarker) null);
    }

    /**
     * Reads the "CJKChars.txt" resource (UTF-8, split on "\n") and collects the first char
     * of every non-empty line. The reader is always closed once the content has been read.
     *
     * @return the set of CJK chars
     */
    private static Set<Character> loadCjkChars() {
        final InputStreamReader reader = new InputStreamReader(IOKt.getResourceAsStream("CJKChars.txt"), Charsets.UTF_8);
        final String content;
        try {
            content = TextStreamsKt.readText(reader);
        } finally {
            try {
                reader.close();
            } catch (IOException ignored) {
                // The stream is only read: a failure while closing it cannot affect the
                // content already read, and must not mask an error raised while reading.
            }
        }

        final List<?> lines = StringsKt.split$default(content, new String[]{"\n"}, false, 0, 6, (Object) null);
        final List<Character> chars = new ArrayList<Character>(lines.size());
        for (Object entry : lines) {
            final String line = (String) entry;
            if (line.length() > 0) {
                chars.add(Character.valueOf(line.charAt(0)));
            }
        }
        return SetsKt.setOf(chars.toArray(new Character[0]));
    }
}
