package gate.creole.tokeniser;

import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.ANNIEConstants;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.creole.orthomatcher.OrthoMatcherRule;
import gate.jape.constraint.ConstraintPredicate;
import gate.util.BomStrippingInputStreamReader;
import gate.util.Err;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
import java.io.BufferedReader;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.AbstractSet;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

@CreoleResource(name = "GATE Unicode Tokeniser", comment = "A customisable Unicode tokeniser.", helpURL = "http://gate.ac.uk/userguide/sec:annie:tokeniser", icon = "tokeniser")
/* loaded from: input_file:gate/creole/tokeniser/SimpleTokeniser.class */
public class SimpleTokeniser extends AbstractLanguageAnalyser implements ANNIEConstants {
    /** Serialization version identifier. */
    private static final long serialVersionUID = 1411111968361716069L;

    /** Name of the "document" parameter. */
    public static final String SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document";

    /** Name of the "annotationSetName" parameter. */
    public static final String SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

    /** Name of the "rulesURL" parameter. */
    public static final String SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL";

    /** Name of the "encoding" parameter. */
    public static final String SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding";

    /** Name of the annotation set that receives the tokens; null or empty selects the default set. */
    protected String annotationSetName;

    /** Start state of the non-deterministic FSM built from the rules file. */
    protected FSMState initialState;

    /** Start state of the deterministic FSM produced by {@code eliminateVoidTransitions()}. */
    protected DFSMState dInitialState;

    /**
     * Token that separates the left-hand side of a rule from its right-hand side.
     * NOTE(review): the borrowed constant looks like a decompiler substitution for a
     * literal (presumably ">") - confirm against the original sources.
     */
    static String LHStoRHS = ConstraintPredicate.GREATER;

    /** Tokens skipped while parsing a rule: space, tab and form feed. */
    protected static final Set<String> ignoreTokens;

    /** Maps a {@link Character#getType(int)} value to the internal type id used by the FSMs. */
    protected static final Map<Integer, Integer> typeIds;

    /**
     * Exclusive upper bound used when iterating over internal type ids. It is derived from the
     * number of candidate {@link Character} fields, not from the number of ids actually assigned.
     */
    protected static final int maxTypeId;

    /** Type names indexed by internal type id; trailing entries may be null. */
    protected static final List<String> typeMnemonics;

    /** Maps a type name, as written in the rules file, to its internal type id. */
    protected static final Map<String, Integer> stringTypeIds;

    /** Plain bean property; within this class it is only touched by its getter and setter. */
    private String rulesResourceName;

    /** Location of the rules file read by {@code init()}. */
    private ResourceReference rulesURL;

    /** Character encoding used to read the rules file. */
    private String encoding;

    /**
     * States of the non-deterministic FSM, dumped by {@code getFSMgml()}.
     * NOTE(review): never populated in this class - presumably FSMState registers itself; confirm.
     */
    protected Set<FSMState> fsmStates = new HashSet<>();

    /**
     * States of the deterministic FSM, dumped by {@code getDFSMgml()}.
     * NOTE(review): never populated in this class - presumably DFSMState registers itself; confirm.
     */
    protected Set<DFSMState> dfsmStates = new HashSet<>();

    /** Scratch map from a set of NFA states to the DFA state that represents it. */
    protected transient Map<Set<FSMState>, DFSMState> newStates = new HashMap<>();

    /**
     * Reads the rules file and compiles it into the deterministic FSM used by {@code execute()}.
     *
     * <p>A line ending in a backslash is joined with the following line (the backslash itself is
     * dropped) before being handed to {@code parseRule}. A trailing continuation with no
     * following line is never parsed.
     *
     * @return this resource
     * @throws ResourceInstantiationException if no rules URL is set, the rules cannot be read,
     *         or a rule is invalid
     */
    public Resource init() throws ResourceInstantiationException {
        if (this.rulesURL == null) {
            throw new ResourceInstantiationException("No URL provided for the rules!");
        }
        // try-with-resources guarantees the reader is closed even when a rule fails to parse.
        try (BufferedReader reader = new BufferedReader(
                new BomStrippingInputStreamReader(this.rulesURL.openStream(), this.encoding))) {
            this.initialState = new FSMState(this);
            StringBuilder pendingRule = new StringBuilder(1024);
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.endsWith("\\")) {
                    // Continuation line: keep accumulating, minus the trailing backslash.
                    pendingRule.append(line, 0, line.length() - 1);
                } else {
                    pendingRule.append(line);
                    parseRule(pendingRule.toString());
                    pendingRule.setLength(0);
                }
            }
            eliminateVoidTransitions();
            return this;
        } catch (TokeniserException e) {
            throw new ResourceInstantiationException(e);
        } catch (IOException e) {
            throw new ResourceInstantiationException(e);
        }
    }

    /** Drops the reference to the current document. */
    public void reset() {
        document = null;
    }

    /**
     * Parses one logical line of the rules file and adds it to the non-deterministic FSM.
     *
     * <p>Lines starting with {@code #} or {@code //} are ignored. Otherwise a fresh state is
     * linked from the initial state by a void transition, the left-hand side is parsed up to the
     * LHS/RHS separator, and whatever text remains (up to a form feed) becomes the right-hand
     * side of the state that ends the left-hand side.
     *
     * @param str the rule text
     * @throws TokeniserException if the left-hand side is malformed
     */
    void parseRule(String str) throws TokeniserException {
        boolean comment = str.startsWith("#") || str.startsWith("//");
        if (!comment) {
            StringTokenizer tokens = new StringTokenizer(str, "()+*|\" \t\f>", true);
            FSMState ruleStart = new FSMState(this);
            this.initialState.put((UnicodeType) null, ruleStart);
            FSMState ruleEnd = parseLHS(ruleStart, tokens, LHStoRHS);

            String rhs = OrthoMatcherRule.description;
            if (tokens.hasMoreTokens()) {
                // Switching the delimiter set to form feed returns the rest of the line in one token.
                rhs = tokens.nextToken("\f");
            }
            if (!rhs.isEmpty()) {
                ruleEnd.setRhs(rhs);
            }
        }
    }

    /**
     * Parses a (sub)expression of a rule's left-hand side and wires the matching states into
     * the non-deterministic FSM.
     *
     * <p>Tokens are consumed until one equal to {@code str} is read. Handled constructs are
     * parenthesised groups (parsed recursively), type names (bare or double-quoted),
     * alternation with {@code |}, and the {@code +} and {@code *} repetition suffixes.
     *
     * @param fSMState the state the expression starts from
     * @param stringTokenizer source of rule tokens
     * @param str the token that terminates this (sub)expression
     * @return the state reached after the expression; {@code fSMState} itself when no
     *         non-ignorable token is available
     * @throws TokeniserException if a type name is unknown or the rule ends prematurely
     */
    FSMState parseLHS(FSMState fSMState, StringTokenizer stringTokenizer, String str) throws TokeniserException {
        // fSMState2: state reached after the element that was just parsed.
        FSMState fSMState2;
        // fSMState3: state the next element will start from.
        FSMState fSMState3 = fSMState;
        // z: true while an alternation ("|") is still open.
        boolean z = false;
        // End states of the alternatives collected so far.
        LinkedList linkedList = new LinkedList();
        String skipIgnoreTokens = skipIgnoreTokens(stringTokenizer);
        if (null == skipIgnoreTokens) {
            return fSMState3;
        }
        while (!skipIgnoreTokens.equals(str)) {
            if (skipIgnoreTokens.equals("(")) {
                // Group: parse recursively up to the matching ")".
                fSMState2 = parseLHS(fSMState3, stringTokenizer, ")");
            } else if (skipIgnoreTokens.equals("\"")) {
                // Quoted type name: everything up to the closing quote is looked up as a type.
                String parseQuotedString = parseQuotedString(stringTokenizer, "\"");
                fSMState2 = new FSMState(this);
                Integer num = stringTypeIds.get(parseQuotedString);
                if (null == num) {
                    throw new InvalidRuleException("Invalid type: \"" + parseQuotedString + "\"");
                }
                fSMState3.put(new UnicodeType(num.intValue()), fSMState2);
            } else {
                // Bare type name.
                String str2 = skipIgnoreTokens;
                fSMState2 = new FSMState(this);
                Integer num2 = stringTypeIds.get(str2);
                if (null == num2) {
                    throw new InvalidRuleException("Invalid type: \"" + str2 + "\"");
                }
                fSMState3.put(new UnicodeType(num2.intValue()), fSMState2);
            }
            skipIgnoreTokens = skipIgnoreTokens(stringTokenizer);
            if (null == skipIgnoreTokens) {
                throw new InvalidRuleException("Tokeniser rule ended too soon!");
            }
            if (skipIgnoreTokens.equals("|")) {
                // Remember this alternative's end state; fSMState3 is not advanced, so the
                // next alternative starts from the same state.
                z = true;
                linkedList.add(fSMState2);
                skipIgnoreTokens = skipIgnoreTokens(stringTokenizer);
                if (null == skipIgnoreTokens) {
                    throw new InvalidRuleException("Tokeniser rule ended too soon!");
                }
            } else {
                if (z) {
                    // Last alternative: join all alternative end states into one new state
                    // via void (null-typed) transitions.
                    z = false;
                    linkedList.add(fSMState2);
                    fSMState2 = new FSMState(this);
                    Iterator it = linkedList.iterator();
                    while (it.hasNext()) {
                        ((FSMState) it.next()).put((UnicodeType) null, fSMState2);
                    }
                    linkedList.clear();
                }
                if (skipIgnoreTokens.equals("+")) {
                    // One or more: void transition back to the start, then a fresh exit state.
                    fSMState2.put((UnicodeType) null, fSMState3);
                    FSMState fSMState4 = fSMState2;
                    fSMState2 = new FSMState(this);
                    fSMState4.put((UnicodeType) null, fSMState2);
                    skipIgnoreTokens = skipIgnoreTokens(stringTokenizer);
                    if (null == skipIgnoreTokens) {
                        throw new InvalidRuleException("Tokeniser rule ended too soon!");
                    }
                } else if (skipIgnoreTokens.equals("*")) {
                    // Zero or more: as "+", plus a void transition that skips the element.
                    fSMState3.put((UnicodeType) null, fSMState2);
                    fSMState2.put((UnicodeType) null, fSMState3);
                    FSMState fSMState5 = fSMState2;
                    fSMState2 = new FSMState(this);
                    fSMState5.put((UnicodeType) null, fSMState2);
                    skipIgnoreTokens = skipIgnoreTokens(stringTokenizer);
                    if (null == skipIgnoreTokens) {
                        throw new InvalidRuleException("Tokeniser rule ended too soon!");
                    }
                }
                // Continue the sequence from the state just reached.
                fSMState3 = fSMState2;
            }
        }
        return fSMState3;
    }

    /**
     * Concatenates tokens until one equal to {@code str} is read; that terminator is consumed
     * but not included in the result.
     *
     * @param stringTokenizer source of tokens
     * @param str the terminating token
     * @return the concatenated text, or {@code null} when the tokenizer is already exhausted
     * @throws TokeniserException if the tokens run out before the terminator is found
     */
    String parseQuotedString(StringTokenizer stringTokenizer, String str) throws TokeniserException {
        if (!stringTokenizer.hasMoreElements()) {
            return null;
        }
        StringBuilder content = new StringBuilder(1024);
        for (String piece = stringTokenizer.nextToken(); !piece.equals(str); piece = stringTokenizer.nextToken()) {
            content.append(piece);
            if (!stringTokenizer.hasMoreElements()) {
                throw new InvalidRuleException("Tokeniser rule ended too soon!");
            }
        }
        return content.toString();
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Returns the next token that is not listed in {@code ignoreTokens}.
     *
     * @param stringTokenizer source of tokens; advanced past every skipped token and the one
     *        returned
     * @return the first non-ignorable token, or {@code null} if the tokenizer runs out first
     */
    public static String skipIgnoreTokens(StringTokenizer stringTokenizer) {
        while (stringTokenizer.hasMoreTokens()) {
            String candidate = stringTokenizer.nextToken();
            // Set membership replaces the former hand-written scan over the set's iterator.
            if (!ignoreTokens.contains(candidate)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Computes the set of states reachable from {@code set} through void (null-typed)
     * transitions only, including the members of {@code set} themselves.
     *
     * @param set the starting states; not modified
     * @return a new set holding the closure
     */
    private AbstractSet<FSMState> lambdaClosure(Set<FSMState> set) {
        HashSet<FSMState> closure = new HashSet<>(set);
        LinkedList<FSMState> worklist = new LinkedList<>(set);
        while (!worklist.isEmpty()) {
            Set<FSMState> reachable = worklist.removeFirst().nextSet((UnicodeType) null);
            if (reachable == null) {
                continue;
            }
            for (FSMState target : reachable) {
                // add() returns true only for states not seen before.
                if (closure.add(target)) {
                    worklist.addFirst(target);
                }
            }
        }
        return closure;
    }

    /**
     * Converts the non-deterministic FSM rooted at {@code initialState} into a deterministic
     * one rooted at {@code dInitialState}.
     *
     * <p>Each deterministic state stands for the void-transition closure of a set of
     * non-deterministic states; {@code newStates} records that mapping. Starting from the
     * closure of the initial state, every type id below {@code maxTypeId} is tried on every
     * pending closure, and previously unseen target closures are queued for processing.
     *
     * @throws TokeniserException declared because the DFSMState operations used here may
     *         throw it
     */
    void eliminateVoidTransitions() throws TokeniserException {
        this.newStates.clear();
        LinkedList<Set<FSMState>> pending = new LinkedList<>();

        DFSMState startState = new DFSMState(this);
        Set<FSMState> seed = new HashSet<>();
        seed.add(this.initialState);
        Set<FSMState> startClosure = lambdaClosure(seed);
        this.newStates.put(startClosure, startState);
        adoptFinalRhs(startState, startClosure);
        pending.addFirst(startClosure);
        this.dInitialState = startState;

        while (!pending.isEmpty()) {
            Set<FSMState> current = pending.removeFirst();
            for (int type = 0; type < maxTypeId; type++) {
                // Union of everything the members of the current closure reach on this type.
                Set<FSMState> targets = new HashSet<>();
                for (FSMState member : current) {
                    Set<FSMState> next = member.nextSet(type);
                    if (null != next) {
                        targets.addAll(next);
                    }
                }
                if (targets.isEmpty()) {
                    continue;
                }
                Set<FSMState> targetClosure = lambdaClosure(targets);
                DFSMState targetState = this.newStates.get(targetClosure);
                if (targetState == null) {
                    // First time this closure is seen: create its state and queue it.
                    targetState = new DFSMState(this);
                    pending.add(targetClosure);
                    adoptFinalRhs(targetState, targetClosure);
                    this.newStates.put(targetClosure, targetState);
                }
                this.newStates.get(current).put(type, targetState);
            }
        }
    }

    /**
     * Copies the right-hand side of the final states in {@code members} onto {@code target}.
     * When several distinct right-hand sides compete, the last one iterated wins and a warning
     * is printed. The token description is built only if at least one member is final.
     */
    private static void adoptFinalRhs(DFSMState target, Set<FSMState> members) throws TokeniserException {
        Set<String> competingRhs = new HashSet<>();
        boolean anyFinal = false;
        for (FSMState member : members) {
            if (member.isFinal()) {
                String rhs = member.getRhs();
                competingRhs.add(rhs);
                target.rhs = rhs;
                anyFinal = true;
            }
        }
        if (competingRhs.size() > 1) {
            Err.println("Warning, rule clash: " + competingRhs + "\nSelected last definition: " + target.rhs);
        }
        if (anyFinal) {
            target.buildTokenDesc();
        }
    }

    /**
     * Renders the states in {@code fsmStates} as a directed graph in GML text form: one
     * {@code node[...]} entry per state followed by the edge text each state supplies.
     *
     * @return the GML text
     */
    public String getFSMgml() {
        StringBuilder nodes = new StringBuilder(1024);
        StringBuilder edges = new StringBuilder(1024);
        for (FSMState state : this.fsmStates) {
            int id = state.getIndex();
            nodes.append("node[ id ").append(id).append(" label \"").append(id);
            if (state.isFinal()) {
                // Final states carry an ",F" marker and their right-hand side in the label.
                nodes.append(",F\\n" + state.getRhs());
            }
            nodes.append("\"  ]\n");
            edges.append(state.getEdgesGML());
        }
        return "graph[ \ndirected 1\n" + nodes + edges + "]\n";
    }

    /**
     * Renders the states in {@code dfsmStates} as a directed graph in GML text form: one
     * {@code node[...]} entry per state followed by the edge text each state supplies.
     *
     * @return the GML text
     */
    public String getDFSMgml() {
        StringBuilder nodes = new StringBuilder(1024);
        StringBuilder edges = new StringBuilder(1024);
        for (DFSMState state : this.dfsmStates) {
            int id = state.getIndex();
            nodes.append("node[ id ").append(id).append(" label \"").append(id);
            if (state.isFinal()) {
                // Final states carry an ",F" marker and their right-hand side in the label.
                nodes.append(",F\\n" + state.getRhs());
            }
            nodes.append("\"  ]\n");
            edges.append(state.getEdgesGML());
        }
        return "graph[ \ndirected 1\n" + nodes + edges + "]\n";
    }

    /**
     * Tokenises the current document by running its text through the deterministic FSM.
     *
     * <p>The text is read one code point at a time. While a transition exists the FSM
     * advances, remembering the most recent final state. When no transition exists, either a
     * token described by that final state is emitted, or - if no final state was reached - a
     * single-code-point {@code DEFAULT_TOKEN} of type {@code UNKNOWN} is emitted. Matching
     * then restarts from the initial state.
     *
     * <p>NOTE(review): a matched token spans everything consumed up to the failing position,
     * not just up to the position where the last final state was reached. Text consumed in
     * non-final states right before the end of the document produces no annotation. Both
     * behaviours are kept from the original code - confirm they are intended.
     *
     * @throws ExecutionException if no document is set
     * @throws ExecutionInterruptedException if an interrupt is detected during a progress check
     */
    public void execute() throws ExecutionException {
        this.interrupted = false;
        if (this.document == null) {
            throw new ExecutionException("No document to tokenise!");
        }
        AnnotationSet annotations =
                (this.annotationSetName == null || this.annotationSetName.equals(OrthoMatcherRule.description))
                        ? this.document.getAnnotations()
                        : this.document.getAnnotations(this.annotationSetName);
        fireStatusChanged("Tokenising " + this.document.getName() + "...");

        String content = this.document.getContent().toString();
        int length = content.length();
        DFSMState graphPosition = this.dInitialState;
        DFSMState lastMatchingState = null;
        int tokenStart = 0;
        int charIdx = 0;
        int lastReportedIdx = 0;

        while (charIdx < length) {
            int codePoint = content.codePointAt(charIdx);
            int charCount = Character.charCount(codePoint);
            DFSMState nextState =
                    graphPosition.next(typeIds.get(Integer.valueOf(Character.getType(codePoint))).intValue());
            if (null != nextState) {
                graphPosition = nextState;
                if (graphPosition.isFinal()) {
                    lastMatchingState = graphPosition;
                }
                charIdx += charCount;
            } else {
                if (null == lastMatchingState) {
                    // No rule matched: cover exactly the code point at tokenStart. Its width is
                    // taken from that code point itself, not from the one at the failing
                    // position, so a surrogate pair is never split.
                    charIdx = tokenStart + Character.charCount(content.codePointAt(tokenStart));
                    String tokenString = content.substring(tokenStart, charIdx);
                    FeatureMap features = Factory.newFeatureMap();
                    features.put("type", "UNKNOWN");
                    features.put("string", tokenString);
                    features.put("length", Integer.toString(tokenString.length()));
                    try {
                        annotations.add(Long.valueOf(tokenStart), Long.valueOf(charIdx), "DEFAULT_TOKEN", features);
                    } catch (InvalidOffsetException e) {
                        // Best effort, as in the original code: report and carry on.
                        e.printStackTrace(Err.getPrintWriter());
                    }
                } else {
                    addMatchedToken(annotations, content, tokenStart, charIdx, lastMatchingState);
                }
                lastMatchingState = null;
                graphPosition = this.dInitialState;
                tokenStart = charIdx;
            }
            if (charIdx - lastReportedIdx > 256) {
                // long arithmetic: 100 * charIdx overflows int on very large documents.
                fireProgressChanged((int) ((100L * charIdx) / length));
                lastReportedIdx = charIdx;
                if (isInterrupted()) {
                    throw new ExecutionInterruptedException();
                }
            }
        }
        if (null != lastMatchingState) {
            // The document ended while a match was pending: emit it.
            addMatchedToken(annotations, content, tokenStart, charIdx, lastMatchingState);
        }
        reset();
        fireProcessFinished();
        fireStatusChanged("Tokenisation complete!");
    }

    /**
     * Adds an annotation over {@code content[start, end)} using the token description of
     * {@code state}: row 0 supplies the annotation type, every later row a feature name/value
     * pair. The "string" and "length" features are always set.
     */
    private static void addMatchedToken(AnnotationSet annotations, String content, int start, int end,
            DFSMState state) {
        String tokenString = content.substring(start, end);
        FeatureMap features = Factory.newFeatureMap();
        features.put("string", tokenString);
        features.put("length", Integer.toString(tokenString.length()));
        String[][] tokenDesc = state.getTokenDesc();
        for (int row = 1; row < tokenDesc.length; row++) {
            features.put(tokenDesc[row][0], tokenDesc[row][1]);
        }
        try {
            annotations.add(Long.valueOf(start), Long.valueOf(end), tokenDesc[0][0], features);
        } catch (InvalidOffsetException e) {
            throw new GateRuntimeException(e.toString());
        }
    }

    /**
     * Sets the location of the rules file read by {@code init()}.
     *
     * @param resourceReference reference to the rules file
     */
    @CreoleParameter(defaultValue = "resources/tokeniser/DefaultTokeniser.rules", comment = "The URL to the rules file", suffixes = "rules")
    public void setRulesURL(ResourceReference resourceReference) {
        rulesURL = resourceReference;
    }

    /**
     * Sets the location of the rules file from a plain URL by wrapping it in a
     * {@code ResourceReference}.
     *
     * @param url URL of the rules file
     * @throws RuntimeException if the URL cannot be converted
     * @deprecated use the overload that takes a {@code ResourceReference}
     */
    @Deprecated
    public void setRulesURL(URL url) {
        ResourceReference reference;
        try {
            reference = new ResourceReference(url);
        } catch (URISyntaxException e) {
            throw new RuntimeException("Error converting URL to ResourceReference", e);
        }
        setRulesURL(reference);
    }

    /**
     * @return the reference to the rules file, or {@code null} if none has been set
     */
    public ResourceReference getRulesURL() {
        return rulesURL;
    }

    /**
     * Sets the name of the annotation set that receives the generated tokens.
     *
     * @param str the annotation set name; {@code null} or empty selects the default set
     */
    @CreoleParameter(comment = "The annotation set to be used for the generated annotations")
    @RunTime
    @Optional
    public void setAnnotationSetName(String str) {
        annotationSetName = str;
    }

    /**
     * @return the name of the annotation set that receives the generated tokens; may be
     *         {@code null}
     */
    public String getAnnotationSetName() {
        return annotationSetName;
    }

    /**
     * Stores the rules resource name. Within this class the value is only handed back by
     * {@code getRulesResourceName()}.
     *
     * @param str the rules resource name
     */
    public void setRulesResourceName(String str) {
        rulesResourceName = str;
    }

    /**
     * @return the rules resource name last passed to {@code setRulesResourceName}, or
     *         {@code null}
     */
    public String getRulesResourceName() {
        return rulesResourceName;
    }

    /**
     * Sets the character encoding used when the rules file is read.
     *
     * @param str the encoding name
     */
    @CreoleParameter(defaultValue = "UTF-8", comment = "The encoding used for reading the definitions")
    public void setEncoding(String str) {
        encoding = str;
    }

    /**
     * @return the character encoding used when the rules file is read
     */
    public String getEncoding() {
        return encoding;
    }

    static {
        try {
            Field[] fields = Class.forName("java.lang.Character").getFields();
            LinkedList<Field> linkedList = new LinkedList();
            for (int i = 0; i < fields.length; i++) {
                if (Modifier.isStatic(fields[i].getModifiers()) && fields[i].getName().indexOf("DIRECTIONALITY") == -1) {
                    linkedList.add(fields[i]);
                }
            }
            HashMap hashMap = new HashMap();
            maxTypeId = linkedList.size() - 1;
            String[] strArr = new String[maxTypeId + 1];
            HashMap hashMap2 = new HashMap();
            int i2 = 0;
            for (Field field : linkedList) {
                try {
                    if (field.getType().toString().equals("byte")) {
                        String name = field.getName();
                        hashMap.put(Integer.valueOf(field.getInt(null)), Integer.valueOf(i2));
                        strArr[i2] = name;
                        hashMap2.put(name, Integer.valueOf(i2));
                        i2++;
                    }
                } catch (Exception e) {
                    throw new GateRuntimeException(e.toString());
                }
            }
            typeIds = Collections.unmodifiableMap(hashMap);
            stringTypeIds = Collections.unmodifiableMap(hashMap2);
            HashSet hashSet = new HashSet();
            hashSet.add(" ");
            hashSet.add("\t");
            hashSet.add("\f");
            ignoreTokens = Collections.unmodifiableSet(hashSet);
            typeMnemonics = Collections.unmodifiableList(Arrays.asList(strArr));
        } catch (ClassNotFoundException e2) {
            throw new GateRuntimeException("Could not find the java.lang.Character class!");
        }
    }
}
