package pl.edu.icm.cermine.metadata.affiliation.tools;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import pl.edu.icm.cermine.metadata.model.AffiliationLabel;
import pl.edu.icm.cermine.parsing.model.Token;
import pl.edu.icm.cermine.parsing.tools.TextTokenizer;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.9.jar:pl/edu/icm/cermine/metadata/affiliation/tools/AffiliationTokenizer.class */
/**
 * Splits a string into tokens after dropping every non-ASCII character.
 *
 * <p>Tokenization runs on the ASCII-only projection of the input, but the two
 * integer arguments given to each {@link Token} are indices into the ORIGINAL
 * string (presumably the token's start and end offsets; confirm against
 * {@code Token}). Because non-ASCII characters are skipped rather than treated
 * as delimiters, a single token may span positions of the original string that
 * held non-ASCII characters.
 */
public class AffiliationTokenizer implements TextTokenizer<Token<AffiliationLabel>> {
    /**
     * Delimiters: a run of digits, a single non-word character, or an underscore.
     * Compiled once instead of on every call.
     */
    private static final Pattern DELIMITER = Pattern.compile("\\d+|\\W|_");

    /** Exactly one whitespace character; such delimiters are not emitted as tokens. */
    private static final Pattern SINGLE_WHITESPACE = Pattern.compile("\\s");

    /** Highest code unit matched by the regex class {@code \p{ASCII}}. */
    private static final char MAX_ASCII = 0x7F;

    /**
     * Tokenizes an ASCII-only string.
     *
     * <p>Text between delimiters becomes a token; each delimiter becomes a token
     * of its own unless it is a single whitespace character.
     *
     * @param str  the ASCII-only text to split
     * @param list for each index of {@code str}, the index of that character in
     *             the original string, followed by one extra trailing entry used
     *             for a token that ends at the end of {@code str}
     * @return the tokens in order of appearance
     */
    private static List<Token<AffiliationLabel>> asciiTextToTokens(String str, List<Integer> list) {
        List<Token<AffiliationLabel>> tokens = new ArrayList<Token<AffiliationLabel>>();
        Matcher matcher = DELIMITER.matcher(str);
        int wordStart = 0;
        while (matcher.find()) {
            int delimiterStart = matcher.start();
            int delimiterEnd = matcher.end();
            // Non-empty text between the previous delimiter and this one.
            if (delimiterStart > wordStart) {
                tokens.add(newToken(str, list, wordStart, delimiterStart));
            }
            // The delimiter itself is kept unless it is a whitespace character.
            String delimiter = str.substring(delimiterStart, delimiterEnd);
            if (!SINGLE_WHITESPACE.matcher(delimiter).matches()) {
                tokens.add(newToken(str, list, delimiterStart, delimiterEnd));
            }
            wordStart = delimiterEnd;
        }
        // Trailing text after the last delimiter; its end maps to the extra trailing entry.
        if (wordStart < str.length()) {
            tokens.add(newToken(str, list, wordStart, str.length()));
        }
        return tokens;
    }

    /**
     * Builds the token for {@code str.substring(from, to)}, translating both
     * bounds through {@code list} before handing them to the constructor.
     */
    private static Token<AffiliationLabel> newToken(String str, List<Integer> list, int from, int to) {
        return new Token<AffiliationLabel>(
                str.substring(from, to), list.get(from).intValue(), list.get(to).intValue());
    }

    /**
     * Collects the indices of all ASCII characters of {@code str}, in ascending order.
     *
     * @param str the text to scan
     * @return a new mutable list of indices of characters in the range 0x00-0x7F
     */
    private static List<Integer> getAsciiSubstringIndices(String str) {
        List<Integer> indices = new ArrayList<Integer>();
        for (int i = 0; i < str.length(); i++) {
            // Surrogate code units are all above 0x7F, so a per-char test is
            // equivalent to matching \p{ASCII} code point by code point.
            if (str.charAt(i) <= MAX_ASCII) {
                indices.add(Integer.valueOf(i));
            }
        }
        return indices;
    }

    /**
     * Concatenates the characters of {@code str} found at the given indices.
     *
     * @param str  the source text
     * @param list indices into {@code str}, in the order the characters should appear
     * @return the selected characters as a string
     */
    private static String getSubstring(String str, List<Integer> list) {
        StringBuilder sb = new StringBuilder(list.size());
        for (Integer index : list) {
            sb.append(str.charAt(index.intValue()));
        }
        return sb.toString();
    }

    /**
     * Tokenizes {@code str}, ignoring its non-ASCII characters.
     *
     * @param str the text to tokenize
     * @return the tokens of the ASCII projection of {@code str}, carrying indices
     *         into {@code str} itself
     */
    @Override // pl.edu.icm.cermine.parsing.tools.TextTokenizer
    public List<Token<AffiliationLabel>> tokenize(String str) {
        List<Integer> asciiIndices = getAsciiSubstringIndices(str);
        String asciiText = getSubstring(str, asciiIndices);
        // Sentinel entry: lets a token that ends at the end of the ASCII text
        // look up an end index, which is the length of the original string.
        asciiIndices.add(Integer.valueOf(str.length()));
        return asciiTextToTokens(asciiText, asciiIndices);
    }
}
