package ws.palladian.extraction.sentence;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import ws.palladian.core.Annotation;
import ws.palladian.core.ImmutableToken;
import ws.palladian.core.Tagger;
import ws.palladian.core.Token;
import ws.palladian.extraction.entity.BracketTagger;
import ws.palladian.extraction.entity.DateAndTimeTagger;
import ws.palladian.extraction.entity.SmileyTagger;
import ws.palladian.extraction.entity.UrlTagger;
import ws.palladian.extraction.token.Tokenizer;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.constants.DateFormat;
import ws.palladian.helper.constants.Language;
import ws.palladian.helper.constants.RegExp;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:ws/palladian/extraction/sentence/PalladianSentenceDetector.class */
/**
 * {@link SentenceDetector} that splits text into sentences with a language-specific regular
 * expression ({@link Tokenizer#SENTENCE_SPLIT_REGEX_DE} for {@link Language#GERMAN},
 * {@link Tokenizer#SENTENCE_SPLIT_REGEX_EN} for every other language).
 *
 * <p>Before the expression is applied, every span reported by the configured masking
 * {@link Tagger}s is overwritten with a run of {@link #MASK_CHARACTER} of the same length, so the
 * split pattern is evaluated against the masked text instead of the original characters of those
 * spans. The returned sentences are always cut from the original, unmasked text.
 */
public final class PalladianSentenceDetector implements SentenceDetector {
    private static final Pattern PATTERN_EN = Pattern.compile(Tokenizer.SENTENCE_SPLIT_REGEX_EN);
    private static final Pattern PATTERN_DE = Pattern.compile(Tokenizer.SENTENCE_SPLIT_REGEX_DE);

    /** Date formats handed to the default {@link #DATE_TAGGER}. */
    private static final DateFormat[] DATES_WITH_DOTS = {RegExp.DATE_EU_D_MM, RegExp.DATE_EU_D_MM_Y, RegExp.DATE_EU_D_MM_Y_T, RegExp.DATE_EU_D_MMMM, RegExp.DATE_EU_D_MMMM_Y, RegExp.DATE_EU_D_MMMM_Y_T, RegExp.DATE_EU_MM_Y, RegExp.DATE_USA_MMMM_D_Y, RegExp.DATE_USA_MMMM_D_Y_SEP, RegExp.DATE_USA_MMMM_D_Y_T, RegExp.DATE_USA_MMMM_D, RegExp.DATE_EUSA_MMMM_Y, RegExp.DATE_EUSA_YYYY_MMM_D};

    /** Date tagger used by the default masking configuration. */
    private static final DateAndTimeTagger DATE_TAGGER = new DateAndTimeTagger(DATES_WITH_DOTS);

    /** Character written over every masked span. */
    private static final char MASK_CHARACTER = 'M';

    private final Language language;
    private final List<Tagger> maskingTaggers;

    /**
     * Creates a detector that masks the spans found by the URL, date, smiley and bracket taggers.
     *
     * @param language the language selecting the split pattern; must not be {@code null}
     * @throws NullPointerException if {@code language} is {@code null}
     */
    public PalladianSentenceDetector(Language language) {
        this(language, Arrays.asList(UrlTagger.INSTANCE, DATE_TAGGER, SmileyTagger.INSTANCE, BracketTagger.INSTANCE));
    }

    /**
     * Creates a detector with a custom set of masking taggers.
     *
     * @param language the language selecting the split pattern; must not be {@code null}
     * @param list the taggers whose annotations are masked before splitting; must not be
     *        {@code null} (an empty list disables masking)
     * @throws NullPointerException if {@code language} or {@code list} is {@code null}
     */
    public PalladianSentenceDetector(Language language, List<Tagger> list) {
        Validate.notNull(language, "language must not be null");
        // Fail at construction time instead of with a bare NPE on the first iterateTokens call.
        Validate.notNull(list, "masking taggers must not be null");
        this.language = language;
        this.maskingTaggers = list;
    }

    /**
     * Splits the given text into sentences.
     *
     * @param str the text to split; must not be {@code null}
     * @return an unmodifiable iterator over the sentences; each token carries its start offset in
     *         {@code str} and the corresponding substring of {@code str} with surrounding
     *         whitespace removed (whitespace-only segments are skipped)
     * @throws NullPointerException if {@code str} is {@code null}
     */
    @Override // ws.palladian.core.TextTokenizer
    public Iterator<Token> iterateTokens(String str) {
        Validate.notNull(str, "text must not be null");
        String masked = maskAnnotations(str);
        Pattern splitPattern = this.language == Language.GERMAN ? PATTERN_DE : PATTERN_EN;
        Matcher matcher = splitPattern.matcher(masked);

        List<Token> sentences = new ArrayList<>();
        int sentenceStart = 0;
        // Each match end closes the sentence that began at the previous match end.
        while (matcher.find()) {
            int sentenceEnd = matcher.end();
            addSentence(sentences, str, masked, sentenceStart, sentenceEnd);
            sentenceStart = sentenceEnd;
        }
        // Whatever follows the last match is the final sentence.
        if (sentenceStart < masked.length()) {
            addSentence(sentences, str, masked, sentenceStart, masked.length());
        }
        return CollectionHelper.unmodifiableIterator(sentences.iterator());
    }

    /**
     * Returns a copy of {@code text} in which every span annotated by one of the masking taggers
     * is overwritten with {@link #MASK_CHARACTER}. The result always has the same length as
     * {@code text}, so offsets computed on it are valid for {@code text} as well.
     */
    private String maskAnnotations(String text) {
        List<Annotation> annotations = new ArrayList<>();
        for (Tagger tagger : this.maskingTaggers) {
            annotations.addAll(tagger.getAnnotations(text));
        }
        StringBuilder masked = new StringBuilder(text);
        for (Annotation annotation : annotations) {
            int start = annotation.getStartPosition();
            // StringBuilder.replace clamps the end to the buffer length; clamp here as well so
            // the mask is exactly as long as the span that is actually replaced.
            int end = Math.min(annotation.getEndPosition(), masked.length());
            // The mask length is derived from the span, not from the annotation value, so the
            // buffer can never grow or shrink and shift later offsets.
            masked.replace(start, end, StringUtils.repeat(MASK_CHARACTER, end - start));
        }
        return masked.toString();
    }

    /**
     * Trims the segment {@code [start, end)} of the masked text and, if anything is left, appends
     * the matching region of the original text to {@code sentences}.
     */
    private static void addSentence(List<Token> sentences, String original, String masked, int start, int end) {
        Token maskedToken = createToken(masked, start, end);
        if (maskedToken == null) {
            return;
        }
        // Offsets were computed on the masked text; the value is taken from the original text.
        int tokenStart = maskedToken.getStartPosition();
        int tokenEnd = tokenStart + maskedToken.getValue().length();
        sentences.add(new ImmutableToken(tokenStart, original.substring(tokenStart, tokenEnd)));
    }

    /**
     * Creates a token for the segment {@code [i, i2)} of {@code str} with leading and trailing
     * whitespace removed (as defined by {@link StringHelper#ltrim} and {@link StringHelper#rtrim}).
     *
     * @param str the text containing the segment
     * @param i the inclusive start offset of the segment
     * @param i2 the exclusive end offset of the segment
     * @return the trimmed token, positioned at the first character kept by the left trim, or
     *         {@code null} if nothing remains after trimming
     */
    private static Token createToken(String str, int i, int i2) {
        String segment = str.substring(i, i2);
        String leftTrimmed = StringHelper.ltrim(segment);
        int skippedLeading = segment.length() - leftTrimmed.length();
        String trimmed = StringHelper.rtrim(leftTrimmed);
        if (trimmed.isEmpty()) {
            return null;
        }
        return new ImmutableToken(i + skippedLeading, trimmed);
    }
}
