package eu.interedition.text.token;

import com.google.common.collect.Lists;
import eu.interedition.text.Annotation;
import eu.interedition.text.AnnotationRepository;
import eu.interedition.text.Name;
import eu.interedition.text.Range;
import eu.interedition.text.Text;
import eu.interedition.text.TextConstants;
import eu.interedition.text.event.AnnotationEventListener;
import eu.interedition.text.event.AnnotationEventSource;
import eu.interedition.text.mem.SimpleAnnotation;
import eu.interedition.text.mem.SimpleName;
import eu.interedition.text.query.Criteria;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:eu/interedition/text/token/Tokenizer.class */
/**
 * Creates token annotations for a {@link Text}.
 *
 * <p>A call to {@link #tokenize(Text, TokenizerSettings)} first deletes the annotations of the
 * text that carry the configured token name, then streams the text through the configured
 * {@link AnnotationEventSource} and stores one annotation per detected token in the configured
 * {@link AnnotationRepository}. What counts as a token boundary is decided entirely by the
 * {@link TokenizerSettings} passed to {@code tokenize}.
 *
 * <p>The repository and the event source have to be set before {@code tokenize} is called.
 */
public class Tokenizer {
    /** Name given to generated token annotations unless {@link #setTokenName(Name)} overrides it. */
    public static final Name DEFAULT_TOKEN_NAME = new SimpleName(TextConstants.INTEREDITION_NS_URI, "token");
    private static final Logger LOG = LoggerFactory.getLogger(Tokenizer.class);
    private AnnotationRepository annotationRepository;
    private AnnotationEventSource eventSource;
    private Name tokenName = DEFAULT_TOKEN_NAME;
    private int pageSize = 102400;
    private int batchSize = 1024;

    /**
     * Event listener that turns the stream of text and annotation events into token annotations.
     *
     * <p>The listener keeps its own running character offset (the number of characters received
     * through {@link #text(Range, String)} so far); the {@code Range} argument of that callback is
     * not consulted. A token is opened at the first non-boundary character that follows a boundary
     * and is closed lazily, either when the next token opens or when the stream ends. The closing
     * offset handed to {@code Range} is therefore the offset of the next token's first character
     * (or the total character count for the last token), which means boundary characters that
     * trail a token lie between the token's start and end offsets.
     */
    private class TokenGeneratingListener implements AnnotationEventListener {
        private final TokenizerSettings settings;
        private final Text text;
        /** Flush threshold, fixed for the lifetime of this listener and never smaller than 1. */
        private final int batchLimit;
        private final List<Annotation> batch;
        private boolean lastIsTokenBoundary;
        private int offset;
        private int tokenStart;
        private int tokenCount;

        private TokenGeneratingListener(Text text, TokenizerSettings tokenizerSettings) {
            // Snapshot and sanitize the batch size once: a zero or negative value would otherwise
            // break the list pre-sizing and the flush check, and a change made while this listener
            // is running must not shift the flush threshold under it.
            this.batchLimit = Math.max(1, Tokenizer.this.batchSize);
            this.batch = Lists.newArrayListWithExpectedSize(this.batchLimit);
            // The start of the text counts as a boundary so that the first character can open a token.
            this.lastIsTokenBoundary = true;
            this.offset = 0;
            // Integer.MAX_VALUE marks "no token currently open".
            this.tokenStart = Integer.MAX_VALUE;
            this.tokenCount = 0;
            this.settings = tokenizerSettings;
            this.text = text;
        }

        /** Logs the beginning of the tokenization run. */
        @Override
        public void start() {
            Tokenizer.LOG.debug("Tokenizing {}", this.text);
        }

        /** Marks a boundary if the settings treat the annotations starting at {@code j} as one. */
        @Override
        public void start(long j, Iterable<Annotation> iterable) {
            if (this.settings.startingAnnotationsAreBoundary(this.text, j, iterable)) {
                this.lastIsTokenBoundary = true;
            }
        }

        /** Marks a boundary if the settings treat the empty annotations at {@code j} as one. */
        @Override
        public void empty(long j, Iterable<Annotation> iterable) {
            if (this.settings.emptyAnnotationsAreBoundary(this.text, j, iterable)) {
                this.lastIsTokenBoundary = true;
            }
        }

        /** Marks a boundary if the settings treat the annotations ending at {@code j} as one. */
        @Override
        public void end(long j, Iterable<Annotation> iterable) {
            if (this.settings.endingAnnotationsAreBoundary(this.text, j, iterable)) {
                this.lastIsTokenBoundary = true;
            }
        }

        /**
         * Scans a chunk of text character by character, opening and closing tokens as boundaries
         * are crossed.
         *
         * @param range range reported by the event source for this chunk; not used here
         * @param str the characters of the chunk
         */
        @Override
        public void text(Range range, String str) {
            final int length = str.length();
            for (int i = 0; i < length; i++) {
                final char c = str.charAt(i);
                if (this.settings.isBoundary(this.text, this.offset, c)) {
                    this.lastIsTokenBoundary = true;
                } else {
                    if (this.lastIsTokenBoundary) {
                        // First content character after a boundary: close the pending token, if any.
                        token();
                    }
                    if (this.tokenStart > this.offset) {
                        // No token open (tokenStart is the MAX_VALUE sentinel): open one here.
                        this.tokenStart = this.offset;
                    }
                    this.lastIsTokenBoundary = false;
                }
                this.offset++;
            }
        }

        /** Closes the last open token, stores whatever is still batched and logs the token count. */
        @Override
        public void end() {
            token();
            emit();
            Tokenizer.LOG.debug("{} has {} token(s)", this.text, this.tokenCount);
        }

        /**
         * Closes the currently open token, if there is one, by adding an annotation spanning
         * {@code tokenStart} to the current offset to the batch. The batch is stored as soon as it
         * reaches the flush threshold.
         */
        private void token() {
            if (this.tokenStart >= this.offset) {
                // Nothing open: tokenStart still holds the sentinel.
                return;
            }
            this.batch.add(new SimpleAnnotation(this.text, Tokenizer.this.tokenName, new Range(this.tokenStart, this.offset), null));
            this.tokenCount++;
            this.tokenStart = Integer.MAX_VALUE;
            if (this.batch.size() >= this.batchLimit) {
                emit();
            }
        }

        /** Hands the batched annotations to the repository and empties the batch; no-op when empty. */
        private void emit() {
            if (this.batch.isEmpty()) {
                return;
            }
            Tokenizer.this.annotationRepository.create(this.batch);
            this.batch.clear();
        }
    }

    /** Sets the repository from which old tokens are deleted and into which new tokens are stored. */
    public void setAnnotationRepository(AnnotationRepository annotationRepository) {
        this.annotationRepository = annotationRepository;
    }

    /** Sets the event source that streams the text to be tokenized. */
    public void setEventSource(AnnotationEventSource annotationEventSource) {
        this.eventSource = annotationEventSource;
    }

    /** Sets the name carried by generated token annotations; defaults to {@link #DEFAULT_TOKEN_NAME}. */
    public void setTokenName(Name name) {
        this.tokenName = name;
    }

    /** Sets the page size handed to the event source when listening; defaults to 102400. */
    public void setPageSize(int i) {
        this.pageSize = i;
    }

    /**
     * Sets the number of token annotations collected before they are stored in one repository
     * call; defaults to 1024. Values below 1 are treated as 1 by subsequent tokenization runs.
     */
    public void setBatchSize(int i) {
        this.batchSize = i;
    }

    /**
     * Replaces the token annotations of a text.
     *
     * <p>Deletes the annotations of {@code text} that carry the configured token name, then
     * listens to the event source for {@code text} and stores the newly generated tokens.
     *
     * @param text the text to tokenize
     * @param tokenizerSettings decides which characters and annotation events are token boundaries
     * @throws IOException declared for the event source's {@code listen} call
     * @throws IllegalStateException if the annotation repository or the event source has not been set
     */
    public void tokenize(Text text, TokenizerSettings tokenizerSettings) throws IOException {
        // Check both collaborators up front so that a half-configured tokenizer cannot delete the
        // existing tokens and then fail before producing replacements.
        if (this.annotationRepository == null) {
            throw new IllegalStateException("No annotation repository set");
        }
        if (this.eventSource == null) {
            throw new IllegalStateException("No event source set");
        }
        this.annotationRepository.delete(Criteria.and(Criteria.text(text), Criteria.annotationName(this.tokenName)));
        this.eventSource.listen(new TokenGeneratingListener(text, tokenizerSettings), this.pageSize, text, Criteria.none(), Collections.emptySet());
    }
}
