package org.apache.lucene.analysis.jate;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.util.Span;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.BytesRef;

/* loaded from: input_file:org/apache/lucene/analysis/jate/OpenNLPTokenizer.class */
/**
 * Lucene {@link Tokenizer} that reads the whole input, splits it into sentences and
 * words with OpenNLP, and emits one token per word.
 *
 * <p>Each token carries a serialized {@link MWEMetadata} payload holding the index of the
 * token within its sentence and the index of the sentence within the document. When a
 * {@link ParagraphChunker} is supplied, paragraph-level metadata is added for every sentence
 * that could be assigned to a paragraph.
 *
 * <p>The complete input is buffered in memory on the first call to
 * {@link #incrementToken()} after construction or {@link #reset()}.
 */
public final class OpenNLPTokenizer extends Tokenizer implements SentenceContextAware {
    private static final int DEFAULT_BUFFER_SIZE = 256;
    private static final Logger LOG = Logger.getLogger(OpenNLPTokenizer.class.getName());

    /** Corrected end offset of the most recently emitted token; reported by {@link #end()}. */
    private int finalOffset;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PayloadAttribute tokenMetadataAtt = addAttribute(PayloadAttribute.class);

    /** Sentence start offset -> paragraph that contains the sentence. */
    private final Map<Integer, Paragraph> sentsInParagraph = new HashMap<>();
    /** Paragraph -> number of sentences assigned to it. */
    private final Map<Paragraph, Integer> paragraphHasSents = new HashMap<>();
    /** Sentence start offset -> zero-based index of the sentence inside its paragraph. */
    private final Map<Integer, Integer> sentIdsInParagraph = new HashMap<>();

    /** Sentence spans, as offsets into {@link #fullText}. */
    private Span[] sentences;
    /** Word spans per sentence, as offsets relative to the start of the sentence. */
    private Span[][] words;
    /** Word spans of the sentence currently being emitted. */
    private Span[] wordSet;
    /** True when the input still has to be loaded before the next token can be produced. */
    boolean first = true;
    int indexSentence;
    int indexWord;
    private char[] fullText;
    private final ParagraphChunker paragraphOp;
    private final SentenceDetector sentenceOp;
    private final opennlp.tools.tokenize.Tokenizer tokenizerOp;

    /**
     * Creates a tokenizer without paragraph detection.
     *
     * @param attributeFactory factory used to create the token attributes
     * @param sentenceDetector OpenNLP sentence detector, may be {@code null} if {@code tokenizer} is not
     * @param tokenizer OpenNLP tokenizer, may be {@code null} if {@code sentenceDetector} is not
     * @throws IllegalArgumentException if both {@code sentenceDetector} and {@code tokenizer} are {@code null}
     */
    public OpenNLPTokenizer(AttributeFactory attributeFactory, SentenceDetector sentenceDetector, opennlp.tools.tokenize.Tokenizer tokenizer) {
        this(attributeFactory, sentenceDetector, tokenizer, null);
    }

    /**
     * Creates a tokenizer, optionally with paragraph detection.
     *
     * @param attributeFactory factory used to create the token attributes
     * @param sentenceDetector OpenNLP sentence detector, may be {@code null} if {@code tokenizer} is not
     * @param tokenizer OpenNLP tokenizer, may be {@code null} if {@code sentenceDetector} is not
     * @param paragraphChunker paragraph chunker, or {@code null} to skip paragraph metadata
     * @throws IllegalArgumentException if both {@code sentenceDetector} and {@code tokenizer} are {@code null}
     */
    public OpenNLPTokenizer(AttributeFactory attributeFactory, SentenceDetector sentenceDetector, opennlp.tools.tokenize.Tokenizer tokenizer, ParagraphChunker paragraphChunker) {
        super(attributeFactory);
        this.termAtt.resizeBuffer(DEFAULT_BUFFER_SIZE);
        if (sentenceDetector == null && tokenizer == null) {
            throw new IllegalArgumentException("OpenNLPTokenizer: need one or both of Sentence Detector and Tokenizer");
        }
        this.sentenceOp = sentenceDetector;
        this.tokenizerOp = tokenizer;
        this.paragraphOp = paragraphChunker;
    }

    /**
     * Advances to the next word of the input.
     *
     * @return {@code true} if a token was produced, {@code false} once all sentences are exhausted
     * @throws IOException if reading the input fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (this.first) {
            loadAll();
            restartAtBeginning();
            this.first = false;
        }
        // Bounds are checked before any sentence is dereferenced.
        if (this.indexSentence >= this.sentences.length) {
            this.first = true;
            return false;
        }
        clearAttributes();
        if (this.wordSet == null) {
            this.wordSet = this.words[this.indexSentence];
        }
        // Skip over sentences whose words have all been emitted (or that have none).
        while (this.indexWord == this.wordSet.length) {
            this.indexSentence++;
            if (this.indexSentence >= this.sentences.length) {
                this.first = true;
                return false;
            }
            this.wordSet = this.words[this.indexSentence];
            this.indexWord = 0;
        }

        Span sentence = this.sentences[this.indexSentence];
        Span word = this.wordSet[this.indexWord];
        int sentenceStart = sentence.getStart();
        int termLength = word.getEnd() - word.getStart();

        // Word spans are relative to the sentence, so shift them into document coordinates.
        this.termAtt.copyBuffer(this.fullText, sentenceStart + word.getStart(), termLength);
        this.finalOffset = correctOffset(sentenceStart + word.getEnd());
        int startOffset = correctOffset(sentenceStart + word.getStart());

        if (this.finalOffset - startOffset > termLength) {
            this.offsetAtt.setOffset(startOffset, startOffset + termLength);
            LOG.warn("Invalid token start and end offsets diff greater than term length. End offset is reset to be start+tokenlength. start=" + startOffset + ", invalid end=" + this.finalOffset + ", termlength=" + termLength + ". See Issue 26 on JATE webpage");
        } else {
            this.offsetAtt.setOffset(startOffset, this.finalOffset);
        }

        MWEMetadata metadata = addSentenceContext(new MWEMetadata(), this.indexWord, this.indexWord, null, this.indexSentence);
        if (this.paragraphOp != null) {
            addParagraphContext(metadata, sentence);
        }
        addPayloadAttribute(this.tokenMetadataAtt, metadata);
        this.indexWord++;
        return true;
    }

    /**
     * Adds paragraph-level metadata for {@code sentence}. Does nothing when the sentence
     * was not assigned to any paragraph by {@link #detectParagraphs(String)}.
     */
    private void addParagraphContext(MWEMetadata metadata, Span sentence) {
        Integer sentenceStart = Integer.valueOf(sentence.getStart());
        Paragraph paragraph = this.sentsInParagraph.get(sentenceStart);
        if (paragraph == null) {
            return;
        }
        Integer sentencesInParagraph = this.paragraphHasSents.get(paragraph);
        Integer sentenceIdInParagraph = this.sentIdsInParagraph.get(sentenceStart);
        if (sentencesInParagraph == null || sentenceIdInParagraph == null) {
            return;
        }
        addOtherMetadata(metadata, paragraph.indexInDoc, sentencesInParagraph.intValue(), this.paragraphHasSents.size(), sentenceIdInParagraph.intValue(), this.sentences.length);
    }

    /** Rewinds the sentence/word cursors to the first word of the first sentence. */
    void restartAtBeginning() throws IOException {
        this.indexSentence = 0;
        this.indexWord = 0;
        this.finalOffset = 0;
        this.wordSet = null;
    }

    /**
     * Reads the whole input and pre-computes sentence spans, paragraph assignments
     * (when a paragraph chunker is configured) and word spans.
     */
    void loadAll() throws IOException {
        fillBuffer();
        String text = new String(this.fullText);
        detectSentences(text);
        if (this.paragraphOp != null) {
            detectParagraphs(text);
        }
        this.words = new Span[this.sentences.length][];
        for (int i = 0; i < this.sentences.length; i++) {
            splitWords(i);
        }
    }

    /**
     * Computes the word spans of sentence {@code i}. Without a tokenizer the whole
     * sentence is treated as a single word.
     */
    void splitWords(int i) {
        Span sentence = this.sentences[i];
        int sentenceLength = sentence.getEnd() - sentence.getStart();
        if (this.tokenizerOp == null) {
            this.words[i] = new Span[] {new Span(0, sentenceLength)};
            return;
        }
        this.words[i] = this.tokenizerOp.tokenizePos(String.copyValueOf(this.fullText, sentence.getStart(), sentenceLength));
    }

    /**
     * Computes the sentence spans of {@code str}. Without a sentence detector the whole
     * text is treated as one sentence (or as no sentence when the text is empty).
     */
    void detectSentences(String str) throws IOException {
        if (this.sentenceOp == null) {
            this.sentences = str.isEmpty() ? new Span[0] : new Span[] {new Span(0, str.length())};
            return;
        }
        this.sentences = this.sentenceOp.sentPosDetect(str);
    }

    /**
     * Assigns each detected sentence to the paragraph whose offset range contains the
     * sentence start, and records per-paragraph sentence counts and sentence indices.
     * Sentences that fall in no paragraph are left unassigned.
     */
    void detectParagraphs(String str) {
        this.sentsInParagraph.clear();
        this.paragraphHasSents.clear();
        this.sentIdsInParagraph.clear();
        List<Paragraph> paragraphs = this.paragraphOp.chunk(str);
        if (paragraphs == null || paragraphs.isEmpty()) {
            return;
        }
        int paragraphIndex = 0;
        Paragraph current = paragraphs.get(0);
        int sentenceIdInParagraph = 0;
        for (Span sentence : this.sentences) {
            int sentenceStart = sentence.getStart();
            if (containsOffset(current, sentenceStart)) {
                recordSentence(sentenceStart, current, sentenceIdInParagraph);
                sentenceIdInParagraph++;
                continue;
            }
            // Search forward from the last matched paragraph; stop at the first hit or at the end of the list.
            for (int candidate = paragraphIndex + 1; candidate < paragraphs.size(); candidate++) {
                current = paragraphs.get(candidate);
                sentenceIdInParagraph = 0;
                if (containsOffset(current, sentenceStart)) {
                    recordSentence(sentenceStart, current, 0);
                    paragraphIndex = candidate;
                    sentenceIdInParagraph = 1;
                    break;
                }
            }
        }
    }

    /** Returns whether {@code offset} lies within the inclusive offset range of {@code paragraph}. */
    private static boolean containsOffset(Paragraph paragraph, int offset) {
        return offset >= paragraph.startOffset && offset <= paragraph.endOffset;
    }

    /** Registers the sentence starting at {@code sentenceStart} as sentence {@code idInParagraph} of {@code paragraph}. */
    private void recordSentence(int sentenceStart, Paragraph paragraph, int idInParagraph) {
        Integer key = Integer.valueOf(sentenceStart);
        this.sentsInParagraph.put(key, paragraph);
        Integer count = this.paragraphHasSents.get(paragraph);
        this.paragraphHasSents.put(paragraph, Integer.valueOf(count == null ? 1 : count.intValue() + 1));
        this.sentIdsInParagraph.put(key, Integer.valueOf(idInParagraph));
    }

    /** Reads the entire input into {@link #fullText}. */
    void fillBuffer() throws IOException {
        this.fullText = IOUtils.toCharArray(this.input);
    }

    /** Finishes the stream and sets the final offset to the end of the last emitted token. */
    @Override
    public final void end() {
        try {
            // The TokenStream contract requires overriding implementations to call super.end().
            super.end();
        } catch (IOException e) {
            throw new IllegalStateException("OpenNLPTokenizer: failed to end token stream", e);
        }
        this.offsetAtt.setOffset(this.finalOffset, this.finalOffset);
    }

    /**
     * Resets the tokenizer so the next {@link #incrementToken()} call loads the current input
     * from scratch, even if the previous stream was not consumed to the end.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        clearAttributes();
        restartAtBeginning();
        this.first = true;
    }

    /**
     * Stores sentence-level context in {@code mWEMetadata}.
     *
     * @param mWEMetadata metadata object to fill
     * @param i index of the first composing token within its sentence
     * @param i2 index of the last composing token within its sentence
     * @param str part-of-speech value, may be {@code null}
     * @param i3 index of the source sentence within the document
     * @return the same {@code mWEMetadata} instance
     */
    @Override // org.apache.lucene.analysis.jate.SentenceContextAware
    public MWEMetadata addSentenceContext(MWEMetadata mWEMetadata, int i, int i2, String str, int i3) {
        mWEMetadata.addMetaData(MWEMetadataType.FIRST_COMPOSING_TOKEN_ID_IN_SENT, String.valueOf(i));
        mWEMetadata.addMetaData(MWEMetadataType.LAST_COMPOSING_TOKEN_ID_IN_SENT, String.valueOf(i2));
        mWEMetadata.addMetaData(MWEMetadataType.POS, str);
        mWEMetadata.addMetaData(MWEMetadataType.SOURCE_SENTENCE_ID_IN_DOC, String.valueOf(i3));
        return mWEMetadata;
    }

    /**
     * Stores paragraph-level context in {@code mWEMetadata}.
     *
     * @param mWEMetadata metadata object to fill
     * @param i index of the source paragraph within the document
     * @param i2 number of sentences in the source paragraph
     * @param i3 number of paragraphs in the document
     * @param i4 index of the source sentence within its paragraph
     * @param i5 number of sentences in the document
     */
    protected void addOtherMetadata(MWEMetadata mWEMetadata, int i, int i2, int i3, int i4, int i5) {
        mWEMetadata.addMetaData(MWEMetadataType.SOURCE_PARAGRAPH_ID_IN_DOC, String.valueOf(i));
        mWEMetadata.addMetaData(MWEMetadataType.SENTENCES_IN_PARAGRAPH, String.valueOf(i2));
        mWEMetadata.addMetaData(MWEMetadataType.PARAGRAPHS_IN_DOC, String.valueOf(i3));
        mWEMetadata.addMetaData(MWEMetadataType.SOURCE_SENTENCE_ID_IN_PARAGRAPH, String.valueOf(i4));
        mWEMetadata.addMetaData(MWEMetadataType.SENTENCES_IN_DOC, String.valueOf(i5));
    }

    /** Serializes {@code mWEMetadata} and sets it as the payload of {@code payloadAttribute}. */
    public void addPayloadAttribute(PayloadAttribute payloadAttribute, MWEMetadata mWEMetadata) {
        payloadAttribute.setPayload(new BytesRef(MWEMetadata.serialize(mWEMetadata)));
    }
}
