package org.apache.lucene.analysis.jate;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import opennlp.tools.util.Span;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;

/* loaded from: input_file:org/apache/lucene/analysis/jate/OpenNLPMWEFilter.class */
/**
 * Base class for MWE filters that first buffer every token of the wrapped stream
 * ({@link #walkTokens()}) and then emit selected token ranges ("chunks") as single
 * tokens ({@link #addMWE(int)}).
 *
 * <p>This class only provides the shared bookkeeping and helpers; it does not implement
 * {@code incrementToken()} itself. How {@link #chunkSpans}, {@link #chunkTypes},
 * {@link #chunkStart} and {@link #chunkEnds} get populated is up to subclasses.
 *
 * <p>Several methods are {@code public} because the decompiler widened them from
 * {@code protected}; the visibility is kept so existing callers keep compiling.
 */
public abstract class OpenNLPMWEFilter extends MWEFilter {
    private static final Logger LOG = Logger.getLogger(OpenNLPMWEFilter.class.getSimpleName());

    /** Regex character class matching {@code .}, {@code ?} and {@code !}; not referenced in this class. */
    protected static String SENTENCE_BREAK = "[.?!]";

    /** Term text of the token this filter emits. */
    protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    /** Character offsets of the token this filter emits. */
    protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    /** Type of the token this filter emits (set from {@link #chunkTypes}). */
    protected final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    /**
     * {@code true} after construction and after {@link #resetParams()}. Not read in this class;
     * presumably subclasses use it to run {@link #walkTokens()} once per stream - confirm there.
     */
    protected boolean first = true;
    /** Attribute snapshots of every input token, in stream order, filled by {@link #walkTokens()}. */
    protected List<AttributeSource> tokenAttrs = new ArrayList<>();
    /** Only cleared here; presumably maps a chunk start index to its end indices - confirm in subclasses. */
    protected Map<Integer, List<Integer>> chunkSpans = new HashMap<>();
    /** Token type to emit for a chunk, keyed by the chunk's start index in {@link #tokenAttrs}. */
    protected Map<Integer, String> chunkTypes = new HashMap<>();
    /** Start index (into {@link #tokenAttrs}) of the chunk being emitted, or {@code -1} if none. */
    protected int chunkStart = -1;
    /** Pending exclusive end indices for the chunk starting at {@link #chunkStart}. */
    protected List<Integer> chunkEnds = new ArrayList<>();
    /** Cursor into {@link #tokenAttrs}; advanced by {@link #addMWE(int)}. */
    protected int tokenIdx = 0;

    /**
     * Creates the filter. All arguments are forwarded unchanged, in the same order, to the
     * matching {@link MWEFilter} constructor; see that class for their meaning.
     */
    public OpenNLPMWEFilter(TokenStream tokenStream, int i, int i2, int i3, int i4, boolean z, boolean z2, boolean z3, boolean z4, boolean z5, boolean z6, boolean z7, Set<String> set, boolean z8) {
        super(tokenStream, i, i2, i3, i4, z, z2, z3, z4, z5, z6, z7, set, z8);
    }

    /**
     * Creates the filter using the single-argument {@link MWEFilter} constructor.
     *
     * @param tokenStream the stream to wrap
     */
    protected OpenNLPMWEFilter(TokenStream tokenStream) {
        super(tokenStream);
    }

    /**
     * Tries to emit the tokens {@code [chunkStart, i)} as one token by writing the joined
     * text, offsets, type and sentence context into this filter's attributes.
     *
     * <p>Nothing is emitted when the first and last token carry different sentence ids, when
     * both {@code maxCharLength} and {@code minCharLength} are 0, or when the joined text
     * (after {@code stripSymbolChars}) falls outside {@code [minCharLength, maxCharLength]}.
     *
     * <p>Regardless of the outcome, {@code i} is removed from {@link #chunkEnds} and
     * {@link #tokenIdx} is advanced: to the next pending end if one remains, otherwise to
     * {@code chunkStart + 1}, in which case {@link #chunkStart} is reset to {@code -1}.
     *
     * @param i exclusive end index of the chunk in {@link #tokenAttrs}
     * @return {@code true} if the attributes now describe a new token
     */
    public boolean addMWE(int i) {
        final int lastIdx = i - 1;
        AttributeSource firstToken = this.tokenAttrs.get(this.chunkStart);
        AttributeSource lastToken = this.tokenAttrs.get(lastIdx);
        SentenceContext firstCtx = parseSentenceContextPayload(firstToken.getAttribute(PayloadAttribute.class));
        SentenceContext lastCtx = parseSentenceContextPayload(lastToken.getAttribute(PayloadAttribute.class));

        boolean added = false;
        if (!crossBoundary(firstCtx, lastCtx)) {
            StringBuilder phrase = new StringBuilder();
            for (int idx = this.chunkStart; idx <= lastIdx; idx++) {
                // toString() honours the term length; the raw buffer() may be longer than the term.
                phrase.append(this.tokenAttrs.get(idx).getAttribute(CharTermAttribute.class).toString())
                        .append(ComplexShingleFilter.DEFAULT_TOKEN_SEPARATOR);
            }
            // The length filter only runs when at least one limit is configured;
            // with both limits at 0 no token is produced.
            if (this.maxCharLength != 0 || this.minCharLength != 0) {
                String mwe = stripSymbolChars(phrase.toString().trim());
                if (mwe.length() <= this.maxCharLength && mwe.length() >= this.minCharLength) {
                    this.termAtt.setEmpty().append(mwe);
                    this.offsetAtt.setOffset(
                            firstToken.getAttribute(OffsetAttribute.class).startOffset(),
                            lastToken.getAttribute(OffsetAttribute.class).endOffset());
                    this.typeAtt.setType(this.chunkTypes.get(this.chunkStart));
                    addSentenceContextPayload(firstCtx, lastCtx);
                    added = true;
                }
            }
        }

        // Boxed on purpose: remove(Object) deletes the value, remove(int) would delete by position.
        this.chunkEnds.remove(Integer.valueOf(i));
        if (this.chunkEnds.isEmpty()) {
            this.tokenIdx = this.chunkStart + 1;
            this.chunkStart = -1;
        } else {
            this.tokenIdx = this.chunkEnds.get(0);
        }
        return added;
    }

    /**
     * Decodes a token's payload into a {@link SentenceContext}.
     *
     * @return the context, or {@code null} if the attribute or its payload is absent
     */
    private SentenceContext parseSentenceContextPayload(PayloadAttribute payloadAttribute) {
        if (payloadAttribute == null) {
            return null;
        }
        BytesRef payload = payloadAttribute.getPayload();
        return payload == null ? null : new SentenceContext(payload.utf8ToString());
    }

    /**
     * Writes the sentence context of the emitted token, combining the first token's
     * first-token index and POS tag with the last token's last-token index and sentence id.
     * Does nothing if either context is missing.
     */
    private void addSentenceContextPayload(SentenceContext sentenceContext, SentenceContext sentenceContext2) {
        if (sentenceContext == null || sentenceContext2 == null) {
            return;
        }
        addSentenceContext(this.sentenceContext, sentenceContext.getFirstTokenIdx(), sentenceContext2.getLastTokenIdx(), sentenceContext.getPosTag(), sentenceContext2.getSentenceId());
    }

    /**
     * @return {@code true} only if both contexts are present and report different sentence ids
     */
    private boolean crossBoundary(SentenceContext sentenceContext, SentenceContext sentenceContext2) {
        return sentenceContext != null
                && sentenceContext2 != null
                && sentenceContext.getSentenceId() != sentenceContext2.getSentenceId();
    }

    /**
     * Filters candidate spans: optionally trims stop words / symbolic tokens from their edges
     * (see {@link #clean(int, int, String[])}), drops spans whose token count is outside
     * {@code [minTokens, maxTokens]}, and removes duplicates with the same start and end.
     *
     * @param spanArr candidate spans over {@code strArr} (end exclusive); not modified
     * @param strArr  the token texts the spans index into
     * @return the surviving spans in their natural ({@code Comparable}) order
     */
    public Span[] prune(Span[] spanArr, String[] strArr) {
        List<Span> candidates = new ArrayList<>(Arrays.asList(spanArr));

        if (this.removeLeadingStopwords || this.removeTrailingStopwords || this.removeLeadingSymbolicTokens || this.removeTrailingSymbolicTokens) {
            // Trimmed replacements are appended after the untouched spans so the
            // pre-sort order (and thus which duplicate survives) stays well defined.
            List<Span> trimmed = new ArrayList<>();
            for (Iterator<Span> it = candidates.iterator(); it.hasNext();) {
                Span span = it.next();
                int[] bounds = clean(span.getStart(), span.getEnd(), strArr);
                if (bounds == null) {
                    it.remove();
                } else if (bounds[0] != span.getStart() || bounds[1] != span.getEnd()) {
                    trimmed.add(new Span(bounds[0], bounds[1], span.getType(), span.getProb()));
                    it.remove();
                }
            }
            candidates.addAll(trimmed);
        }

        Collections.sort(candidates);

        Set<String> seen = new HashSet<>();
        for (Iterator<Span> it = candidates.iterator(); it.hasNext();) {
            Span span = it.next();
            int tokenCount = span.getEnd() - span.getStart();
            // Set.add returns false for a start/end pair that was already kept.
            if (tokenCount > this.maxTokens
                    || tokenCount < this.minTokens
                    || !seen.add(span.getStart() + "," + span.getEnd())) {
                it.remove();
            }
        }
        // Removing elements keeps the list sorted, so no second sort is needed.
        return candidates.toArray(new Span[0]);
    }

    /**
     * Trims one stop word and/or one symbolic (punctuation-only) token from each edge of the
     * range {@code [i, i2)}, as enabled by the {@code remove*} flags, and repeats until the
     * range stops changing or is a single token long.
     *
     * @param i      inclusive start index into {@code strArr}
     * @param i2     exclusive end index into {@code strArr}
     * @param strArr token texts
     * @return {@code {start, end}} of the trimmed range, or {@code null} if nothing is left
     */
    protected int[] clean(int i, int i2, String[] strArr) {
        int start = i;
        int end = i2;

        if (this.removeLeadingStopwords && isStopWord(strArr[start])) {
            start++;
            if (start >= end) {
                return null;
            }
        }
        if (this.removeTrailingStopwords && isStopWord(strArr[end - 1])) {
            end--;
            if (start >= end) {
                return null;
            }
        }
        if (this.removeLeadingSymbolicTokens && isSymbolic(strArr[start])) {
            start++;
            if (start >= end) {
                return null;
            }
        }
        // Fixed: the trailing check was guarded by removeLeadingSymbolicTokens.
        if (this.removeTrailingSymbolicTokens && isSymbolic(strArr[end - 1])) {
            end--;
            if (start >= end) {
                return null;
            }
        }

        boolean changed = start != i || end != i2;
        if (changed && end - start != 1) {
            // Something was trimmed and more than one token remains: the new edges may need trimming too.
            return clean(start, end, strArr);
        }
        return new int[]{start, end};
    }

    /** Looks the token up in {@code stopWords}, lower-casing it first when {@code stopWordsIgnoreCase} is set. */
    private boolean isStopWord(String token) {
        String key = this.stopWordsIgnoreCase ? token.toLowerCase() : token;
        return this.stopWords.contains(key);
    }

    /** @return {@code true} if the token is empty once all {@code \p{Punct}} characters are removed */
    private boolean isSymbolic(String token) {
        return token.replaceAll("[\\p{Punct}]", "").length() == 0;
    }

    /**
     * Resets the chunk bookkeeping ({@link #first}, {@link #tokenIdx}, {@link #chunkStart},
     * {@link #chunkEnds}, {@link #chunkSpans}, {@link #chunkTypes}). The buffered
     * {@link #tokenAttrs} are left untouched.
     */
    public void resetParams() {
        this.first = true;
        this.tokenIdx = 0;
        this.chunkStart = -1;
        this.chunkEnds.clear();
        this.chunkSpans.clear();
        this.chunkTypes.clear();
    }

    /**
     * Consumes the entire input stream, appending a snapshot of each token's attributes to
     * {@link #tokenAttrs} and collecting each token's text and POS tag (read from the
     * {@link SentenceContext} encoded in the token payload). Clears this filter's attributes
     * before returning.
     *
     * @return a two-element array: {@code [0]} the token texts, {@code [1]} the POS tags,
     *         both in stream order and of equal length
     * @throws IOException if reading the input fails, or if not every token carried a
     *                     payload (token and POS counts differ)
     */
    public String[][] walkTokens() throws IOException {
        List<String> words = new ArrayList<>();
        List<String> posTags = new ArrayList<>();

        while (this.input.incrementToken()) {
            CharTermAttribute term = this.input.getAttribute(CharTermAttribute.class);
            OffsetAttribute offset = this.input.getAttribute(OffsetAttribute.class);
            String word;
            try {
                // Token text is taken as the first (endOffset - startOffset) chars of the term buffer.
                word = new String(term.buffer(), 0, offset.endOffset() - offset.startOffset());
            } catch (StringIndexOutOfBoundsException e) {
                LOG.error(ExceptionUtils.getFullStackTrace(e));
                // The offset length does not fit the buffer: fall back to the term text itself.
                word = term.toString();
            }
            words.add(word);

            PayloadAttribute payloadAtt = this.input.getAttribute(PayloadAttribute.class);
            BytesRef payload = payloadAtt != null ? payloadAtt.getPayload() : null;
            if (payload != null) {
                // A token without payload adds no POS tag and is reported by the size check below.
                posTags.add(new SentenceContext(payload.utf8ToString()).getPosTag());
            }
            this.tokenAttrs.add(this.input.cloneAttributes());
        }

        if (words.size() != posTags.size()) {
            StringBuilder sb = new StringBuilder(getClass().getName());
            sb.append(" requires both token and token POS. Tokens=").append(words.size()).append(", POS=").append(posTags.size()).append(", and they are inconsistent.").append(" Have you enabled POS tagging in your Solr analyzer chain?");
            throw new IOException(sb.toString());
        }

        clearAttributes();
        return new String[][]{words.toArray(new String[0]), posTags.toArray(new String[0])};
    }

    /** Ends the stream, clears this filter's attributes and releases the buffered tokens. */
    @Override
    public final void end() throws IOException {
        super.end();
        clearAttributes();
        this.tokenAttrs.clear();
    }

    /**
     * Prepares the filter for a new stream: resets the parent, clears the attributes, drops
     * any tokens still buffered from a previous stream and resets the chunk bookkeeping.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        clearAttributes();
        // Stale snapshots would shift every index used by the chunk bookkeeping.
        this.tokenAttrs.clear();
        resetParams();
    }
}
