package pl.edu.icm.commoncrawl.filters.wholeDomain.sameKeywords;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.edu.icm.commoncrawl.filters.AbstractFilterAllValuesFromKeyAtOnceReducer;
import pl.edu.icm.commoncrawl.filters.Decision;
import pl.edu.icm.generated.protobuf.commoncrawl.ScholarRecordProtos;

/* loaded from: input_file:pl/edu/icm/commoncrawl/filters/wholeDomain/sameKeywords/SameKeywordsReducer.class */
/**
 * Filter reducer that adds a REJECT support to every record of a key (the field names call it a
 * "domain") when more than {@link #minCount} of the key's pages carry one and the same set of
 * {@code keywords} meta values.
 *
 * <p>The class only overrides hooks of {@link AbstractFilterAllValuesFromKeyAtOnceReducer}; the
 * order in which the base class invokes them is not visible here. The hook names suggest
 * {@code resetBeforeNextKey} -> {@code preprocessrecord} per record ->
 * {@code calculateStatiticsBeforeSecondRun} -> {@code makeDecisionDuringReduce} per record —
 * TODO confirm against the base class.
 */
public class SameKeywordsReducer extends AbstractFilterAllValuesFromKeyAtOnceReducer {
    private static final Logger log = LoggerFactory.getLogger(SameKeywordsReducer.class);

    /** Share of the key's pages (0..1) a single keyword set must strictly exceed to be suspicious. */
    double minCount = 0.5d;

    /** Keys with fewer pages than this never receive a REJECT support. */
    int minNumPagesToWork = 4;

    /** Confidence attached to the REJECT support. */
    double confidenceReject = 0.9d;

    /** Once more distinct keyword sets than this are collected, tracking stops and the list is cleared. */
    int maxNumOfKeywordsSets = 10000;

    /** At most this many comma-separated tokens are read from a single keywords meta entry. */
    int maxNumOfKeywords = 80;

    /** Set when tracking was abandoned and the state at that moment looked fine. */
    boolean isDomainLargeAndOK = false;

    /** Set when tracking was abandoned and the state at that moment already looked bad. */
    boolean isDomainLargeAndBad = false;

    /**
     * Distinct keyword sets seen for the current key, each with its page count. Starts out empty
     * (rather than null) so the hooks do not fail if they run before {@link #resetBeforeNextKey()}.
     */
    ArrayList<SetWithNumber> keywordsSetsList = new ArrayList<>();

    /** Number of records passed to {@link #preprocessrecord(Decision)} for the current key. */
    int numPagesInDomain = 0;

    /** Support to attach to every record of the current key, or null when nothing is to be added. */
    Decision.SingleFilterSupport sfs;

    /** A distinct keyword set together with the number of pages that carried exactly that set. */
    public static class SetWithNumber {
        Set<String> keywordsOnPage;

        /** Starts at 1: an instance is created when the set is seen for the first time. */
        int numPages = 1;

        SetWithNumber() {
        }
    }

    /**
     * Returns the name under which this filter reports its support.
     *
     * @return the constant {@code "SameKeywordsInDomainReducer"}
     */
    @Override
    public final String getNameOfFilter() {
        return "SameKeywordsInDomainReducer";
    }

    /**
     * Records one page's keyword set: bumps the counter of an equal, already known set or appends
     * a new entry. Does nothing once one of the "large domain" flags is set.
     *
     * <p>When the list grows beyond {@link #maxNumOfKeywordsSets}, the current verdict is frozen
     * into {@link #isDomainLargeAndOK} / {@link #isDomainLargeAndBad} and the list is cleared.
     *
     * @param arrayList list of known sets to update
     * @param set keyword set of the page being processed
     */
    void addSetToList(ArrayList<SetWithNumber> arrayList, Set<String> set) {
        if (this.isDomainLargeAndBad || this.isDomainLargeAndOK) {
            return;
        }
        for (SetWithNumber known : arrayList) {
            // Set.equals is exactly "same size and same elements", i.e. mutual containment.
            if (known.keywordsOnPage.equals(set)) {
                known.numPages++;
                return;
            }
        }
        SetWithNumber fresh = new SetWithNumber();
        fresh.keywordsOnPage = set;
        arrayList.add(fresh);
        if (arrayList.size() > this.maxNumOfKeywordsSets) {
            // Note: the verdict is computed from this.keywordsSetsList, not from the parameter.
            if (isDomainOKForCurrentState()) {
                this.isDomainLargeAndOK = true;
            } else {
                this.isDomainLargeAndBad = true;
            }
            log.info("too much keywords sets clearing");
            arrayList.clear();
        }
    }

    /** Clears all per-key state: page counter, collected keyword sets and both "large" flags. */
    @Override
    protected void resetBeforeNextKey() {
        this.numPagesInDomain = 0;
        this.keywordsSetsList = new ArrayList<>();
        this.isDomainLargeAndOK = false;
        this.isDomainLargeAndBad = false;
    }

    /**
     * Counts the record and collects its keywords: every meta entry named {@code keywords}
     * (case-insensitive) that has content is split on commas, and each token is lower-cased and
     * trimmed. Only the first {@link #maxNumOfKeywords} tokens of an entry are used.
     *
     * @param decision decision whose subject supplies the meta entries
     */
    @Override
    protected void preprocessrecord(Decision decision) {
        this.numPagesInDomain++;
        Set<String> keywords = new TreeSet<>();
        for (ScholarRecordProtos.MetaNameP metaNameP : decision.getSubject().getMetaNameList()) {
            if ("keywords".equalsIgnoreCase(metaNameP.getName()) && metaNameP.hasContent()) {
                int tokensSeen = 0;
                boolean overflowLogged = false; // log the overflow once per meta entry only
                for (String token : metaNameP.getContent().split(",")) {
                    if (tokensSeen++ < this.maxNumOfKeywords) {
                        keywords.add(token.toLowerCase().trim());
                    } else if (!overflowLogged) {
                        log.info("too much kewords skippng");
                        overflowLogged = true;
                    }
                }
            }
        }
        addSetToList(this.keywordsSetsList, keywords);
    }

    /**
     * Checks the keyword sets collected so far.
     *
     * @return false if some set with more than two keywords occurs on more than
     *     {@link #minCount} of the pages counted so far; true otherwise
     */
    boolean isDomainOKForCurrentState() {
        // Divide as double: an int quotient would truncate to 0 or 1 and the fractional
        // threshold could then only be exceeded when every page shared the same set.
        final double totalPages = this.numPagesInDomain;
        for (SetWithNumber candidate : this.keywordsSetsList) {
            if (candidate.keywordsOnPage.size() > 2 && candidate.numPages / totalPages > this.minCount) {
                return false;
            }
        }
        return true;
    }

    /**
     * Decides whether the current key gets a REJECT support and stores it in {@link #sfs}.
     * A support is created only if at least {@link #minNumPagesToWork} pages were counted, the
     * key was not frozen as "large and OK", and it is either frozen as "large and bad" or fails
     * {@link #isDomainOKForCurrentState()}. Otherwise {@link #sfs} is left null.
     */
    @Override
    protected void calculateStatiticsBeforeSecondRun() {
        this.sfs = null;
        if (this.numPagesInDomain < this.minNumPagesToWork || this.isDomainLargeAndOK) {
            return;
        }
        if (this.isDomainLargeAndBad || !isDomainOKForCurrentState()) {
            this.sfs = new Decision.SingleFilterSupport(getNameOfFilter(), Decision.KIND.REJECT, this.confidenceReject);
        }
    }

    /**
     * Attaches the prepared support, if any, to the given decision.
     *
     * @param decision decision to annotate
     */
    @Override
    protected void makeDecisionDuringReduce(Decision decision) {
        if (this.sfs != null) {
            decision.addSupport(this.sfs);
        }
    }
}
