package pl.edu.icm.commoncrawl.filters.wholeDomain.sameKeywords;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import org.apache.hadoop.io.WritableComparable;
import pl.edu.icm.commoncrawl.filters.AbstractFilterAllValuesFromKeyAtOnceReducer;
import pl.edu.icm.commoncrawl.filters.Decision;
import pl.edu.icm.generated.protobuf.commoncrawl.ScholarRecordProtos;

/* loaded from: input_file:pl/edu/icm/commoncrawl/filters/wholeDomain/sameKeywords/SameKeywordsReducer.class */
/**
 * Filter that inspects all decisions received for one key together and adds a
 * {@code REJECT} support to every one of them when a single set of
 * {@code keywords} meta values is shared by a large fraction of the pages.
 *
 * <p>The check is skipped (the input list is returned untouched) when fewer than
 * {@link #minNumPagesToWork} decisions are supplied.
 */
public class SameKeywordsReducer extends AbstractFilterAllValuesFromKeyAtOnceReducer {
    /** Fraction of pages (exclusive lower bound) that must share one keyword set to trigger a rejection. */
    double minCount = 0.5d;
    /** Minimum number of decisions required before the filter does anything. */
    int minNumPagesToWork = 4;
    /** Confidence attached to the {@code REJECT} support added by this filter. */
    double confidenceReject = 0.9d;

    /** A distinct keyword set together with the number of pages on which it was seen. */
    public static class SetWithNumber {
        /** The normalized keywords of the page(s). */
        Set<String> keywordsOnPage;
        /** Number of pages carrying exactly {@link #keywordsOnPage}; starts at one for the first page. */
        int numPages = 1;

        SetWithNumber() {
        }
    }

    @Override
    public final String getNameOfFilter() {
        return "SameKeywordsInDomainReducer";
    }

    /**
     * Records one page's keyword set: increments the counter of an already known,
     * equal set, or appends a new entry (with a count of one) when none matches.
     *
     * @param arrayList accumulated distinct keyword sets; modified in place
     * @param set       keyword set of the page being recorded
     */
    void addSetToList(ArrayList<SetWithNumber> arrayList, Set<String> set) {
        for (SetWithNumber known : arrayList) {
            // Set.equals is "same size and contains all", i.e. mutual containment.
            if (known.keywordsOnPage.equals(set)) {
                known.numPages++;
                return;
            }
        }
        SetWithNumber fresh = new SetWithNumber();
        fresh.keywordsOnPage = set;
        arrayList.add(fresh);
    }

    /**
     * Adds a {@code REJECT} support to every decision when one keyword set with
     * more than two entries appears on more than {@link #minCount} of the pages.
     *
     * @param writableComparable the key the decisions were grouped under (not used here)
     * @param list               decisions for all values of the key
     * @return the same {@code list} instance, with supports added when the filter fired
     */
    @Override
    protected List<Decision> makeDecisionsDuringReduce(WritableComparable<?> writableComparable, List<Decision> list) {
        if (list.size() < this.minNumPagesToWork) {
            return list;
        }
        ArrayList<SetWithNumber> keywordSets = new ArrayList<>();
        for (Decision decision : list) {
            addSetToList(keywordSets, extractKeywords(decision));
        }
        if (hasDominantKeywordSet(keywordSets, list.size())) {
            for (Decision decision : list) {
                decision.addSupport(new Decision.SingleFilterSupport(getNameOfFilter(), Decision.KIND.REJECT, this.confidenceReject));
            }
        }
        return list;
    }

    /** Collects the comma-separated, lower-cased and trimmed {@code keywords} meta values of one decision's subject. */
    private Set<String> extractKeywords(Decision decision) {
        Set<String> keywords = new TreeSet<>();
        for (ScholarRecordProtos.MetaNameP meta : decision.getSubject().getMetaNameList()) {
            if ("keywords".equalsIgnoreCase(meta.getName()) && meta.hasContent()) {
                for (String keyword : meta.getContent().split(",")) {
                    // Locale.ROOT keeps the normalization independent of the JVM's default locale.
                    keywords.add(keyword.toLowerCase(Locale.ROOT).trim());
                }
            }
        }
        return keywords;
    }

    /** Tells whether some keyword set with more than two entries covers more than {@link #minCount} of the pages. */
    private boolean hasDominantKeywordSet(List<SetWithNumber> keywordSets, int totalPages) {
        for (SetWithNumber candidate : keywordSets) {
            // Divide in floating point: an int/int quotient would be 0 for every
            // share below 100%, making the fractional threshold meaningless.
            double share = (double) candidate.numPages / totalPages;
            if (candidate.keywordsOnPage.size() > 2 && share > this.minCount) {
                return true;
            }
        }
        return false;
    }
}
