package pl.edu.icm.coansys.disambiguation.author.pig.extractor;

import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DefaultDataBag;
import org.apache.pig.data.TupleFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.edu.icm.coansys.models.DocumentProtos;

/* loaded from: input_file:pl/edu/icm/coansys/disambiguation/author/pig/extractor/EX_KEYWORDS_SPLIT.class */
/**
 * Extractor that splits each keyword phrase of a document into tokens and
 * returns the distinct normalized tokens, one single-field tuple per token.
 *
 * <p>Phrases are split on runs of non-word characters ({@code [\W]+}). Each
 * token is passed through {@code normalizeExtracted} (not defined in this
 * class; presumably inherited from {@link DisambiguationExtractorDocument})
 * and de-duplicated via a {@link HashSet} before being placed in the bag.
 */
public class EX_KEYWORDS_SPLIT extends DisambiguationExtractorDocument {
    private static final Logger logger = LoggerFactory.getLogger(EX_KEYWORDS_SPLIT.class);

    /** Token separator: one or more non-word characters. Compiled once instead of per phrase. */
    private static final Pattern TOKEN_SEPARATOR = Pattern.compile("[\\W]+");

    /**
     * Extracts the distinct normalized tokens from all keyword lists of the
     * document, regardless of their language.
     *
     * @param obj the document; must be a {@link DocumentProtos.DocumentMetadata}
     * @return a bag with one single-field tuple per distinct normalized token
     * @throws ClassCastException if {@code obj} is not a {@code DocumentMetadata}
     */
    @Override
    public DataBag extract(Object obj) {
        DocumentProtos.DocumentMetadata metadata = (DocumentProtos.DocumentMetadata) obj;
        Set<Object> tokens = new HashSet<Object>();
        for (DocumentProtos.KeywordsList keywordsList : metadata.getKeywordsList()) {
            addNormalizedTokens(keywordsList.getKeywordsList(), tokens);
        }
        return toTupleBag(tokens);
    }

    /**
     * Extracts the distinct normalized tokens from those keyword lists whose
     * language equals {@code str}, compared case-insensitively. Logs at INFO
     * level when the resulting bag is empty.
     *
     * @param obj the document; must be a {@link DocumentProtos.DocumentMetadata}
     * @param str the language to select keyword lists by
     * @return a bag with one single-field tuple per distinct normalized token;
     *         empty when no keyword list matches the language
     * @throws ClassCastException if {@code obj} is not a {@code DocumentMetadata}
     */
    @Override
    public DataBag extract(Object obj, String str) {
        DocumentProtos.DocumentMetadata metadata = (DocumentProtos.DocumentMetadata) obj;
        Set<Object> tokens = new HashSet<Object>();
        for (DocumentProtos.KeywordsList keywordsList : metadata.getKeywordsList()) {
            if (keywordsList.getLanguage().equalsIgnoreCase(str)) {
                addNormalizedTokens(keywordsList.getKeywordsList(), tokens);
            }
        }
        DataBag bag = toTupleBag(tokens);
        if (bag.size() == 0) {
            // Placeholders render the same text as before without building the string eagerly.
            logger.info("No keywords IN GIVEN LANG ({}) out of {} keywords!",
                    str, metadata.getKeywordsCount());
        }
        return bag;
    }

    /**
     * Splits every phrase on {@link #TOKEN_SEPARATOR} and adds the normalized
     * form of each resulting token to {@code sink}.
     */
    private void addNormalizedTokens(Iterable<String> phrases, Set<Object> sink) {
        for (String phrase : phrases) {
            // A phrase that is empty or starts with a separator yields an empty
            // token; it is handed to normalizeExtracted like any other token.
            for (String token : TOKEN_SEPARATOR.split(phrase)) {
                sink.add(normalizeExtracted(token));
            }
        }
    }

    /** Wraps each value in its own single-field tuple and collects the tuples into a new bag. */
    private DataBag toTupleBag(Set<Object> values) {
        DataBag bag = new DefaultDataBag();
        TupleFactory tupleFactory = TupleFactory.getInstance();
        for (Object value : values) {
            bag.add(tupleFactory.newTuple(value));
        }
        return bag;
    }
}
