package pl.edu.icm.coansys.similarity.pig.udf;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DefaultDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import pl.edu.icm.coansys.commons.java.DiacriticsRemover;
import pl.edu.icm.coansys.commons.java.PorterStemmer;
import pl.edu.icm.coansys.commons.java.StackTraceExtractor;

/* loaded from: input_file:pl/edu/icm/coansys/similarity/pig/udf/ExtendedStemmedPairs.class */
/**
 * Pig UDF that converts a text value into a bag of normalized, stemmed terms.
 *
 * <p>The input text is lower-cased, split on whitespace, stripped of leading and
 * trailing {@code /} and {@code -} characters, filtered by length and by
 * {@link AllLangStopWordFilter}, passed through {@link DiacriticsRemover} and finally
 * stemmed with {@link PorterStemmer}.
 *
 * <p>The character-removal mode is chosen by the constructor argument: the value
 * {@code "latin"} (the default) enables per-token removal of everything outside
 * {@code [a-z0-9-_/ ]}; any other value enables a whole-text filter that keeps
 * characters from U+0080 to U+FFFF in addition to ASCII letters, digits, whitespace,
 * {@code _}, {@code -} and {@code '}.
 */
public class ExtendedStemmedPairs extends EvalFunc<DataBag> {
    private static final String SPACE = " ";
    /** Removal mode that triggers the per-token ASCII-only clean-up. */
    private static final String LATIN = "latin";

    // Patterns are compiled once; getStemmedPairs runs for every input row.
    private static final Pattern UNDERSCORE_RUN = Pattern.compile("[_]+");
    private static final Pattern HYPHEN_RUN = Pattern.compile("[-]+");
    private static final Pattern NON_TERM_CHARS =
            Pattern.compile("([^\\u0080-\\uFFFF a-zA-Z_\\-\\d\\s'])+");
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");
    private static final Pattern LEADING_SEPARATORS = Pattern.compile("^[/\\-]+");
    private static final Pattern TRAILING_SEPARATORS = Pattern.compile("[\\-/]+$");
    private static final Pattern NON_LATIN_CHARS = Pattern.compile("[^a-z\\d\\-_/ ]+");

    /** Removal mode; compared against {@code "latin"}. May be null (treated as non-latin). */
    private final String typeOfRemoval;
    private final AllLangStopWordFilter stopwordsFilter;

    /**
     * Describes the UDF output: a bag (named via {@code getSchemaName} from the
     * lower-cased class name and the input schema) of tuples called {@code term},
     * each holding a single field called {@code value}.
     *
     * @param schema the input schema supplied by Pig
     * @return the output schema, or {@code null} if building it failed (the failure is logged)
     */
    public Schema outputSchema(Schema schema) {
        try {
            // Type bytes are Pig DataType codes: 55 = CHARARRAY, 110 = TUPLE, 120 = BAG.
            Schema valueSchema = new Schema(new Schema.FieldSchema("value", (byte) 55));
            Schema termSchema = new Schema(new Schema.FieldSchema("term", valueSchema, (byte) 110));
            // Locale.ROOT keeps the generated name stable regardless of the JVM default locale.
            String bagName = getSchemaName(getClass().getName().toLowerCase(Locale.ROOT), schema);
            return new Schema(new Schema.FieldSchema(bagName, termSchema, (byte) 120));
        } catch (Exception e) {
            this.log.error("Error in the output Schema creation", e);
            this.log.error(StackTraceExtractor.getStackTrace(e));
            return null;
        }
    }

    /**
     * Creates the UDF with the default {@code "latin"} removal mode.
     *
     * @throws IOException declared by the {@link AllLangStopWordFilter} construction
     */
    public ExtendedStemmedPairs() throws IOException {
        this(LATIN);
    }

    /**
     * Creates the UDF with an explicit removal mode.
     *
     * @param str removal mode; {@code "latin"} selects per-token ASCII-only clean-up,
     *            any other value (including {@code null}) selects the whole-text filter
     * @throws IOException declared by the {@link AllLangStopWordFilter} construction
     */
    public ExtendedStemmedPairs(String str) throws IOException {
        this.typeOfRemoval = str;
        this.stopwordsFilter = new AllLangStopWordFilter();
    }

    /**
     * Normalizes {@code str} and returns the stems of its usable tokens, in input order.
     *
     * <p>A token is kept only if, after clean-up, it is longer than three characters
     * and is not reported as a stop word by {@link AllLangStopWordFilter}.
     *
     * @param str text to process; {@code null} yields an empty list
     * @return a new mutable list of stemmed terms (possibly empty, never {@code null})
     * @throws IOException declared for callers; not thrown directly by this method's own code
     */
    public List<String> getStemmedPairs(String str) throws IOException {
        List<String> stems = new ArrayList<String>();
        if (str == null) {
            return stems;
        }
        boolean latin = LATIN.equals(this.typeOfRemoval);

        // Locale.ROOT avoids locale-specific case mappings (e.g. Turkish dotless i)
        // that would otherwise break the [a-z] based filtering below.
        String text = str.toLowerCase(Locale.ROOT);
        text = UNDERSCORE_RUN.matcher(text).replaceAll("_");
        text = HYPHEN_RUN.matcher(text).replaceAll("-");
        if (!latin) {
            text = NON_TERM_CHARS.matcher(text).replaceAll(SPACE);
        }
        text = WHITESPACE_RUN.matcher(text).replaceAll(SPACE).trim();
        if (text.length() == 0) {
            return stems;
        }

        PorterStemmer porterStemmer = new PorterStemmer();
        for (String rawToken : StringUtils.split(text, SPACE)) {
            String token = LEADING_SEPARATORS.matcher(rawToken).replaceAll("");
            token = TRAILING_SEPARATORS.matcher(token).replaceAll("");
            if (latin) {
                // NOTE(review): this runs before DiacriticsRemover, so accented letters are
                // replaced by a space instead of being folded, and the token may end up
                // containing embedded spaces. Behavior kept as-is; confirm it is intended.
                token = NON_LATIN_CHARS.matcher(token).replaceAll(SPACE);
            }
            if (token.length() <= 3 || this.stopwordsFilter.isInAllStopwords(token)) {
                continue;
            }
            String folded = DiacriticsRemover.removeDiacritics(token);
            porterStemmer.add(folded.toCharArray(), folded.length());
            porterStemmer.stem();
            stems.add(porterStemmer.toString());
        }
        return stems;
    }

    /**
     * Pig entry point: stems the text held in the first field of {@code tuple}.
     *
     * @param tuple input row; its first field is cast to {@code String}
     * @return a bag with one single-field tuple per stemmed term, or {@code null} when
     *         the tuple is null, empty, or its first field is null
     * @throws IOException wrapping any exception raised while processing the row
     */
    public DataBag exec(Tuple tuple) throws IOException {
        if (tuple == null || tuple.size() == 0 || tuple.get(0) == null) {
            return null;
        }
        try {
            List<Tuple> terms = new ArrayList<Tuple>();
            TupleFactory tupleFactory = TupleFactory.getInstance();
            for (String stem : getStemmedPairs((String) tuple.get(0))) {
                terms.add(tupleFactory.newTuple(stem));
            }
            return new DefaultDataBag(terms);
        } catch (Exception e) {
            throw new IOException("Caught exception processing input row ", e);
        }
    }

    /**
     * Decompiler-generated alias of {@link #exec(Tuple)}, kept so existing references
     * to this name keep working.
     *
     * @param tuple input row, see {@link #exec(Tuple)}
     * @return the result of {@link #exec(Tuple)}
     * @throws IOException see {@link #exec(Tuple)}
     * @deprecated use {@link #exec(Tuple)}
     */
    @Deprecated
    public DataBag m16exec(Tuple tuple) throws IOException {
        return exec(tuple);
    }

    /**
     * Manual smoke check: prints the result of removing diacritics from a sample string.
     *
     * @param strArr unused
     */
    public static void main(String[] strArr) {
        System.out.println("PartA: " + DiacriticsRemover.removeDiacritics("100688"));
    }
}
