package pl.edu.icm.coansys.classification.documents.pig.proceeders;

import com.google.common.base.Joiner;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.pig.EvalFunc;
import org.apache.pig.PigServer;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.DefaultDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import pl.edu.icm.coansys.classification.documents.auxil.StackTraceExtractor;
import pl.edu.icm.coansys.commons.java.PorterStemmer;
import pl.edu.icm.coansys.commons.java.StopWordsRemover;
import pl.edu.icm.coansys.disambiguation.auxil.DiacriticsRemover;

/* loaded from: input_file:pl/edu/icm/coansys/classification/documents/pig/proceeders/STEMMED_PAIRS.class */
public class STEMMED_PAIRS extends EvalFunc<DataBag> {
    public Schema outputSchema(Schema schema) {
        try {
            return Schema.generateNestedSchema((byte) 110, new byte[]{55, 55});
        } catch (FrontendException e) {
            throw new IllegalStateException((Throwable) e);
        }
    }

    /* renamed from: exec, reason: merged with bridge method [inline-methods] */
    public DataBag m29exec(Tuple tuple) throws IOException {
        if (tuple == null || tuple.size() == 0) {
            return null;
        }
        try {
            String str = (String) tuple.get(0);
            String[] strArr = new String[3];
            for (int i = 1; i < 4; i++) {
                Object obj = tuple.get(i);
                strArr[i - 1] = obj == null ? "" : obj.toString();
            }
            String replaceAll = DiacriticsRemover.removeDiacritics(Joiner.on(" ").join(strArr).toLowerCase()).replaceAll("[^a-z ]", "");
            PorterStemmer porterStemmer = new PorterStemmer();
            ArrayList arrayList = new ArrayList();
            for (String str2 : replaceAll.split(" ")) {
                if (!StopWordsRemover.isAnEnglishStopWords(str2)) {
                    porterStemmer.add(str2.toCharArray(), str2.length());
                    porterStemmer.stem();
                    arrayList.add(TupleFactory.getInstance().newTuple(Arrays.asList(str, porterStemmer.toString())));
                }
            }
            return new DefaultDataBag(arrayList);
        } catch (Exception e) {
            throw new IOException("Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
        }
    }

    public static void main(String[] strArr) {
        try {
            runQuery(new PigServer("local"));
        } catch (Exception e) {
        }
    }

    public static void runQuery(PigServer pigServer) throws IOException {
        pigServer.registerJar("target/document-classification-1.0-SNAPSHOT-jar-with-depedencies.jar");
        pigServer.registerQuery("raw = LOAD 'hbase://testProto' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('m:mproto','-loadKey true') AS (id:bytearray, proto:bytearray);");
        pigServer.registerQuery("extracted = FOREACH raw GENERATE pl.edu.icm.coansys.classification.pig.EXTRACT(raw);");
        pigServer.registerQuery("DUMP raw;");
    }
}
