package pl.edu.icm.coansys.deduplication.organization;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.hadoop.io.BytesWritable;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.WritableFactory$;
import org.apache.spark.graphx.Edge;
import org.apache.spark.graphx.Graph;
import org.apache.spark.graphx.Graph$;
import org.apache.spark.graphx.lib.ConnectedComponents$;
import org.apache.spark.rdd.RDD;
import org.apache.spark.rdd.RDD$;
import org.apache.spark.rdd.SequenceFileRDDFunctions;
import pl.edu.icm.coansys.models.OrganizationProtos;
import scala.Tuple2;
import scala.math.Ordering$Long$;
import scala.math.Ordering$String$;
import scala.reflect.ClassTag$;
import scala.runtime.ScalaRunTime$;

/* compiled from: DoDeduplication.scala */
/* loaded from: input_file:pl/edu/icm/coansys/deduplication/organization/DoDeduplication$.class */
public final class DoDeduplication$ {
    public static final DoDeduplication$ MODULE$ = null;

    static {
        new DoDeduplication$();
    }

    public String simplify(String str) {
        return str.toLowerCase(Locale.ENGLISH).replaceAll("[^a-z0-9]", "");
    }

    public HashFunction hf() {
        return Hashing.md5();
    }

    public long longHash(String str) {
        return hf().newHasher().putString((CharSequence) str, Charset.forName("UTF-8")).hash().asLong();
    }

    public String getOrganizationName(OrganizationProtos.OrganizationWrapper organizationWrapper) {
        return organizationWrapper.getOrganizationMetadata().getOriginalNameCount() > 0 ? organizationWrapper.getOrganizationMetadata().getOriginalName(0) : organizationWrapper.getOrganizationMetadata().getEnglishNameCount() > 0 ? organizationWrapper.getOrganizationMetadata().getEnglishName(0) : "";
    }

    public RDD<byte[]> dedupOrganizations(RDD<OrganizationProtos.OrganizationWrapper> rdd) {
        RDD flatMap = RDD$.MODULE$.rddToPairRDDFunctions(rdd.flatMap(new DoDeduplication$$anonfun$1(), ClassTag$.MODULE$.apply(Tuple2.class)), ClassTag$.MODULE$.apply(String.class), ClassTag$.MODULE$.apply(ScalaRunTime$.MODULE$.arrayClass(Byte.TYPE)), Ordering$String$.MODULE$).groupByKey().flatMap(new DoDeduplication$$anonfun$2(), ClassTag$.MODULE$.apply(Tuple2.class));
        Graph apply = Graph$.MODULE$.apply(rdd.map(new DoDeduplication$$anonfun$5(), ClassTag$.MODULE$.apply(Tuple2.class)), flatMap.map(new DoDeduplication$$anonfun$6(), ClassTag$.MODULE$.apply(Edge.class)), Graph$.MODULE$.apply$default$3(), Graph$.MODULE$.apply$default$4(), Graph$.MODULE$.apply$default$5(), ClassTag$.MODULE$.apply(ScalaRunTime$.MODULE$.arrayClass(Byte.TYPE)), ClassTag$.MODULE$.apply(String.class));
        return RDD$.MODULE$.rddToPairRDDFunctions(RDD$.MODULE$.rddToPairRDDFunctions(apply.vertices(), ClassTag$.MODULE$.apply(Long.TYPE), ClassTag$.MODULE$.apply(ScalaRunTime$.MODULE$.arrayClass(Byte.TYPE)), Ordering$Long$.MODULE$).cogroup(ConnectedComponents$.MODULE$.run(apply, ClassTag$.MODULE$.apply(ScalaRunTime$.MODULE$.arrayClass(Byte.TYPE)), ClassTag$.MODULE$.apply(String.class)).vertices()).flatMap(new DoDeduplication$$anonfun$7(), ClassTag$.MODULE$.apply(Tuple2.class)), ClassTag$.MODULE$.Long(), ClassTag$.MODULE$.apply(ScalaRunTime$.MODULE$.arrayClass(Byte.TYPE)), Ordering$Long$.MODULE$).groupByKey().map(new DoDeduplication$$anonfun$8(), ClassTag$.MODULE$.apply(ScalaRunTime$.MODULE$.arrayClass(Byte.TYPE)));
    }

    public void main(String[] strArr) {
        String str = strArr[0];
        SparkContext sparkContext = new SparkContext(new SparkConf().setAppName("Organization deduplication"));
        RDD map = sparkContext.sequenceFile(str, sparkContext.sequenceFile$default$2(), ClassTag$.MODULE$.apply(String.class), ClassTag$.MODULE$.apply(BytesWritable.class), new DoDeduplication$$anonfun$9(), new DoDeduplication$$anonfun$10()).map(new DoDeduplication$$anonfun$11(), ClassTag$.MODULE$.apply(Tuple2.class));
        if (map.isEmpty()) {
            SequenceFileRDDFunctions rddToSequenceFileRDDFunctions = RDD$.MODULE$.rddToSequenceFileRDDFunctions(map, ClassTag$.MODULE$.apply(String.class), ClassTag$.MODULE$.apply(ScalaRunTime$.MODULE$.arrayClass(Byte.TYPE)), WritableFactory$.MODULE$.stringWritableFactory(), WritableFactory$.MODULE$.bytesWritableFactory());
            rddToSequenceFileRDDFunctions.saveAsSequenceFile(strArr[1], rddToSequenceFileRDDFunctions.saveAsSequenceFile$default$2());
        } else {
            SequenceFileRDDFunctions rddToSequenceFileRDDFunctions2 = RDD$.MODULE$.rddToSequenceFileRDDFunctions(dedupOrganizations(map.map(new DoDeduplication$$anonfun$12(), ClassTag$.MODULE$.apply(OrganizationProtos.OrganizationWrapper.class))).map(new DoDeduplication$$anonfun$13(), ClassTag$.MODULE$.apply(Tuple2.class)), ClassTag$.MODULE$.apply(String.class), ClassTag$.MODULE$.apply(ScalaRunTime$.MODULE$.arrayClass(Byte.TYPE)), WritableFactory$.MODULE$.stringWritableFactory(), WritableFactory$.MODULE$.bytesWritableFactory());
            rddToSequenceFileRDDFunctions2.saveAsSequenceFile(strArr[1], rddToSequenceFileRDDFunctions2.saveAsSequenceFile$default$2());
        }
    }

    private DoDeduplication$() {
        MODULE$ = this;
    }
}
