package ws.palladian.extraction.location.disambiguation;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.classification.dt.QuickDtLearner;
import ws.palladian.classification.dt.QuickDtModel;
import ws.palladian.core.ClassifyingTagger;
import ws.palladian.core.Instance;
import ws.palladian.core.InstanceBuilder;
import ws.palladian.extraction.location.DefaultCandidateExtractor;
import ws.palladian.extraction.location.Location;
import ws.palladian.extraction.location.LocationAnnotation;
import ws.palladian.extraction.location.LocationSource;
import ws.palladian.extraction.location.PalladianLocationExtractor;
import ws.palladian.extraction.location.evaluation.LocationDocument;
import ws.palladian.extraction.location.evaluation.TudLoc2013DatasetIterable;
import ws.palladian.extraction.location.persistence.LocationDatabase;
import ws.palladian.helper.collection.CompositeIterator;
import ws.palladian.helper.geo.GeoCoordinate;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.persistence.DatabaseManagerFactory;

/* loaded from: input_file:ws/palladian/extraction/location/disambiguation/FeatureBasedDisambiguationLearner.class */
/**
 * Builds training data for feature-based location disambiguation and trains a
 * {@link QuickDtModel} from it.
 *
 * <p>For every document of a dataset, location candidates are tagged, looked up in the
 * {@link LocationSource}, and turned into feature vectors by the configured
 * {@link LocationFeatureExtractor}. Each candidate is then labelled positive or negative by
 * comparing it against the document's gold-standard annotations.
 */
public class FeatureBasedDisambiguationLearner {
    private static final Logger LOGGER = LoggerFactory.getLogger(FeatureBasedDisambiguationLearner.class);

    /**
     * Upper bound (exclusive) on the distance between a candidate and an annotated location for
     * the two to be considered the same place. The unit is whatever
     * {@code GeoCoordinate#distance} returns (presumably kilometres; confirm).
     */
    private static final int MAX_DISTANCE = 50;

    private final QuickDtLearner learner;
    private final LocationFeatureExtractor featureExtraction;
    private final LocationSource locationSource;
    private final ClassifyingTagger tagger;

    /**
     * Creates a learner backed by a random forest.
     *
     * @param locationSource source used to look up location candidates, not {@code null}
     * @param classifyingTagger tagger producing the candidate annotations from document text
     * @param i value handed to {@link QuickDtLearner#randomForest(int)} (presumably the number
     *        of trees; confirm)
     * @param locationFeatureExtractor extractor turning candidates into feature vectors
     * @throws NullPointerException if {@code locationSource} is {@code null}
     */
    public FeatureBasedDisambiguationLearner(LocationSource locationSource, ClassifyingTagger classifyingTagger, int i, LocationFeatureExtractor locationFeatureExtractor) {
        Validate.notNull(locationSource, "locationSource must not be null");
        this.locationSource = locationSource;
        this.tagger = classifyingTagger;
        this.learner = QuickDtLearner.randomForest(i);
        this.featureExtraction = locationFeatureExtractor;
    }

    /**
     * Trains a model from a single dataset directory.
     *
     * @param file dataset directory readable by {@link TudLoc2013DatasetIterable}
     * @return the trained model
     */
    public QuickDtModel learn(File file) {
        return learn(new TudLoc2013DatasetIterable(file).iterator());
    }

    /**
     * Trains one model from the concatenation of several dataset directories.
     *
     * @param fileArr dataset directories readable by {@link TudLoc2013DatasetIterable}, not
     *        {@code null}
     * @return the trained model
     * @throws NullPointerException if {@code fileArr} is {@code null}
     */
    public QuickDtModel learn(File... fileArr) {
        Validate.notNull(fileArr, "datasetDirectories must not be null");
        List<Iterator<LocationDocument>> datasetIterators = new ArrayList<>(fileArr.length);
        for (File datasetDirectory : fileArr) {
            datasetIterators.add(new TudLoc2013DatasetIterable(datasetDirectory).iterator());
        }
        return learn(new CompositeIterator<LocationDocument>(datasetIterators));
    }

    /**
     * Trains a model from the given documents.
     *
     * @param it iterator over the training documents
     * @return the trained model
     */
    public QuickDtModel learn(Iterator<LocationDocument> it) {
        return this.learner.train(createTrainingData(it));
    }

    /**
     * Consumes the iterator and creates labelled training instances for every document.
     *
     * @param it iterator over the training documents
     * @return the labelled instances of all documents
     */
    public Set<Instance> createTrainingData(Iterator<LocationDocument> it) {
        Set<Instance> instances = new HashSet<>();
        while (it.hasNext()) {
            LocationDocument document = it.next();
            String text = document.getText();
            Set<ClassifiableLocation> candidates = this.featureExtraction.extract(text,
                    PalladianLocationExtractor.fetchLocations(this.locationSource, this.tagger.getAnnotations(text)));
            instances.addAll(createTrainData(candidates, document.getAnnotations()));
        }
        return instances;
    }

    /**
     * Labels each candidate of one document: positive if it matches any gold-standard
     * annotation, negative otherwise.
     *
     * @param set candidates of one document together with their feature vectors
     * @param list gold-standard annotations of the same document
     * @return one instance per candidate (duplicates collapse according to the set semantics of
     *         {@link Instance})
     */
    private Set<Instance> createTrainData(Set<ClassifiableLocation> set, List<LocationAnnotation> list) {
        Set<Instance> instances = new HashSet<>();
        int positiveCount = 0;
        for (ClassifiableLocation candidate : set) {
            boolean positive = matchesAnyAnnotation(candidate.getLocation(), list);
            if (positive) {
                positiveCount++;
            }
            instances.add(new InstanceBuilder().add(candidate.getFeatureVector()).create(positive));
        }
        // Floating-point division, guarded against an empty candidate set; integer division
        // would truncate the ratio to 0 and throw on a zero divisor.
        int candidateCount = set.size();
        double positivePercentage = candidateCount == 0 ? 0.0d : 100.0d * positiveCount / candidateCount;
        LOGGER.debug("{} positive instances in {} ({}%)",
                new Object[] {positiveCount, candidateCount, MathHelper.round(positivePercentage, 2)});
        return instances;
    }

    /**
     * Checks whether a candidate corresponds to one of the gold-standard annotations. A match
     * requires that both locations have coordinates less than {@link #MAX_DISTANCE} apart, share
     * a common name, and are of the same type.
     *
     * @param candidate the candidate location
     * @param annotations gold-standard annotations to compare against
     * @return {@code true} if at least one annotation matches the candidate
     */
    private static boolean matchesAnyAnnotation(Location candidate, List<LocationAnnotation> annotations) {
        GeoCoordinate candidateCoordinate = candidate.getCoordinate();
        if (candidateCoordinate == null) {
            // Without a coordinate the distance criterion can never be satisfied.
            return false;
        }
        for (LocationAnnotation annotation : annotations) {
            Location annotated = annotation.getLocation();
            GeoCoordinate annotatedCoordinate = annotated.getCoordinate();
            boolean nearby = annotatedCoordinate != null
                    && candidateCoordinate.distance(annotatedCoordinate) < MAX_DISTANCE;
            if (nearby && candidate.commonName(annotated) && candidate.getType().equals(annotated.getType())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Trains a disambiguation model on three locally stored datasets and serializes it to
     * {@code locationDisambiguationModel.ser.gz}.
     *
     * @param strArr ignored
     * @throws IOException if the model cannot be serialized
     */
    public static void main(String[] strArr) throws IOException {
        LocationSource locationSource = DatabaseManagerFactory.create(LocationDatabase.class, "locations");
        FeatureBasedDisambiguationLearner disambiguationLearner = new FeatureBasedDisambiguationLearner(
                locationSource, DefaultCandidateExtractor.INSTANCE, 100, new ConfigurableFeatureExtractor());
        File tudLocTraining = new File("/Users/pk/Dropbox/Uni/Datasets/TUD-Loc-2013/TUD-Loc-2013_V2/1-training");
        File lglTraining = new File("/Users/pk/Dropbox/Uni/Dissertation_LocationLab/LGL-converted/1-train");
        File clustTraining = new File("/Users/pk/Dropbox/Uni/Dissertation_LocationLab/CLUST-converted/1-train");
        // NOTE(review): the three single-dataset models below are trained and then discarded;
        // only the combined model is persisted. Kept as-is pending confirmation that the calls
        // have no required side effect; if not, they can be removed to save training time.
        disambiguationLearner.learn(tudLocTraining);
        disambiguationLearner.learn(lglTraining);
        disambiguationLearner.learn(clustTraining);
        QuickDtModel combinedModel = disambiguationLearner.learn(tudLocTraining, lglTraining, clustTraining);
        FileHelper.serialize(combinedModel, "locationDisambiguationModel.ser.gz");
    }
}
