package ws.palladian.extraction.entity.dataset;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.text.DecimalFormat;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.core.Annotation;
import ws.palladian.core.AnnotationFilters;
import ws.palladian.core.Instance;
import ws.palladian.extraction.DictionaryTagger;
import ws.palladian.extraction.entity.FileFormatParser;
import ws.palladian.extraction.entity.TaggingFormat;
import ws.palladian.extraction.entity.tagger.NerHelper;
import ws.palladian.helper.ProcessHelper;
import ws.palladian.helper.ProgressMonitor;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.collection.LruMap;
import ws.palladian.helper.constants.Language;
import ws.palladian.helper.constants.SizeUnit;
import ws.palladian.helper.functional.Predicates;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.retrieval.wiki.InfoboxTypeMapper;
import ws.palladian.retrieval.wiki.MediaWikiDescriptor;
import ws.palladian.retrieval.wiki.MediaWikiUtil;
import ws.palladian.retrieval.wiki.WikiLink;
import ws.palladian.retrieval.wiki.WikiPage;

/* loaded from: input_file:ws/palladian/extraction/entity/dataset/WikipediaDatasetCreator.class */
class WikipediaDatasetCreator {
    /**
     * Type assigned to link targets that could not be retrieved or whose infobox type has no mapping.
     * {@code process} filters annotations by this tag before producing its output.
     */
    private static final String IGNORE_TAG = "*IGNORE*";
    /** Fallback name of the output sub-directory used by {@code mineWikipedia} when a page's infobox type has no mapped type. */
    private static final String NO_MAPPED_TYPE = "NONE";
    private static final Logger LOGGER = LoggerFactory.getLogger(WikipediaDatasetCreator.class);
    /** Descriptor of the wiki all articles are fetched from: Wikipedia, English language. */
    private static final MediaWikiDescriptor descriptor = MediaWikiDescriptor.Builder.wikipedia().language(Language.ENGLISH).m244create();
    /** Capacity passed to the article cache. */
    private static final int ARTICLE_CACHE_SIZE = 10000;
    /**
     * Cache of already retrieved articles, keyed by the requested title. Lookups and insertions in
     * {@code retrieveArticle} synchronize on this map.
     */
    private static final LruMap<String, WikiPage> ARTICLE_CACHE = LruMap.accessOrder(ARTICLE_CACHE_SIZE);

    /** Does nothing; every member of this class is static. */
    WikipediaDatasetCreator() {
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Tags the clean text of a wiki page with entity types and returns it in XML tagging format.
     *
     * <p>The tagging dictionary is built from the page's outgoing links (see
     * {@code resolveLinkedEntities}) plus the page's own titles when its infobox type maps to a
     * type. Additional name variants are then derived per type:
     * <ul>
     * <li>PER: the last token of the cleaned name, and "first last" for three-token names;</li>
     * <li>ORG: the name without a trailing "Inc." (optionally preceded by a comma);</li>
     * <li>LOC: names containing ", " are replaced by their first two comma-separated parts.</li>
     * </ul>
     *
     * @param wikiPage the page to process; must not be {@code null}
     * @return the page's clean text with entity annotations, as produced by {@code NerHelper.tag}
     */
    public static String process(WikiPage wikiPage) {
        Map<String, String> dictionary = resolveLinkedEntities(descriptor, wikiPage.getLinks());

        // The page itself denotes an entity if its infobox type can be mapped.
        String infoboxType = wikiPage.getInfoboxType();
        if (infoboxType != null && !infoboxType.isEmpty()) {
            String pageType = InfoboxTypeMapper.getConLLType(infoboxType);
            if (pageType != null && !pageType.isEmpty()) {
                for (String alternativeTitle : wikiPage.getAlternativeTitles()) {
                    // NOTE(review): NO_CATEGORY_DUMMY is presumably the empty string, i.e. this strips a trailing comma -- confirm.
                    dictionary.put(alternativeTitle.replaceAll(",$", Instance.NO_CATEGORY_DUMMY), pageType);
                }
                dictionary.put(wikiPage.getTitle(), pageType);
                dictionary.put(wikiPage.getCleanTitle(), pageType);
            }
        }

        // Collected separately because the dictionary must not be modified while iterating over it.
        Map<String, String> variants = new HashMap<>();
        Collection<String> obsoleteKeys = new HashSet<>();
        for (Map.Entry<String, String> entry : dictionary.entrySet()) {
            String name = entry.getKey();
            String type = entry.getValue();
            if ("PER".equals(type)) {
                name = cleanPersonName(name);
                variants.put(name.substring(name.lastIndexOf(" ") + 1), type);
                String[] tokens = name.split("\\s");
                if (tokens.length == 3) {
                    variants.put(tokens[0] + " " + tokens[2], type);
                }
            }
            if ("ORG".equals(type)) {
                // The dot is escaped so that only a literal "Inc." is stripped (not e.g. " Inca").
                variants.put(name.replaceAll(",? Inc\\.", Instance.NO_CATEGORY_DUMMY), type);
            }
            if ("LOC".equals(type) && name.contains(", ")) {
                LOGGER.debug("Splitting LOC '{}'", name);
                String[] parts = name.split(", ");
                // split() drops trailing empty strings, so "Paris, " yields a single part only.
                for (int partIndex = 0; partIndex < Math.min(2, parts.length); partIndex++) {
                    variants.put(parts[partIndex], type);
                }
                obsoleteKeys.add(name);
            }
        }
        LOGGER.debug("Generated {} additional entries", variants.size());
        LOGGER.debug("Removing {} entries", obsoleteKeys.size());
        dictionary.putAll(variants);
        dictionary.keySet().removeAll(obsoleteKeys);
        dictionary.remove(Instance.NO_CATEGORY_DUMMY);

        String cleanText = wikiPage.getCleanText();
        List<Annotation> annotations = new DictionaryTagger(dictionary, true).getAnnotations(cleanText);
        int removedCount = CollectionHelper.remove(annotations, Predicates.not(AnnotationFilters.tag(IGNORE_TAG)));
        LOGGER.debug("Removed {} ignored/unknown types", removedCount);
        return NerHelper.tag(cleanText, annotations, TaggingFormat.XML);
    }

    /**
     * Normalizes a person name taken from an article title.
     *
     * <p>Two patterns are replaced by {@code Instance.NO_CATEGORY_DUMMY} (presumably the empty
     * string -- confirm): a whitespace followed by a parenthesized qualifier, and a "Jr."/"Sr."
     * suffix with an optional preceding comma. The result is trimmed.
     *
     * @param str the raw name
     * @return the cleaned name
     */
    static String cleanPersonName(String str) {
        String withoutQualifier = str.replaceAll("\\s\\([^)]*\\)", Instance.NO_CATEGORY_DUMMY);
        String withoutSuffix = withoutQualifier.replaceAll(",?\\s(Jr|Sr)\\.", Instance.NO_CATEGORY_DUMMY);
        return withoutSuffix.trim();
    }

    /**
     * Computes the fraction of whitespace-separated tokens that start with an uppercase letter,
     * relative to all tokens that start with an upper- or lowercase letter. Tokens starting with
     * any other character (digits, punctuation) are not counted at all.
     *
     * @param str the text to inspect; must not be {@code null}
     * @return a value in {@code [0, 1]}, or {@link Double#NaN} when no token starts with a cased
     *         letter (note that {@code NaN < x} is {@code false} for every threshold {@code x})
     */
    static double getUcTokenPercentage(String str) {
        int upperCount = 0;
        int lowerCount = 0;
        for (String token : str.split("\\s")) {
            if (token.isEmpty()) {
                continue;
            }
            char first = token.charAt(0);
            if (Character.isUpperCase(first)) {
                upperCount++;
            } else if (Character.isLowerCase(first)) {
                lowerCount++;
            }
        }
        int casedCount = upperCount + lowerCount;
        if (casedCount == 0) {
            // Avoids the integer division by zero; NaN is what a floating-point 0/0 would yield.
            return Double.NaN;
        }
        // The cast forces floating-point division; int division would only ever yield 0 or 1.
        return (double) upperCount / casedCount;
    }

    /**
     * Builds a dictionary from entity names to types for the given links.
     *
     * <p>For each link whose destination has not been handled yet, the linked article is
     * retrieved and its infobox type is mapped to a type. The destination, the link's own title
     * and the article's alternative titles (longer than one character) are all registered with
     * that type. Destinations which cannot be retrieved, and infobox types without a mapping,
     * are registered with {@code IGNORE_TAG}. Articles without an infobox type are left out.
     *
     * @param mediaWikiDescriptor not used by this method; articles are fetched via
     *        {@code retrieveArticle}, which uses the class-level descriptor
     * @param list the links to resolve
     * @return a mutable map from name to type, in insertion order
     */
    private static Map<String, String> resolveLinkedEntities(MediaWikiDescriptor mediaWikiDescriptor, List<WikiLink> list) {
        Map<String, String> entities = new LinkedHashMap<>();
        for (WikiLink link : list) {
            String destination = link.getDestination();
            // NOTE(review): the "file" prefix check is case-sensitive -- confirm destinations are lowercased upstream.
            if (entities.containsKey(destination) || destination.isEmpty() || destination.startsWith("file")) {
                continue;
            }
            String linkTitle = link.getTitle();
            boolean hasLinkTitle = linkTitle != null && !linkTitle.isEmpty();
            String displayedText = hasLinkTitle ? linkTitle : destination;
            // Mostly lowercase link texts are skipped before any article is fetched.
            if (getUcTokenPercentage(displayedText) < 0.5d) {
                LOGGER.debug("Skip '{}' because of UC token percentage", displayedText);
                continue;
            }
            try {
                WikiPage article = retrieveArticle(destination);
                if (article == null) {
                    LOGGER.debug("No article with name '{}'", destination);
                    entities.put(destination, IGNORE_TAG);
                    continue;
                }
                String infoboxType = article.getInfoboxType();
                if (infoboxType == null) {
                    continue;
                }
                String type = (String) CollectionHelper.coalesce(new String[]{InfoboxTypeMapper.getConLLType(infoboxType), IGNORE_TAG});
                entities.put(destination, type);
                if (hasLinkTitle) {
                    entities.put(linkTitle, type);
                }
                for (String alternativeTitle : article.getAlternativeTitles()) {
                    if (alternativeTitle.length() > 1) {
                        entities.put(alternativeTitle, type);
                    }
                }
            } catch (Exception e) {
                // Best effort: a failing link must not abort the whole page, but keep the cause in the log.
                LOGGER.debug("Error when accessing '{}'", destination, e);
                entities.put(destination, IGNORE_TAG);
            }
        }
        return entities;
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Returns the article with the given title, following redirects, using the article cache.
     *
     * <p>Every access to the cache happens while holding its monitor. The actual retrieval is
     * performed outside the lock, so two threads may fetch the same article concurrently; the
     * later result then simply overwrites the earlier one.
     *
     * @param str the article title
     * @return the article, or {@code null} if {@code retrieveArticleFollowRedirects} yields none
     */
    public static WikiPage retrieveArticle(String str) {
        WikiPage cached;
        int cacheSize;
        synchronized (ARTICLE_CACHE) {
            cached = (WikiPage) ARTICLE_CACHE.get(str);
            // Read under the same lock as all other cache accesses; only used for logging.
            cacheSize = ARTICLE_CACHE.size();
        }
        if (cached != null) {
            LOGGER.debug("Cache hit for {}, cache size {}", str, cacheSize);
            return cached;
        }
        WikiPage fetched = retrieveArticleFollowRedirects(str);
        if (fetched != null) {
            LOGGER.trace("Cache fail for {}", str);
            synchronized (ARTICLE_CACHE) {
                ARTICLE_CACHE.put(str, fetched);
            }
        }
        return fetched;
    }

    /**
     * Retrieves an article and follows redirects until a non-redirect page is reached.
     *
     * @param str the title to start from
     * @return the first retrieved page that is not a redirect; {@code null} if a retrieval yields
     *         {@code null} or if ten retrievals in a row all produced redirects
     */
    private static WikiPage retrieveArticleFollowRedirects(String str) {
        String currentTitle = str;
        int attempt = 0;
        while (attempt < 10) {
            WikiPage page = MediaWikiUtil.retrieveArticle(descriptor, currentTitle);
            boolean isRedirect = page != null && page.isRedirect();
            if (!isRedirect) {
                return page;
            }
            LOGGER.debug("Redirect {} -> {}", currentTitle, page.getRedirectTitle());
            currentTitle = page.getRedirectTitle();
            attempt++;
        }
        LOGGER.warn("Too many redirects for {}, giving up", str);
        return null;
    }

    /**
     * Combines all {@code .txt} files of a dataset directory into combined annotation files.
     *
     * @param file the dataset directory; must be an existing directory
     * @param file2 the directory the combined files are written to
     */
    public static void createCombinedAnnotationFiles(File file, File file2) {
        Validate.notNull(file, "datasetDirectory must not be null");
        Validate.isTrue(file.isDirectory(), "datasetDirectory is not a directory");
        Collection<File> textFiles = FileHelper.getFiles(file, Predicates.fileExtension(new String[]{".txt"}));
        createCombinedAnnotationFiles(textFiles, file2);
    }

    /**
     * Combines the given dataset files into {@code annotations-combined.xml} and converts that
     * file into the tab-separated column format {@code annotations-combined.txt}.
     *
     * <p>The first line of every dataset file is skipped. Of the remaining lines only those
     * accepted by {@code isUsableLine} are written, one per line, trimmed.
     *
     * @param collection the dataset files to combine
     * @param file an existing directory receiving the two combined files; existing ones are replaced
     * @throws IllegalStateException if reading, writing or closing a file fails
     */
    public static void createCombinedAnnotationFiles(Collection<File> collection, File file) {
        Validate.notNull(collection, "datasetFiles must not be null");
        Validate.notNull(file, "destinationPath must not be null");
        Validate.isTrue(file.isDirectory(), "destinationPath is not a directory");
        ProgressMonitor progressMonitor = new ProgressMonitor();
        progressMonitor.startTask((String) null, collection.size());
        File combinedXmlFile = new File(file, "annotations-combined.xml");
        combinedXmlFile.delete();
        try {
            // try-with-resources closes the writer exactly once and surfaces flush/close failures.
            try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(combinedXmlFile), "UTF-8"))) {
                for (File datasetFile : collection) {
                    progressMonitor.increment();
                    boolean firstLine = true;
                    for (String line : FileHelper.readFileToArray(datasetFile)) {
                        if (firstLine) {
                            firstLine = false;
                            continue;
                        }
                        String trimmed = line.trim();
                        if (isUsableLine(trimmed)) {
                            writer.write(trimmed);
                            writer.write('\n');
                        }
                    }
                }
            }
            // The XML file is complete and closed at this point, so it can be converted.
            File combinedColumnFile = new File(file, "annotations-combined.txt");
            combinedColumnFile.delete();
            FileFormatParser.xmlToColumn(combinedXmlFile.getPath(), combinedColumnFile.getPath(), "\t");
        } catch (IOException e) {
            throw new IllegalStateException("IOException", e);
        }
    }

    /**
     * Decides whether a trimmed line is written to the combined file: it must be non-empty, must
     * not start with "*", must end with ".", must contain "&lt;", and must start with an uppercase
     * character or with "&lt;".
     */
    private static boolean isUsableLine(String trimmedLine) {
        if (trimmedLine.isEmpty() || trimmedLine.startsWith("*")) {
            return false;
        }
        if (!trimmedLine.endsWith(".") || !trimmedLine.contains("<")) {
            return false;
        }
        char first = trimmedLine.charAt(0);
        return Character.isUpperCase(first) || first == '<';
    }

    /**
     * Creates combined annotation files for samples of increasing size.
     *
     * <p>Starting with 100 files and doubling each round, a sample of the dataset's {@code .txt}
     * files is drawn and combined into the sub-directory {@code sample-<size>} of the destination,
     * as long as the sample size does not exceed the number of available files.
     *
     * @param file the dataset directory
     * @param file2 the directory in which the sample directories are created
     */
    public static void createCombinedAnnotationFilesEvaluation(File file, File file2) {
        Validate.notNull(file, "datasetDirectory must not be null");
        Validate.notNull(file2, "destinationPath must not be null");
        List<File> datasetFiles = FileHelper.getFiles(file, Predicates.fileExtension(new String[]{".txt"}));
        int availableCount = datasetFiles.size();
        for (int sampleSize = 100; sampleSize <= availableCount; sampleSize *= 2) {
            Collection<File> sample = MathHelper.sample(datasetFiles, sampleSize);
            File sampleDirectory = new File(file2, "sample-" + sampleSize);
            sampleDirectory.mkdirs();
            LOGGER.info("Creating sampling for size {}", Integer.valueOf(sample.size()));
            createCombinedAnnotationFiles(sample, sampleDirectory);
        }
    }

    /* JADX WARN: Type inference failed for: r0v15, types: [ws.palladian.extraction.entity.dataset.WikipediaDatasetCreator$1] */
    /**
     * Starts worker threads which endlessly fetch random Wikipedia articles, annotate them via
     * {@code process} and write the result to a {@code .txt} file below the destination path.
     *
     * <p>Files are placed in a sub-directory named after the mapped type of the article's infobox
     * (or {@code NO_MAPPED_TYPE}). The file name is the article title with whitespace, ';' and '/'
     * replaced by underscores. The workers never terminate on their own; failures for a single
     * article are logged at debug level and the worker moves on to the next article.
     *
     * @param i the number of worker threads to start; must be greater than zero
     * @param file the destination directory; created if it does not exist
     * @throws IllegalArgumentException if an argument is invalid, the destination cannot be
     *         created, or less than 750 MB of memory are reported as free
     */
    public static void mineWikipedia(int i, final File file) {
        Validate.isTrue(i > 0, "numThreads must be greater zero");
        Validate.notNull(file, "destinationPath must not be null");
        if (!file.isDirectory()) {
            Validate.isTrue(file.mkdirs(), "destinationPath did not exist and could not be created");
        }
        Validate.isTrue(ProcessHelper.getFreeMemory() > SizeUnit.MEGABYTES.toBytes(750L), "assign at least 1 GB heap memory (necessary for caching)");
        final AtomicInteger processedCounter = new AtomicInteger();
        final long startMillis = System.currentTimeMillis();
        for (int threadIndex = 0; threadIndex < i; threadIndex++) {
            new Thread() {
                @Override
                public void run() {
                    while (true) {
                        try {
                            WikiPage randomPage = MediaWikiUtil.retrieveRandomArticle(WikipediaDatasetCreator.descriptor);
                            if (randomPage == null) {
                                WikipediaDatasetCreator.LOGGER.debug("No random article could be retrieved");
                                continue;
                            }
                            // Re-fetch through the cache so that redirects are followed.
                            WikiPage page = WikipediaDatasetCreator.retrieveArticle(randomPage.getTitle());
                            if (page == null) {
                                WikipediaDatasetCreator.LOGGER.debug("No article with name '{}'", randomPage.getTitle());
                                continue;
                            }
                            String taggedText = WikipediaDatasetCreator.process(page);
                            if (taggedText == null) {
                                continue;
                            }
                            String typeName = (String) CollectionHelper.coalesce(new String[]{InfoboxTypeMapper.getConLLType(page.getInfoboxType()), WikipediaDatasetCreator.NO_MAPPED_TYPE});
                            String fileName = page.getTitle().replaceAll("\\s", "_").replace(';', '_').replace('/', '_').replaceAll("_+", "_") + ".txt";
                            File targetFile = new File(new File(file, typeName), fileName);
                            FileHelper.writeToFile(targetFile.getPath(), taggedText);
                            int processedCount = processedCounter.incrementAndGet();
                            if (processedCount % 10 == 0) {
                                // At least one millisecond, so the division cannot fail.
                                long elapsedMillis = Math.max(1L, System.currentTimeMillis() - startMillis);
                                long articlesPerHour = (TimeUnit.HOURS.toMillis(1L) * processedCount) / elapsedMillis;
                                WikipediaDatasetCreator.LOGGER.info("Processed {} articles, throughput: ~ {} articles/hour", processedCount, new DecimalFormat("##").format(articlesPerHour));
                            }
                        } catch (Exception e) {
                            // Best effort: keep mining, but do not lose the cause of the failure.
                            WikipediaDatasetCreator.LOGGER.debug("Exception in {}", getName(), e);
                        }
                    }
                }
            }.start();
        }
    }

    /**
     * Combines the files of a mined dataset into combined annotation files.
     *
     * @param strArr optional arguments: {@code [0]} the dataset directory, {@code [1]} the
     *        destination directory; the previously hard-coded paths are used for missing arguments
     */
    public static void main(String[] strArr) {
        int argumentCount = strArr == null ? 0 : strArr.length;
        String datasetDirectory = argumentCount > 0 ? strArr[0] : "/Users/pk/temp/Wikipedia-EN-entity-dataset-1412853485035";
        String destinationPath = argumentCount > 1 ? strArr[1] : "/Users/pk/temp";
        createCombinedAnnotationFiles(new File(datasetDirectory), new File(destinationPath));
    }
}
