package ws.palladian.extraction.entity.dataset;

import java.io.Closeable;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import ws.palladian.core.Annotation;
import ws.palladian.core.Instance;
import ws.palladian.extraction.content.ReadabilityContentExtractor;
import ws.palladian.extraction.entity.FileFormatParser;
import ws.palladian.helper.ProgressMonitor;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.collection.Bag;
import ws.palladian.helper.constants.Language;
import ws.palladian.helper.constants.SizeUnit;
import ws.palladian.helper.date.DateHelper;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.DocumentRetriever;
import ws.palladian.retrieval.DownloadFilter;
import ws.palladian.retrieval.HttpRetriever;
import ws.palladian.retrieval.resources.WebContent;
import ws.palladian.retrieval.search.Searcher;
import ws.palladian.retrieval.search.SearcherException;
import ws.palladian.semantics.WordTransformer;

/* loaded from: input_file:ws/palladian/extraction/entity/dataset/DatasetCreator.class */
public class DatasetCreator {
    /** Logger shared by all instances of this class. */
    private static final Logger LOGGER = LoggerFactory.getLogger(DatasetCreator.class);
    /** Number of URLs requested from the searcher per seed; seeds found less often are reported as having few mentions. */
    private final int mentionsPerSeed;
    /** Number of seeds that are sampled from each concept's seed file. */
    private final int seedsPerConcept;
    /** Searcher used to find web pages mentioning a seed. */
    private final Searcher<WebContent> searcher;
    /** Seeds used per concept (keyed by seed file name); filled while creating the dataset or lazily from the stored seeds.txt files. */
    private Map<String, List<String>> conceptSeeds;
    /** Download filter handed to the document retriever; configured to exclude binary file extensions. */
    private final DownloadFilter downloadFilter;
    /** Directory below which all dataset files are written. */
    private final File datasetLocation;
    /** Whether the lower-cased concept name is appended to the search query for a seed. */
    private final boolean queryWithConceptName;

    public DatasetCreator(File file, Searcher<WebContent> searcher, int i, int i2, boolean z) {
        Validate.notNull(file, "datasetLocation must not be null", new Object[0]);
        if (!file.exists() && !file.mkdirs()) {
            throw new IllegalStateException("Could not create directory " + file);
        }
        Validate.isTrue(file.isDirectory(), "datasetLocation must point to a directory", new Object[0]);
        Validate.notNull(searcher, "searcher must not be null", new Object[0]);
        Validate.isTrue(i > 0, "seedsPerConcept must be greater zero", new Object[0]);
        Validate.isTrue(i2 > 0, "mentionsPerSeed must be greater zero", new Object[0]);
        this.datasetLocation = file;
        this.downloadFilter = new DownloadFilter();
        this.downloadFilter.setExcludeFileTypes(FileHelper.BINARY_FILE_EXTENSIONS);
        this.searcher = searcher;
        this.seedsPerConcept = i;
        this.mentionsPerSeed = i2;
        this.queryWithConceptName = z;
    }

    /**
     * Creates the dataset for every seed file found in the given directory and writes the meta information file
     * once all concepts have been processed. Seed files whose name (as returned by
     * {@code FileHelper.getFileName}) has at most one character are skipped.
     *
     * @param file the directory containing the seed files, one per concept
     */
    private final void createDataset(File file) {
        StopWatch timer = new StopWatch();
        this.conceptSeeds = new HashMap<String, List<String>>();
        File[] seedFiles = FileHelper.getFiles(file.getPath());
        Set<String> searchedConcepts = new HashSet<String>();
        for (int index = 0; index < seedFiles.length; index++) {
            File seedFile = seedFiles[index];
            String seedFileName = FileHelper.getFileName(seedFile.getName());
            if (seedFileName.length() <= 1) {
                continue;
            }
            createDatasetForConcept(seedFileName, seedFile);
            searchedConcepts.add(getConceptNameFromFileName(seedFileName));
        }
        writeMetaInformationFile(timer, searchedConcepts);
        Object[] logArguments = {seedFiles.length, timer, HttpRetriever.getTraffic(SizeUnit.MEGABYTES)};
        LOGGER.info("created {} datasets in {}, total traffic: {} MB", logArguments);
    }

    /**
     * Writes {@code metaInformation.txt} into the dataset location. The file lists the creation start date, the
     * elapsed time, the generated traffic, the searcher name, the targeted mentions per entity, the per-concept
     * mention statistics and the names of all searched concepts.
     *
     * @param stopWatch the stop watch started when the dataset creation began
     * @param set the names of the concepts that were searched
     */
    private void writeMetaInformationFile(StopWatch stopWatch, Set<String> set) {
        StringBuilder meta = new StringBuilder();

        meta.append("Start Date of Creation: ");
        meta.append(DateHelper.getDatetime("yyyy-MM-dd_HH-mm-ss", stopWatch.getStartTime()));
        meta.append("\n");

        meta.append("Dataset created in: ");
        meta.append(stopWatch.getElapsedTimeString());
        meta.append("\n");

        meta.append("Total Generated Traffic: ");
        meta.append(HttpRetriever.getTraffic(SizeUnit.MEGABYTES));
        meta.append("MB\n");

        meta.append("Search Engine used: ");
        meta.append(this.searcher.getName());
        meta.append("\n");

        meta.append("Minimum Mentions per Entity Targeted: ");
        meta.append(this.mentionsPerSeed);
        meta.append("\n");

        // Each row holds: [0] concept name, [1] entities with few mentions, [2] average mentions per entity.
        for (Object[] row : getConceptsMentions()) {
            String concept = (String) row[0];
            String fewMentions = (String) row[1];
            Double averageMentions = (Double) row[2];
            String shownFewMentions = fewMentions.length() == 0 ? "-" : fewMentions;
            meta.append("  Concept: ");
            meta.append(concept);
            meta.append("\n  Entities with few mentions: ");
            meta.append(shownFewMentions);
            meta.append("\n  Average Mentions per Entity: ");
            meta.append(averageMentions);
            meta.append("\n\n");
        }

        meta.append("Concepts Searched (");
        meta.append(set.size());
        meta.append("):\n");
        for (String searchedConcept : set) {
            meta.append("    ");
            meta.append(searchedConcept);
            meta.append("\n");
        }

        File metaFile = new File(this.datasetLocation, "metaInformation.txt");
        FileHelper.writeToFile(metaFile.getPath(), meta);
    }

    /**
     * Counts, per concept, how often each seed occurs as a tagged mention in the concept's dataset files.
     * <p>
     * When no seeds are known yet, they are first restored from the {@code seeds/seeds.txt} file of every entry in
     * the dataset location (the part of each line before {@code ###} is the seed).
     *
     * @return one row per concept: {@code [0]} the concept name (String), {@code [1]} a comma separated list of
     *         seeds with fewer mentions than targeted (String), {@code [2]} the average number of mentions
     *         (Double, {@code 0.0} when nothing was counted)
     */
    private Set<Object[]> getConceptsMentions() {
        Set<Object[]> conceptMentions = new HashSet<Object[]>();

        if (this.conceptSeeds == null) {
            this.conceptSeeds = new HashMap<String, List<String>>();
            for (File datasetEntry : FileHelper.getFiles(this.datasetLocation.getPath())) {
                String entryName = FileHelper.getFileName(datasetEntry.getName());
                List<String> seedLines = FileHelper.readFileToArray(new File(this.datasetLocation, entryName + "/seeds/seeds.txt"));
                if (seedLines.isEmpty()) {
                    continue;
                }
                List<String> storedSeeds = new ArrayList<String>(seedLines.size());
                for (String seedLine : seedLines) {
                    storedSeeds.add(seedLine.split("###")[0]);
                }
                this.conceptSeeds.put(entryName, storedSeeds);
            }
        }

        for (Map.Entry<String, List<String>> entry : this.conceptSeeds.entrySet()) {
            String concept = entry.getKey();
            List<String> seeds = entry.getValue();

            // Compile one pattern per seed up front. The seed is quoted so that regex meta characters in an entity
            // name (e.g. "C++" or "Yahoo!") are matched literally instead of breaking or altering the pattern.
            List<Pattern> seedPatterns = new ArrayList<Pattern>(seeds.size());
            for (String seed : seeds) {
                seedPatterns.add(Pattern.compile("<.*?>\\s?" + Pattern.quote(seed) + "\\s?</.*?>", Pattern.MULTILINE));
            }

            Bag<String> mentionCounts = Bag.create();
            for (File datasetFile : FileHelper.getFiles(new File(this.datasetLocation, concept).getPath())) {
                if (datasetFile.isDirectory()) {
                    continue;
                }
                // Read each file once instead of once per seed.
                String content = FileHelper.tryReadFileToString(datasetFile);
                if (content == null) {
                    // NOTE(review): presumably tryReadFileToString yields null when reading fails - confirm.
                    continue;
                }
                for (int index = 0; index < seeds.size(); index++) {
                    Matcher mentionMatcher = seedPatterns.get(index).matcher(content);
                    while (mentionMatcher.find()) {
                        mentionCounts.add(seeds.get(index));
                    }
                }
            }

            // NOTE(review): the statistics below rely on Bag's iteration and size() semantics (distinct items vs.
            // total count), which are not visible here - confirm against the Bag implementation.
            StringBuilder fewMentions = new StringBuilder(Instance.NO_CATEGORY_DUMMY);
            int totalMentions = 0;
            for (String seed : mentionCounts) {
                int count = mentionCounts.count(seed);
                if (count < this.mentionsPerSeed) {
                    fewMentions.append(seed).append("(").append(count).append("), ");
                }
                totalMentions += count;
            }

            // Floating point division, guarded against an empty bag (no mention found at all).
            int bagSize = mentionCounts.size();
            double averageMentions = bagSize == 0 ? 0.0 : (double) totalMentions / bagSize;

            Object[] row = new Object[3];
            row[0] = concept;
            row[1] = fewMentions.toString();
            row[2] = Double.valueOf(averageMentions);
            conceptMentions.add(row);
        }
        return conceptMentions;
    }

    /**
     * Derives the concept name from a seed file name: every {@code _part} followed by a single digit is replaced
     * with {@code Instance.NO_CATEGORY_DUMMY} and the result is passed to the English singular transformation.
     *
     * @param str the seed file name
     * @return the concept name
     */
    private static String getConceptNameFromFileName(String str) {
        String withoutPartSuffix = str.replaceAll("_part(\\d)", Instance.NO_CATEGORY_DUMMY);
        String singular = WordTransformer.wordToSingular(withoutPartSuffix, Language.ENGLISH);
        return singular;
    }

    /**
     * Creates the dataset for a single concept: samples seeds from the seed file, searches and downloads web pages
     * for every seed, marks up the seeds in the downloaded pages and finally stores the used seeds in
     * {@code <concept>/seeds/seeds.txt}.
     * <p>
     * An empty seed file is skipped with a warning instead of failing the whole dataset creation.
     *
     * @param str the seed file name, used as the concept's directory name
     * @param file the seed file; its first line is treated as a header and ignored
     */
    private void createDatasetForConcept(String str, File file) {
        LOGGER.info("Creating dataset for {}", str);

        List<String> seedLines = FileHelper.readFileToArray(file);
        if (seedLines.isEmpty()) {
            // Removing the header line below would throw on an empty file.
            LOGGER.warn("Seed file {} is empty, skipping concept {}", file, str);
            return;
        }
        // The first line is the "Seeds for ..." header written by generateDataset, not a seed.
        seedLines.remove(0);

        DocumentRetriever documentRetriever = new DocumentRetriever();
        documentRetriever.setDownloadFilter(this.downloadFilter);

        List<String> seeds = new ArrayList<String>(MathHelper.sample(seedLines, this.seedsPerConcept));
        // The tag is the same for every seed of the concept, so compute it once.
        String conceptTag = getConceptNameFromFileName(str).toUpperCase();
        StringBuilder seedFileContent = new StringBuilder();
        ProgressMonitor progressMonitor = new ProgressMonitor();
        progressMonitor.startTask((String) null, seeds.size());

        int seedNumber = 0;
        for (String seed : seeds) {
            StopWatch seedTimer = new StopWatch();
            progressMonitor.increment();
            LOGGER.info("start processing seed entity {} ({})", seed, str);
            seedFileContent.append(seed).append("###").append(conceptTag).append("\n");

            List<String> urls = getWebPages(seed, str);
            Set<Document> documents = documentRetriever.getWebDocuments(urls);
            // Three placeholders for three arguments; previously the concept name was silently dropped.
            LOGGER.info("downloaded {} URLs for {} ({})", new Object[]{urls.size(), seed, str});

            seedNumber++;
            int documentNumber = 0;
            for (Document document : documents) {
                if (document == null) {
                    continue;
                }
                markupWebPage(document, str, seeds);
                documentNumber++;
                LOGGER.debug("marked up page {} {}/{}, {}/{}", new Object[]{document.getDocumentURI(), seedNumber, seeds.size(), documentNumber, urls.size()});
            }
            LOGGER.info("processed seed entity: {} ({}) in {}", new Object[]{seed, str, seedTimer});
        }

        this.conceptSeeds.put(str, seeds);
        FileHelper.writeToFile(new File(this.datasetLocation, str + "/seeds/seeds.txt").getPath(), seedFileContent);
        LOGGER.info("created dataset for concept {} with {} seeds", str, Integer.valueOf(seeds.size()));
    }

    /**
     * Searches for web pages mentioning the given seed. The seed is put in double quotes; if configured, the
     * lower-cased concept name is appended to the query.
     *
     * @param str the seed entity to search for
     * @param str2 the concept name
     * @return the URLs found by the searcher, or an empty list if the searcher failed
     */
    private List<String> getWebPages(String str, String str2) {
        LOGGER.info("get web pages for seed '{}' with {}", str, this.searcher);

        StringBuilder queryBuilder = new StringBuilder();
        queryBuilder.append("\"").append(str).append("\"");
        if (this.queryWithConceptName) {
            queryBuilder.append(" ").append(str2.toLowerCase());
        }
        String query = queryBuilder.toString();

        List<String> urls;
        try {
            urls = this.searcher.searchUrls(query, this.mentionsPerSeed, Language.ENGLISH);
        } catch (SearcherException e) {
            LOGGER.error("Searcher exception while searching for {}", query, e);
            urls = Collections.emptyList();
        }
        return urls;
    }

    /**
     * Marks up all seed mentions in a downloaded web page and stores the result twice: the highlighted HTML in
     * {@code <concept>/html/} and the tagged plain text as {@code .xml} file in {@code <concept>/}.
     * <p>
     * Pages whose extracted main content is shorter than 100 characters, and pages containing none of the seeds,
     * are not stored. For the concept "person" the last token of a seed (the part after the last space) is marked
     * up as well.
     *
     * @param document the downloaded web page
     * @param str the seed file name, used as the concept's directory name
     * @param list the seeds to mark up
     */
    private void markupWebPage(Document document, String str, List<String> list) {
        LOGGER.debug("mark up web page: {} ({})", document.getDocumentURI(), str);
        String conceptName = getConceptNameFromFileName(str);
        String conceptTag = conceptName.toUpperCase();
        boolean personConcept = "person".equalsIgnoreCase(conceptName);
        try {
            String markedUpHtml = HtmlHelper.xmlToString(document, false);
            String mainContent = new ReadabilityContentExtractor().setDocument(document).getResultText();
            if (mainContent.length() < 100) {
                return;
            }
            String markedUpText = HtmlHelper.stripHtmlTags(mainContent);
            boolean seedFound = false;
            for (String seed : list) {
                try {
                    if (markedUpText.contains(seed)) {
                        seedFound = true;

                        String seedRegex = Pattern.quote(seed);
                        if (personConcept) {
                            String lastToken = seed.substring(seed.lastIndexOf(' ') + 1);
                            // An empty alternative (seed ending in a space) would match at every position, and
                            // an alternative equal to the whole seed adds nothing.
                            if (lastToken.length() > 0 && !lastToken.equals(seed)) {
                                seedRegex = "(" + seedRegex + "|" + Pattern.quote(lastToken) + ")";
                            }
                        }
                        Pattern mentionPattern = Pattern.compile("(?<=\\s)" + seedRegex + "(?![0-9A-Za-z])|(?<![0-9A-Za-z])" + seedRegex + "(?=(\\s|[.,!?]))");

                        // quoteReplacement keeps '$' and '\' in a seed from being read as group references.
                        String htmlReplacement = Matcher.quoteReplacement("<" + conceptTag + " style=\"background-color:red; color:white;\">" + seed + "</" + conceptTag + ">");
                        String textReplacement = Matcher.quoteReplacement("<" + conceptTag + ">" + seed + "</" + conceptTag + ">");

                        markedUpHtml = mentionPattern.matcher(markedUpHtml).replaceAll(htmlReplacement);
                        markedUpText = mentionPattern.matcher(markedUpText).replaceAll(textReplacement);
                    }
                } catch (Exception e) {
                    LOGGER.error("something went wrong marking up the page content with seed {}, {}", seed, e.getMessage());
                }
                LOGGER.debug("marked up page {} with entity {}", document.getDocumentURI(), seed);
            }

            if (!seedFound) {
                return;
            }
            if (markedUpHtml.length() > 100) {
                FileHelper.writeToFile(new File(this.datasetLocation, str + "/html/" + StringHelper.makeSafeName(UrlHelper.getCleanUrl(document.getDocumentURI()), 30) + ".html").getPath(), markedUpHtml);
                LOGGER.debug("saved html file");
            }
            if (markedUpText.length() > 50) {
                File textFile = new File(this.datasetLocation, str + "/" + StringHelper.makeSafeName(document.getDocumentURI(), 30) + ".xml");
                FileHelper.writeToFile(textFile.getPath(), markedUpText);
                FileHelper.removeDuplicateLines(textFile.getPath(), textFile.getPath());
                LOGGER.debug("saved text file");
            }
        } catch (Error e) {
            // Errors are caught as well, presumably so that a single problematic page cannot abort the whole run.
            LOGGER.error("could not extract clean content from {}: {}", document.getDocumentURI(), e.getMessage());
        } catch (Exception e) {
            LOGGER.error("could not extract clean content from {}: {}", document.getDocumentURI(), e.getMessage());
        }
    }

    /**
     * Cleanses the text files of every concept in the dataset and stores each cleansed text as {@code .xml} file.
     * Texts with at most 10 characters after cleansing are not written.
     * <p>
     * When {@code z} is set, the cleansed files and a copy of {@code metaInformation.txt} go to a sibling
     * directory named like the dataset location plus {@code _cleansed}; otherwise the files are written into the
     * dataset location itself. (Previously the cleansed files always ended up in the dataset location and only the
     * meta information was copied.)
     *
     * @param file the dataset location containing one directory per concept
     * @param file2 the directory containing the seed files; their names determine the concept directories
     * @param z whether to write the cleansed dataset to a separate {@code _cleansed} directory
     */
    private static void cleanDataset(File file, File file2, boolean z) {
        StopWatch stopWatch = new StopWatch();
        File targetLocation = z ? new File(file.getPath() + "_cleansed") : file;

        for (File seedFile : FileHelper.getFiles(file2.getPath())) {
            String seedFileName = FileHelper.getFileName(seedFile.getName());
            if (seedFileName.length() <= 1) {
                continue;
            }
            String conceptName = getConceptNameFromFileName(seedFileName);
            File sourceDirectory = new File(file, seedFileName);
            // Equals sourceDirectory unless a separate target location was requested.
            // NOTE(review): assumes FileHelper.writeToFile creates missing parent directories, as the writes to
            // the "html" and "seeds" sub directories elsewhere in this class suggest - confirm.
            File targetDirectory = new File(targetLocation, seedFileName);

            for (File textFile : FileHelper.getFiles(sourceDirectory.getPath())) {
                if (textFile.isDirectory()) {
                    continue;
                }
                String cleansedText = cleanText(FileHelper.tryReadFileToString(textFile), conceptName);
                if (cleansedText.length() > 10) {
                    File cleansedFile = new File(targetDirectory, FileHelper.getFileName(textFile.getPath()) + ".xml");
                    FileHelper.writeToFile(cleansedFile.getPath(), cleansedText);
                    FileHelper.removeDuplicateLines(cleansedFile.getPath(), cleansedFile.getPath());
                    LOGGER.debug("saved cleansed text file");
                }
            }
        }

        if (z) {
            FileHelper.copyFile(new File(file, "/metaInformation.txt").getPath(), new File(targetLocation, "/metaInformation.txt").getPath());
        }
        LOGGER.info("dataset cleansed in {}", stopWatch);
    }

    /**
     * Cleanses a marked up text in three steps: runs of four or more consecutive lines with at most 80 characters
     * each are collapsed into a single line break, lines consisting only of one tagged entity of the given concept
     * are replaced with {@code Instance.NO_CATEGORY_DUMMY}, and three or more consecutive line breaks are reduced
     * to one.
     *
     * @param str the text to cleanse
     * @param str2 the concept name; its upper-cased form is the tag name
     * @return the cleansed text, or {@code Instance.NO_CATEGORY_DUMMY} if any exception occurred
     */
    private static String cleanText(String str, String str2) {
        try {
            Pattern tagOnlyLine = Pattern.compile("^<" + str2.toUpperCase() + ">.*?</" + str2.toUpperCase() + ">$", Pattern.MULTILINE);
            String withoutShortLineRuns = str.replaceAll("(\n)+(.{0,80}(\n)){4,}", "\n");
            String withoutTagOnlyLines = tagOnlyLine.matcher(withoutShortLineRuns).replaceAll(Instance.NO_CATEGORY_DUMMY);
            return withoutTagOnlyLines.replaceAll("(\n){3,}", "\n");
        } catch (Exception e) {
            LOGGER.error("Encountered {}", e.toString());
            return Instance.NO_CATEGORY_DUMMY;
        }
    }

    /**
     * Combines the dataset files. For every seed file a {@code <concept>/combined/all<Concept>.xml} file is written
     * that concatenates all files of the concept having at least 5 characters, each preceded by a
     * "NEW DOCUMENT" separator. Afterwards all combined files are concatenated into {@code all.xml} in the dataset
     * location, each preceded by a "NEW CONCEPT" separator.
     * <p>
     * I/O errors are logged and do not abort the processing; every writer is closed exactly once in a
     * {@code finally} block.
     *
     * @param file the directory containing the seed files; their names determine the concept directories
     * @param file2 the dataset location
     */
    private static void postProcessDataset(File file, File file2) {
        StopWatch stopWatch = new StopWatch();
        List<File> combinedFiles = new ArrayList<File>();

        for (File seedFile : FileHelper.getFiles(file.getPath())) {
            String seedFileName = FileHelper.getFileName(seedFile.getName());
            String conceptName = StringHelper.makeCamelCase(WordTransformer.wordToSingular(seedFileName, Language.ENGLISH), true);
            if (seedFileName.length() == 0) {
                continue;
            }
            File combinedDirectory = new File(file2, seedFileName + "/combined");
            // If the directory cannot be created, opening the writer below fails and the error is logged there.
            combinedDirectory.mkdirs();
            File combinedFile = new File(combinedDirectory, "all" + conceptName + ".xml");
            combinedFiles.add(combinedFile);

            FileWriter conceptWriter = null;
            try {
                conceptWriter = new FileWriter(combinedFile);
                int documentNumber = 0;
                for (File documentFile : FileHelper.getFiles(new File(file2, seedFileName).getPath())) {
                    if (documentFile.isDirectory()) {
                        continue;
                    }
                    String content = FileHelper.readFileToString(documentFile);
                    if (content.length() < 5) {
                        continue;
                    }
                    documentNumber++;
                    conceptWriter.write("\n\n----------------------------------------------- NEW DOCUMENT (#" + documentNumber + " / " + conceptName + ") -----------------------------------------------\n\n" + content);
                    conceptWriter.write("\n");
                }
            } catch (IOException e) {
                LOGGER.error("Error while writing to {}: {}", combinedFile, e);
            } finally {
                FileHelper.close(new Closeable[]{conceptWriter});
            }
        }

        File allFile = new File(file2, "all.xml");
        FileWriter allWriter = null;
        try {
            allWriter = new FileWriter(allFile);
            for (File combinedFile : combinedFiles) {
                allWriter.write("\n\n----------------------------------------------- NEW CONCEPT -----------------------------------------------" + FileHelper.readFileToString(combinedFile));
                allWriter.flush();
            }
            // Close explicitly so that a failure while closing is reported; the close in finally is then a no-op.
            allWriter.close();
        } catch (IOException e) {
            LOGGER.error("Error while writing to {}: {}", allFile, e);
        } finally {
            FileHelper.close(new Closeable[]{allWriter});
        }
        LOGGER.info("post processed dataset in {}", stopWatch);
    }

    /**
     * Generates the dataset from a seed file. The seed annotations are read from the file, limited by the
     * configured number of seeds per concept, and passed on to {@link #generateDataset(Collection)}.
     *
     * @param file the seed file; must not be null and must denote an existing file
     */
    public void generateDataset(File file) {
        Validate.notNull(file, "seedFile must not be null");
        Validate.isTrue(file.isFile(), "seedFile must point to a file");
        Collection<? extends Annotation> seedAnnotations = FileFormatParser.getSeedAnnotations(file.getPath(), this.seedsPerConcept);
        generateDataset(seedAnnotations);
    }

    /**
     * Generates the dataset from the given seed annotations. The annotations are grouped by tag and written as
     * seed files (one per tag, with a "Seeds for ..." header line) into the {@code seedEntities} directory; then
     * the dataset is created, cleansed and post processed. Finally {@code allCleansed.xml} and the column
     * formatted {@code allColumn.txt} are derived from {@code all.xml}.
     *
     * @param collection the seed annotations; the tag is the concept, the value is the seed entity; must not be null
     */
    public void generateDataset(Collection<? extends Annotation> collection) {
        Validate.notNull(collection, "annotations must not be null");
        StopWatch stopWatch = new StopWatch();
        LOGGER.info("start generating dataset with {} seeds per concept and at least {} mentions per seed", Integer.valueOf(this.seedsPerConcept), Integer.valueOf(this.mentionsPerSeed));

        // Group the seed values by tag; every group starts with a header line.
        Map<String, StringBuilder> seedFileContents = new HashMap<String, StringBuilder>();
        for (Annotation annotation : collection) {
            StringBuilder content = seedFileContents.get(annotation.getTag());
            if (content == null) {
                content = new StringBuilder();
                content.append("Seeds for ").append(annotation.getTag()).append("\n");
                seedFileContents.put(annotation.getTag(), content);
            }
            content.append(annotation.getValue()).append("\n");
        }

        File seedDirectory = new File(this.datasetLocation, "seedEntities");
        seedDirectory.mkdir();
        for (Map.Entry<String, StringBuilder> entry : seedFileContents.entrySet()) {
            File seedFile = new File(seedDirectory, entry.getKey() + ".txt");
            FileHelper.writeToFile(seedFile.getPath(), (CharSequence) entry.getValue());
        }

        createDataset(seedDirectory);
        cleanDataset(this.datasetLocation, seedDirectory, false);
        postProcessDataset(seedDirectory, this.datasetLocation);

        // Turn all.xml into the cleansed format; the order of the steps below matters.
        Pattern repeatedDocStart = Pattern.compile("(=-DOCSTART-\n?){1,}");
        Pattern lineWithoutTag = Pattern.compile("^((?!<[A-Z]{1,20}?>).)*$", Pattern.MULTILINE);
        String text = FileHelper.tryReadFileToString(new File(this.datasetLocation, "all.xml"));
        // Drop the concept separators and turn the document separators into a temporary marker which, looking
        // like a tag, survives the removal of untagged lines.
        text = text.replaceAll("-+ NEW CONCEPT.*", Instance.NO_CATEGORY_DUMMY);
        text = text.replaceAll("-+ NEW DOCUMENT .#.*", "=-<DOCSTART>-");
        // Replace every line that contains no opening upper-case tag.
        text = lineWithoutTag.matcher(text).replaceAll(Instance.NO_CATEGORY_DUMMY);
        text = text.replace("=-<DOCSTART>-", "=-DOCSTART-");
        text = text.replaceAll("(\n){3,}", "\n");
        // Collapse consecutive document markers into a single one.
        text = repeatedDocStart.matcher(text).replaceAll("\n=-DOCSTART-\n\n");
        text = text.replaceAll("(\n){3,}", "\n");

        String cleansedPath = new File(this.datasetLocation, "allCleansed.xml").getPath();
        FileHelper.writeToFile(cleansedPath, text);

        String columnPath = new File(this.datasetLocation, "allColumn.txt").getPath();
        FileFormatParser.xmlToColumn(cleansedPath, columnPath, "\t");
        // The column conversion splits the document marker into three tokens; join them again.
        String columnText = FileHelper.tryReadFileToString(columnPath);
        FileHelper.writeToFile(columnPath, columnText.replaceAll("=-\tO\nDOCSTART\tO\n-\tO", "=-DOCSTART-\tO"));
        LOGGER.info("generated dataset in {}", stopWatch);
    }

    /**
     * Generates one dataset for every seed count from {@code i} to {@code i2} (both inclusive). Each dataset is
     * written to a sub directory named after its seed count, is created with the concept name added to the search
     * queries, and its {@code allColumn.txt} is copied to {@code seedsTest<count>.txt} in the dataset location.
     *
     * @param file the dataset location; created if missing; must not be null and must denote a directory
     * @param searcher the searcher used to look up web pages for the seeds; must not be null
     * @param file2 the seed file
     * @param i the minimum number of seeds per concept; must be greater than zero
     * @param i2 the maximum number of seeds per concept; must be greater than {@code i}
     * @param i3 the number of mentions per seed; must be greater than zero
     * @throws IllegalStateException if the dataset location does not exist and cannot be created
     */
    public static void generateDatasets(File file, Searcher<WebContent> searcher, File file2, int i, int i2, int i3) {
        Validate.notNull(file, "datasetLocation must not be null");
        if (!file.exists()) {
            if (!file.mkdirs()) {
                throw new IllegalStateException("Could not create directory " + file);
            }
        }
        Validate.isTrue(file.isDirectory(), "datasetLocation must point to a directory");
        Validate.notNull(searcher, "searcher must not be null");
        Validate.isTrue(i > 0, "minSeeds must be greater zero");
        Validate.isTrue(i2 > i, "maxSeeds must be greater minSeeds");
        Validate.isTrue(i3 > 0, "mentionsPerSeed must be greater zero");

        int seedCount = i;
        while (seedCount <= i2) {
            File runDirectory = new File(file, String.valueOf(seedCount));
            DatasetCreator creator = new DatasetCreator(runDirectory, searcher, seedCount, i3, true);
            creator.generateDataset(file2);
            String columnFilePath = new File(runDirectory, "allColumn.txt").getPath();
            String testFilePath = new File(file, "seedsTest" + seedCount + ".txt").getPath();
            FileHelper.copyFile(columnFilePath, testFilePath);
            seedCount++;
        }
    }

    /**
     * Command-line entry point; the body is empty, so running this class does nothing.
     *
     * @param strArr the command-line arguments (unused)
     */
    public static void main(String[] strArr) {
    }
}
