package ws.palladian.retrieval.analysis;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import ws.palladian.helper.ProgressMonitor;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.nlp.PatternHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.DocumentRetriever;
import ws.palladian.retrieval.HttpRetriever;
import ws.palladian.retrieval.HttpRetrieverFactory;

/* loaded from: input_file:ws/palladian/retrieval/analysis/SitemapRetriever.class */
/**
 * Extracts URLs from XML sitemaps and sitemap index files.
 *
 * <p>Documents are not parsed as XML; the text of each {@code <loc>} element is pulled out with a
 * regular expression and then cleaned up by {@link #normalizeUrl(String)}.
 */
public class SitemapRetriever {
    /**
     * Matches the text that follows a {@code >} and precedes {@code </loc}. Case-insensitive, and
     * {@code .} would match line terminators (same flag value as the former literal {@code 34}).
     */
    private static final Pattern LOC_PATTERN =
            Pattern.compile("(?<=>)[^>]+?(?=</loc)", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Temporary location for a downloaded gzipped sitemap index. */
    private static final String INDEX_GZIPPED_PATH = "data/temp/sitemapIndex.xml.gzipped";

    /** Temporary location for the unpacked sitemap index. */
    private static final String INDEX_XML_PATH = "data/temp/sitemapIndex.xml";

    /**
     * Reads a sitemap index, downloads every sitemap it lists and collects the URLs found in the
     * {@code <loc>} elements of those sitemaps.
     *
     * <p>Each sitemap is downloaded to a temporary file below {@code data/temp/}; these files are
     * removed again even when a sitemap cannot be read.
     *
     * @param sitemapIndexUrl location of the sitemap index; if its file type is {@code gz} it is
     *        downloaded and unpacked before being read
     * @return the normalized URLs in encounter order without duplicates; empty if the index yields
     *         no text or lists no sitemaps
     */
    public Set<String> getUrls(String sitemapIndexUrl) {
        Set<String> pageUrls = new LinkedHashSet<>();
        // NOTE(review): "m12create" looks like a decompiler-generated alias of the factory's
        // create method — confirm the real name against the library.
        HttpRetriever httpRetriever = new HttpRetrieverFactory(true).m12create();

        String indexText = readSitemapIndex(sitemapIndexUrl, httpRetriever);
        if (indexText == null) {
            // Assumes getText signals a failed retrieval with null — TODO confirm.
            return pageUrls;
        }

        List<String> sitemapLocations = StringHelper.getRegexpMatches(LOC_PATTERN, indexText);
        ProgressMonitor progressMonitor =
                new ProgressMonitor(sitemapLocations.size(), 0.1d, "SitemapRetriever");
        int sitemapNumber = 1;
        for (String location : sitemapLocations) {
            String sitemapUrl = normalizeUrl(location);
            String downloadPath = "data/temp/sitemap" + sitemapNumber + ".xml.gzipped";
            String xmlPath = downloadPath.replace(".gzipped", "");
            try {
                httpRetriever.downloadAndSave(sitemapUrl, downloadPath);
                if ("gz".equalsIgnoreCase(FileHelper.getFileType(sitemapUrl))) {
                    FileHelper.ungzipFile(downloadPath, xmlPath);
                } else {
                    // Plain XML: the download is already the final content, just under the other name.
                    FileHelper.copyFile(downloadPath, xmlPath);
                }
                String sitemapText = FileHelper.tryReadFileToString(xmlPath);
                if (sitemapText != null) {
                    collectLocations(sitemapText, pageUrls);
                }
            } finally {
                // Clean up regardless of whether the sitemap could be read, so failed
                // downloads do not leave stale files behind.
                deleteIfExists(downloadPath);
                deleteIfExists(xmlPath);
            }
            sitemapNumber++;
            // Count unreadable sitemaps as processed too, so the monitor can reach its total.
            progressMonitor.incrementAndPrintProgress();
        }
        return pageUrls;
    }

    /**
     * Reads the URLs of a single sitemap without filtering.
     *
     * @param sitemapUrl location of the sitemap
     * @return the normalized, non-empty URLs of all {@code <loc>} elements in document order
     */
    public List<String> readSitemap(String sitemapUrl) {
        // "." is found in every non-empty URL, so effectively nothing is filtered out.
        return readSitemap(sitemapUrl, ".");
    }

    /**
     * Reads the URLs of a single sitemap and keeps those in which the given regular expression is
     * found.
     *
     * @param sitemapUrl location of the sitemap
     * @param urlFilterRegex regular expression, applied case-insensitively with
     *        {@link java.util.regex.Matcher#find()}
     * @return the matching normalized URLs in document order; never {@code null}
     * @throws java.util.regex.PatternSyntaxException if {@code urlFilterRegex} is not a valid
     *         regular expression
     */
    public List<String> readSitemap(String sitemapUrl, String urlFilterRegex) {
        Pattern urlFilter = Pattern.compile(urlFilterRegex, Pattern.CASE_INSENSITIVE);
        List<String> matchingUrls = new ArrayList<>();
        for (String url : getUrlsFromSitemap(sitemapUrl)) {
            if (urlFilter.matcher(url).find()) {
                matchingUrls.add(url);
            }
        }
        return matchingUrls;
    }

    /**
     * Fetches the text of the sitemap index, unpacking it first when its file type is {@code gz}.
     *
     * @return whatever {@code DocumentRetriever.getText} returns for the (unpacked) index
     */
    private String readSitemapIndex(String sitemapIndexUrl, HttpRetriever httpRetriever) {
        DocumentRetriever documentRetriever = new DocumentRetriever(httpRetriever);
        if (!"gz".equalsIgnoreCase(FileHelper.getFileType(sitemapIndexUrl))) {
            return documentRetriever.getText(sitemapIndexUrl);
        }
        try {
            httpRetriever.downloadAndSave(sitemapIndexUrl, INDEX_GZIPPED_PATH);
            FileHelper.ungzipFile(INDEX_GZIPPED_PATH, INDEX_XML_PATH);
            return documentRetriever.getText(INDEX_XML_PATH);
        } finally {
            deleteIfExists(INDEX_XML_PATH);
            deleteIfExists(INDEX_GZIPPED_PATH);
        }
    }

    /** Adds the normalized content of every {@code <loc>} element in {@code sitemapText} to {@code target}. */
    private void collectLocations(String sitemapText, Set<String> target) {
        // Pull "<loc>\n" and "\n</loc>" onto one line so that a location whose tags sit on
        // separate lines survives the line-wise matching below.
        String closingTagsJoined = PatternHelper.compileOrGet("\\n</loc>", Pattern.CASE_INSENSITIVE)
                .matcher(sitemapText)
                .replaceAll("</loc>");
        String tagsJoined = PatternHelper.compileOrGet("<loc>\\n", Pattern.CASE_INSENSITIVE)
                .matcher(closingTagsJoined)
                .replaceAll("<loc>");
        for (String line : tagsJoined.split("\n")) {
            for (String location : StringHelper.getRegexpMatches(LOC_PATTERN, line)) {
                target.add(normalizeUrl(location));
            }
        }
    }

    /** Retrieves one sitemap and returns the normalized content of its {@code <loc>} elements. */
    private List<String> getUrlsFromSitemap(String sitemapUrl) {
        List<String> urls = new ArrayList<>();
        DocumentRetriever documentRetriever =
                new DocumentRetriever(new HttpRetrieverFactory(true).m12create());
        String sitemapText = documentRetriever.getText(sitemapUrl);
        if (sitemapText == null) {
            // Assumes getText signals a failed retrieval with null — TODO confirm.
            return urls;
        }
        for (String location : StringHelper.getRegexpMatches(LOC_PATTERN, sitemapText)) {
            urls.add(normalizeUrl(location));
        }
        return urls;
    }

    /** Deletes the file at {@code path} if it exists; a missing file is not an error. */
    private static void deleteIfExists(String path) {
        if (new java.io.File(path).exists()) {
            FileHelper.delete(path);
        }
    }

    /**
     * Cleans up the raw content of a {@code <loc>} element: removes CDATA markers, trims
     * surrounding whitespace and decodes the five predefined XML entities.
     *
     * @param str raw text taken from between the {@code <loc>} tags; must not be {@code null}
     * @return the cleaned-up URL
     */
    protected String normalizeUrl(String str) {
        String unwrapped = str.replace("<![CDATA[", "").replace("]]>", "").trim();
        // "&amp;" has to be decoded last: decoding it first would turn "&amp;lt;" into "&lt;"
        // and then, wrongly, into "<".
        return unwrapped
                .replace("&apos;", "'")
                .replace("&quot;", "\"")
                .replace("&gt;", ">")
                .replace("&lt;", "<")
                .replace("&amp;", "&");
    }
}
