package ws.palladian.retrieval.analysis;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import ws.palladian.helper.ProgressMonitor;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.Bag;
import ws.palladian.helper.constants.SizeUnit;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.DocumentRetriever;
import ws.palladian.retrieval.HttpResult;
import ws.palladian.retrieval.HttpRetriever;
import ws.palladian.retrieval.HttpRetrieverFactory;
import ws.palladian.retrieval.ranking.RankingServiceException;
import ws.palladian.retrieval.ranking.services.SemRush;

/* loaded from: input_file:ws/palladian/retrieval/analysis/SitemapAnalyzer.class */
/**
 * Downloads every page listed in a sitemap, collects a handful of per-page
 * metrics (accessibility, link counts, word count, size, external backlinks)
 * and appends them as semicolon-separated rows to a result file.
 *
 * <p>Results are accumulated in instance state ({@link #resultTable} and
 * {@link #internalInboundLinkMap}) and are never cleared by this class, so
 * calling {@link #analyzeSitemap(String, String)} repeatedly on the same
 * instance accumulates the results of all runs.</p>
 */
public class SitemapAnalyzer {
    private static final Logger LOGGER = LoggerFactory.getLogger(SitemapAnalyzer.class);

    /** Timeout, in seconds, used for both connecting and reading when fetching pages. */
    private static final long HTTP_TIMEOUT_SECONDS = 120L;

    /** Header line of the result file; written once per call to {@link #writeResultTable(String)}. */
    private static final String CSV_HEADER = "page;accessible;in-int;out-int;in-ext;out-ext;#words;size KB;indexed\n";

    /**
     * Keys of the per-page metric map, in the order they are written after the
     * page URL. The header's trailing "indexed" column has no corresponding key
     * and is therefore always empty in the output.
     */
    private static final String[] CSV_COLUMN_KEYS = {"accessible", "in-int", "out-int", "in-ext", "out-ext", "#words", "size"};

    private int numThreads = 10;

    /** Page URL (as reported by the document) mapped to that page's metrics. */
    private final ConcurrentHashMap<String, Map<String, Object>> resultTable = new ConcurrentHashMap<>();

    /** Counts how often each URL was the target of an internal link; guarded by its own monitor. */
    private final Bag<String> internalInboundLinkMap = Bag.create();

    /**
     * @return the number of threads handed to the document retriever.
     */
    public int getNumThreads() {
        return this.numThreads;
    }

    /**
     * @param i the number of threads the document retriever should use.
     */
    public void setNumThreads(int i) {
        this.numThreads = i;
    }

    /**
     * Retrieves all URLs from the given sitemap, analyzes each page and appends
     * the resulting table to the given file.
     *
     * @param str  URL of the sitemap (or sitemap index) to analyze.
     * @param str2 path of the file the result table is appended to.
     */
    public void analyzeSitemap(String str, String str2) {
        StopWatch stopWatch = new StopWatch();
        LOGGER.info("getting the page urls");
        Set<String> urls = new SitemapRetriever().getUrls(str);
        final ProgressMonitor progressMonitor = new ProgressMonitor(urls.size());
        Consumer<Document> pageAnalyzer = document -> {
            analyzePage(document);
            progressMonitor.incrementAndPrintProgress();
        };

        LOGGER.info("starting to process each page ({} in total), time elapsed: {}", urls.size(), stopWatch.getElapsedTimeString());
        int timeoutMillis = (int) TimeUnit.SECONDS.toMillis(HTTP_TIMEOUT_SECONDS);
        HttpRetriever httpRetriever = HttpRetrieverFactory.getHttpRetriever();
        httpRetriever.setConnectionTimeout(timeoutMillis);
        httpRetriever.setSocketTimeout(timeoutMillis);
        DocumentRetriever documentRetriever = new DocumentRetriever(httpRetriever);
        documentRetriever.setNumThreads(getNumThreads());
        // NOTE(review): the code below assumes getWebDocuments only returns once
        // every document has been passed to the consumer - confirm.
        documentRetriever.getWebDocuments(urls, pageAnalyzer);

        LOGGER.info("gathering all internal inbound link information, time elapsed: {}", stopWatch.getElapsedTimeString());
        for (String url : urls) {
            Map<String, Object> metrics = this.resultTable.get(url);
            if (metrics != null) {
                metrics.put("in-int", Integer.valueOf(this.internalInboundLinkMap.count(url)));
            }
        }

        LOGGER.info("saving the result table, time elapsed: {}", stopWatch.getElapsedTimeString());
        writeResultTable(str2);
    }

    /**
     * Computes the metrics of a single downloaded page and stores them in
     * {@link #resultTable} under the document's URI.
     */
    private void analyzePage(Document document) {
        Map<String, Object> metrics = new HashMap<>();
        String pageUrl = document.getDocumentURI();
        Set<String> internalLinks = HtmlHelper.getLinks(document, true, false);
        Set<String> externalLinks = HtmlHelper.getLinks(document, false, true);

        // The bag is shared between all retriever threads.
        synchronized (this.internalInboundLinkMap) {
            for (String link : internalLinks) {
                // A page linking to itself does not count as an inbound link.
                if (!link.equalsIgnoreCase(pageUrl)) {
                    this.internalInboundLinkMap.add(link);
                }
            }
        }

        // Without an attached HTTP result the "accessible" entry is left unset
        // (written as "null"), instead of hiding the problem in an empty catch.
        Object httpResult = document.getUserData(DocumentRetriever.HTTP_RESULT_KEY);
        if (httpResult instanceof HttpResult) {
            metrics.put("accessible", Boolean.valueOf(((HttpResult) httpResult).getStatusCode() < 400));
        } else {
            LOGGER.debug("No HTTP result attached to {}, cannot determine accessibility", pageUrl);
        }

        String pageContent = HtmlHelper.getInnerXml(document);
        int wordCount = StringHelper.countWords(HtmlHelper.stripHtmlTags(pageContent));

        Number externalBacklinks = null;
        try {
            externalBacklinks = new SemRush().getRanking(pageUrl).getValues().get(SemRush.BACKLINKS_PAGE);
        } catch (RankingServiceException e) {
            LOGGER.error("Error retrieving ranking: {}", e.getMessage(), e);
        }

        metrics.put("in-ext", externalBacklinks);
        metrics.put("out-int", Integer.valueOf(internalLinks.size()));
        metrics.put("out-ext", Integer.valueOf(externalLinks.size()));
        metrics.put("#words", Integer.valueOf(wordCount));
        // NOTE(review): length() counts UTF-16 chars, not bytes, so this is an
        // approximation of the page size; assumed to be measured on the inner XML.
        metrics.put("size", Long.valueOf(SizeUnit.BYTES.toKilobytes(pageContent.length())));
        this.resultTable.put(pageUrl, metrics);
    }

    /**
     * Appends the header and one row per analyzed page to the given file,
     * encoded as UTF-8. I/O failures are logged, not propagated.
     */
    private void writeResultTable(String outputPath) {
        // The file is opened in append mode, so an existing file is extended.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputPath, true), StandardCharsets.UTF_8))) {
            writer.append(CSV_HEADER);
            for (Map.Entry<String, Map<String, Object>> entry : this.resultTable.entrySet()) {
                Map<String, Object> metrics = entry.getValue();
                writer.append(entry.getKey()).append(';');
                for (String key : CSV_COLUMN_KEYS) {
                    // Missing values are written as the literal "null".
                    writer.append(String.valueOf(metrics.get(key))).append(';');
                }
                writer.append('\n');
            }
        } catch (IOException e) {
            LOGGER.error("Exception while writing to {}", outputPath, e);
        }
    }

    /**
     * Runs the analysis for a hard-coded sitemap and writes the result to
     * {@code sitemapAnalysis.csv} in the working directory.
     *
     * @param strArr command line arguments; ignored.
     */
    public static void main(String[] strArr) {
        SitemapAnalyzer sitemapAnalyzer = new SitemapAnalyzer();
        sitemapAnalyzer.setNumThreads(10);
        sitemapAnalyzer.analyzeSitemap("http://webknox.com/sitemapIndex.xml", "sitemapAnalysis.csv");
    }
}
