package ws.palladian.retrieval.feeds.discovery;

import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.constants.Language;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.retrieval.HttpRetriever;
import ws.palladian.retrieval.HttpRetrieverFactory;
import ws.palladian.retrieval.feeds.discovery.DiscoveredFeed;
import ws.palladian.retrieval.parser.DocumentParser;
import ws.palladian.retrieval.parser.ParserException;
import ws.palladian.retrieval.parser.ParserFactory;
import ws.palladian.retrieval.search.Searcher;
import ws.palladian.retrieval.search.SearcherException;

/* loaded from: input_file:ws/palladian/retrieval/feeds/discovery/FeedDiscovery.class */
public final class FeedDiscovery {
    /**
     * XPath selecting {@code link} elements whose lower-cased {@code rel} attribute contains
     * "alternate" and whose lower-cased {@code type} attribute is the Atom or RSS MIME type.
     */
    private static final String FEED_XPATH = "//link[contains(translate(@rel, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'alternate') and (translate(@type, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')='application/atom+xml' or translate(@type, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')='application/rss+xml')]";

    /** Searcher used by {@code searchSites} to turn a query into result URLs. */
    private final Searcher<?> searcher;

    /** Number of page-checking threads started by {@code findFeeds}. */
    private final int numThreads;

    /** File to which {@code writeDiscoveredFeeds} appends one line per discovered feed. */
    private final File resultFilePath;

    /** Stop watch created at the beginning of {@code findFeeds}; used for progress logging. */
    private StopWatch stopWatch;

    /** Number of search results requested for each query. */
    private final int numResults;

    /** If {@code true}, feeds are written via {@code toCsv()}, otherwise only the feed link is written. */
    private final boolean csvOutput;

    private static final Logger LOGGER = LoggerFactory.getLogger(FeedDiscovery.class);

    /** Retriever shared by all instances for downloading pages. */
    private static final HttpRetriever httpRetriever = HttpRetrieverFactory.getHttpRetriever();

    /** HTML parser shared by all instances. */
    private static final DocumentParser parser = ParserFactory.createHtmlParser();

    /** URLs (as returned by {@code searchSites}) which still have to be checked for feeds. */
    private final BlockingQueue<String> urlQueue = new LinkedBlockingQueue<String>();

    /** Search queries which still have to be sent to the searcher. */
    private final BlockingQueue<String> queryQueue = new LinkedBlockingQueue<String>();

    /** Number of feeds discovered so far. */
    private final AtomicInteger feedCounter = new AtomicInteger();

    /** Number of URLs checked so far. */
    private final AtomicInteger pageCounter = new AtomicInteger();

    /** Number of URLs for which feed discovery returned {@code null}. */
    private final AtomicInteger errorCounter = new AtomicInteger();

    /**
     * Creates a new feed discovery.
     *
     * @param searcher the searcher used for resolving queries to URLs, not <code>null</code>
     * @param file the file to which discovered feeds are appended, not <code>null</code>
     * @param i the number of threads checking pages for feeds, greater than zero
     * @param i2 the number of search results to request per query, greater than zero
     * @param z <code>true</code> to write each feed as CSV line, <code>false</code> to write only the feed link
     * @throws NullPointerException if {@code searcher} or {@code file} is <code>null</code>
     * @throws IllegalArgumentException if {@code i} or {@code i2} is not greater than zero
     */
    public FeedDiscovery(Searcher<?> searcher, File file, int i, int i2, boolean z) {
        Validate.notNull(searcher, "webSearcher must not be null");
        // Fail fast: a null file would otherwise only show up as an exception in the worker
        // threads for every page with feeds, and no result would ever be written.
        Validate.notNull(file, "resultFilePath must not be null");
        Validate.isTrue(i > 0, "numThreads must be greater zero");
        Validate.isTrue(i2 > 0, "numResults must be greater zero");
        this.searcher = searcher;
        this.resultFilePath = file;
        this.numThreads = i;
        this.numResults = i2;
        this.csvOutput = z;
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Runs the given query against the searcher (language English) and collects the domains of
     * the result URLs. A {@link SearcherException} is logged and whatever has been collected up
     * to that point is returned.
     *
     * @param str the search query
     * @param i the number of results to request from the searcher
     * @return the set of domains extracted from the result URLs, never <code>null</code>
     */
    public Set<String> searchSites(String str, int i) {
        Set<String> domains = new HashSet<String>();
        try {
            for (String resultUrl : searcher.searchUrls(str, i, Language.ENGLISH)) {
                String domain = UrlHelper.getDomain(resultUrl);
                domains.add(domain);
            }
        } catch (SearcherException e) {
            LOGGER.error("Searcher Exception: {}", e.getMessage());
        }
        return domains;
    }

    /**
     * Downloads and parses the page at the given URL and discovers the feeds linked in it.
     *
     * @param str the URL of the page to check
     * @return the discovered feeds, or <code>null</code> if the page could not be retrieved or
     *         parsed (the failure is logged); callers use <code>null</code> to detect errors
     */
    public static List<DiscoveredFeed> discoverFeeds(String str) {
        Document page;
        try {
            page = parser.parse(httpRetriever.httpGet(str));
        } catch (Throwable th) {
            LOGGER.error("Error retrieving {} : {} ; {}", new Object[] {str, th.toString(), th.getMessage()});
            return null;
        }
        if (page == null) {
            return null;
        }
        return discoverFeeds(page);
    }

    /**
     * Parses the given file and discovers the feeds linked in it.
     *
     * @param file the HTML file to check
     * @return the discovered feeds, or <code>null</code> if a {@link ParserException} occurred
     *         (the failure is logged)
     */
    public static List<DiscoveredFeed> discoverFeeds(File file) {
        try {
            Document parsedFile = parser.parse(file);
            return discoverFeeds(parsedFile);
        } catch (ParserException e) {
            LOGGER.error("Error parsing file {}", file, e);
            return null;
        }
    }

    /**
     * Discovers feeds in the given document by evaluating {@code FEED_XPATH} and turning every
     * matching node with a non-empty <code>href</code> attribute into a {@link DiscoveredFeed}.
     * Nodes with a missing or empty <code>href</code> are skipped with a warning.
     *
     * @param document the document to search for feed links
     * @return the discovered feeds in document order, empty if there are none
     */
    public static List<DiscoveredFeed> discoverFeeds(Document document) {
        List<DiscoveredFeed> feeds = new LinkedList<DiscoveredFeed>();
        String pageUrl = document.getDocumentURI();
        String baseUrl = UrlHelper.getBaseUrl(document);

        for (Node linkNode : XPathHelper.getXhtmlNodes(document, FEED_XPATH)) {
            NamedNodeMap attributes = linkNode.getAttributes();

            Node hrefAttribute = attributes.getNamedItem("href");
            if (hrefAttribute == null) {
                LOGGER.warn("href attribute is missing");
                continue;
            }
            String href = hrefAttribute.getNodeValue();
            if (href.isEmpty()) {
                LOGGER.warn("href attribute is empty");
                continue;
            }

            // rewrite the feed: pseudo scheme before resolving against the page/base URL
            String cleanedHref = href.replace("feed://", "http://").replace("feed:", "");
            String feedUrl = UrlHelper.makeFullUrl(pageUrl, baseUrl, cleanedHref);

            // the XPath only matches nodes carrying a type attribute
            String mimeType = attributes.getNamedItem("type").getNodeValue().toLowerCase();
            DiscoveredFeed.Type feedType;
            if (mimeType.contains("atom")) {
                feedType = DiscoveredFeed.Type.ATOM;
            } else if (mimeType.contains("rss")) {
                feedType = DiscoveredFeed.Type.RSS;
            } else {
                feedType = null;
            }

            Node titleAttribute = attributes.getNamedItem("title");
            String feedTitle = titleAttribute == null ? null : titleAttribute.getNodeValue();

            feeds.add(new DiscoveredFeed(feedType, feedUrl, feedTitle, pageUrl));
        }

        LOGGER.debug("{} feeds for {}", feeds.size(), pageUrl);
        return feeds;
    }

    /**
     * Processes all queued queries and checks the resulting URLs for feeds. One thread sends the
     * queries to the searcher and puts the results into the URL queue, while {@code numThreads}
     * worker threads take URLs from that queue, discover feeds and write them to the result file.
     * The workers are started as soon as the first query delivered results (or all queries have
     * been processed). This method blocks until all threads have finished; if the calling thread
     * is interrupted while waiting, a warning is logged and its interrupt status is restored
     * before returning.
     */
    public void findFeeds() {
        this.stopWatch = new StopWatch();
        LOGGER.info("Start finding feeds with {} queries and {} results per query = max. {} URLs to check for feeds; number of threads = {}", new Object[] {this.queryQueue.size(), this.numResults, this.numResults * this.queryQueue.size(), this.numThreads});

        // Released once the first URLs are in the queue or the query thread has ended. A latch
        // (unlike a bare wait/notify) cannot miss a signal which is sent before the caller waits.
        final CountDownLatch workAvailable = new CountDownLatch(1);

        final Thread queryThread = new Thread() {
            @Override
            public void run() {
                try {
                    int totalQueries = queryQueue.size();
                    int processedQueries = 0;
                    String query;
                    while ((query = queryQueue.poll()) != null) {
                        Set<String> sites = searchSites(query, numResults);
                        // enqueue first, signal afterwards, so that woken threads find the URLs
                        urlQueue.addAll(sites);
                        if (!sites.isEmpty()) {
                            workAvailable.countDown();
                        }
                        processedQueries++;
                        // floating point arithmetic: no truncation to zero and no
                        // ArithmeticException when the elapsed time is still 0 ms
                        float queriesPerMinute = (float) processedQueries * TimeUnit.MINUTES.toMillis(1L) / stopWatch.getElapsedTime();
                        LOGGER.info("Queried {}/{}: '{}'; # results: {}; progress: {}%; query speed: {} queries/min", new Object[] {processedQueries, totalQueries, query, sites.size(), (100.0f * processedQueries) / totalQueries, queriesPerMinute});
                    }
                    LOGGER.info("Finished queries in {}", stopWatch.getElapsedTimeString());
                } finally {
                    // also release the caller if there were no results at all or this thread dies
                    workAvailable.countDown();
                }
            }
        };
        queryThread.start();

        boolean interrupted = false;
        try {
            workAvailable.await();
        } catch (InterruptedException e) {
            LOGGER.warn("Encountered InterruptedException");
            interrupted = true;
        }

        Thread[] workers = new Thread[this.numThreads];
        for (int t = 0; t < workers.length; t++) {
            workers[t] = new Thread() {
                @Override
                public void run() {
                    while (true) {
                        // Must be sampled BEFORE polling: once the query thread has terminated,
                        // no more URLs can arrive, so an empty queue then means "all done".
                        // Looking at the (possibly already emptied) query queue instead would let
                        // workers quit while the results of the last query are still pending.
                        boolean queriesFinished = !queryThread.isAlive();
                        String url;
                        try {
                            url = queriesFinished ? urlQueue.poll() : urlQueue.poll(1L, TimeUnit.SECONDS);
                        } catch (InterruptedException e) {
                            LOGGER.warn("Encountered InterruptedException");
                            Thread.currentThread().interrupt();
                            return;
                        }
                        if (url != null) {
                            checkUrlForFeeds(url);
                        } else if (queriesFinished) {
                            return;
                        }
                    }
                }
            };
            workers[t].start();
        }

        try {
            queryThread.join();
            for (Thread worker : workers) {
                worker.join();
            }
        } catch (InterruptedException e) {
            LOGGER.warn("Encountered InterruptedException");
            interrupted = true;
        }
        if (interrupted) {
            // hand the interruption on to the caller instead of swallowing it
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Checks a single URL for feeds, writes the result, updates the counters and logs the
     * progress after every 1000 checked pages. Any failure is logged and does not propagate, so
     * that a worker thread keeps running.
     *
     * @param url the URL to check
     */
    private void checkUrlForFeeds(String url) {
        try {
            List<DiscoveredFeed> feeds = discoverFeeds(url);
            writeDiscoveredFeeds(feeds);
            if (feeds != null) {
                feedCounter.addAndGet(feeds.size());
            } else {
                // null signals that the page could not be retrieved or parsed
                errorCounter.incrementAndGet();
            }
            if (pageCounter.incrementAndGet() % 1000 == 0) {
                float elapsedMinutes = (float) stopWatch.getElapsedTime() / TimeUnit.MINUTES.toMillis(1L);
                LOGGER.info("# checked pages: {}; # discovered feeds: {}; # errors: {}; elapsed time: {}; throughput: {} pages/min; discovery speed: {} feeds/min; url queue size: {}", new Object[] {pageCounter.intValue(), feedCounter.intValue(), errorCounter.intValue(), stopWatch.getElapsedTimeString(), pageCounter.get() / elapsedMinutes, feedCounter.get() / elapsedMinutes, urlQueue.size()});
            }
        } catch (Throwable th) {
            LOGGER.error("Encountered Exception", th);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Appends the given feeds to the result file, one line per feed. Depending on
     * {@code csvOutput}, a line is either the feed's CSV representation or just its feed link.
     * The method is synchronized, so only one thread at a time appends through this instance.
     *
     * @param list the feeds to write; <code>null</code> is ignored
     */
    public synchronized void writeDiscoveredFeeds(List<DiscoveredFeed> list) {
        if (list == null) {
            return;
        }
        for (DiscoveredFeed feed : list) {
            String targetPath = resultFilePath.getPath();
            String line;
            if (csvOutput) {
                line = feed.toCsv() + "\n";
            } else {
                line = feed.getFeedLink() + "\n";
            }
            FileHelper.appendFile(targetPath, line);
        }
    }

    /**
     * Appends a single search query to the queue of queries processed by {@code findFeeds}.
     *
     * @param str the query to add
     */
    public void addQuery(String str) {
        queryQueue.add(str);
    }

    /**
     * Appends all given search queries to the queue of queries processed by {@code findFeeds}.
     *
     * @param collection the queries to add
     */
    public void addQueries(Collection<String> collection) {
        queryQueue.addAll(collection);
    }

    /**
     * Reads the file at the given path via {@code FileHelper.readFileToArray} and adds the
     * returned entries as search queries.
     *
     * @param str the path of the file containing the queries
     */
    public void addQueries(String str) {
        Collection<String> queriesFromFile = FileHelper.readFileToArray(str);
        addQueries(queriesFromFile);
    }

    /**
     * Replaces the queued queries by a shuffled set of single and combined queries.
     * <ul>
     * <li>If more queries are queued than requested, a random selection of {@code i} of the
     * queued queries is kept.</li>
     * <li>If {@code i} is <code>-1</code>, or larger than the number of single queries plus all
     * possible pairs, the result consists of all single queries plus every pair, written as two
     * quoted terms.</li>
     * <li>Otherwise, randomly drawn pairs (two terms separated by a space) are added to the
     * single queries until {@code i} queries are reached.</li>
     * </ul>
     *
     * @param i the desired number of queries, or <code>-1</code> for all possible pairs
     * @throws IllegalArgumentException if {@code i} is smaller than <code>-1</code>
     */
    public void combineQueries(int i) {
        Validate.isTrue(i >= -1, "targetCount must be -1 or greater");

        List<String> singleQueries = new ArrayList<String>(this.queryQueue);
        int available = singleQueries.size();
        Collections.shuffle(singleQueries);
        // long arithmetic, as n * (n - 1) exceeds the int range for large queues
        long possiblePairs = (long) available * (available - 1) / 2;

        List<String> result;
        if (i != -1 && available > i) {
            // More queries than requested: keep only a random selection of them (instead of
            // appending that selection to the complete list, which produced duplicates).
            result = new ArrayList<String>(singleQueries.subList(0, i));
        } else {
            result = new ArrayList<String>(singleQueries);
            if (i == -1 || i > possiblePairs + available) {
                // all unordered pairs
                for (int first = 0; first < available; first++) {
                    for (int second = first + 1; second < available; second++) {
                        result.add("\"" + singleQueries.get(first) + "\" \"" + singleQueries.get(second) + "\"");
                    }
                }
            } else {
                // NOTE(review): random pairs are unquoted, may repeat and may pair a term with
                // itself, unlike the exhaustive branch above - confirm whether this is intended.
                Random random = new Random();
                while (result.size() < i) {
                    String firstTerm = singleQueries.get(random.nextInt(available));
                    String secondTerm = singleQueries.get(random.nextInt(available));
                    result.add(firstTerm + " " + secondTerm);
                }
            }
        }

        Collections.shuffle(result);
        this.queryQueue.clear();
        this.queryQueue.addAll(result);
    }
}
