package ws.palladian.retrieval;

import java.net.UnknownHostException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import ws.palladian.helper.Callback;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.ThreadHelper;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.nlp.PatternHelper;
import ws.palladian.retrieval.helper.NoThrottle;
import ws.palladian.retrieval.helper.RequestThrottle;

/* loaded from: input_file:ws/palladian/retrieval/Crawler.class */
public class Crawler {
    private static final Logger LOGGER = LoggerFactory.getLogger(Crawler.class);
    /** Retriever used by {@code crawl} to download documents and to obtain the download filter. */
    private WebDocumentRetriever documentRetriever;
    /** Size of the fixed thread pool held in {@link #executor}; both constructors set it to 10. */
    private int maxThreads;
    /** Minutes without a completed crawl task after which the crawl loop ends (converted with {@code TimeUnit.MINUTES}). */
    private int silentStopTime;
    /** Whether {@code crawl} tries to put a URL back on the stack when no document could be obtained for it. */
    private boolean retryFailedRetrievals;
    /** Counter exposed through {@code getThreadCount()}; nothing in this class ever modifies it. */
    private final AtomicInteger threadCount;
    /** When true, {@code addUrlToStack} records where each queued URL came from in {@link #trackedLinks}. */
    private boolean trackLinks;
    /** Maps a cleaned, queued URL to the second argument passed to {@code addUrlToStack} (the source URL). */
    private Map<String, String> trackedLinks;
    /** Pool that runs the crawl tasks; shut down at the end of a crawl and replaced by {@code setMaxThreads}. */
    private ExecutorService executor;
    /** {@code hold()} is called on this throttle at the start of every {@code crawl} call. */
    private RequestThrottle requestThrottle;
    /** Optional callback that receives the URL whose crawl task threw. */
    private Consumer<String> errorCallback;
    // The three flags below are passed straight to HtmlHelper.getLinks; presumably they select
    // in-domain, out-of-domain and sub-domain links respectively -- confirm against HtmlHelper.
    private boolean inDomain;
    private boolean outDomain;
    private boolean subDomain;
    /** If non-empty, a URL is only queued when at least one of these patterns is found in it. */
    protected final Set<Pattern> whiteListUrlRegexps;
    /** A URL in which any of these patterns is found is never queued. */
    protected final Set<Pattern> blackListUrlRegexps;
    /** Substrings; links (extracted with inDomain=false, outDomain=true) containing one of them are added to the crawl. */
    protected final Set<String> whiteListLinkDomains;
    /** Pattern -> replacement pairs applied in insertion order by {@code cleanUrl}. */
    private final LinkedHashMap<Pattern, String> urlModificationRegexps;
    /** Query attribute names that {@code cleanUrl} keeps when query parameters are stripped; also passed to HtmlHelper.getLinks. */
    private final Set<String> urlAttributeModification;
    /** Maximum number of visited URLs; -1 means no limit, 0 (set by {@code stopCrawl}) stops the crawl. */
    private int stopCount;
    /** URLs waiting to be crawled. */
    private Set<String> urlStack;
    /** URLs that have been taken from the stack. */
    private Set<String> visitedUrls;
    /** Whether {@code cleanUrl} removes the query string (except the attributes in {@link #urlAttributeModification}). */
    private boolean stripQueryParams;
    /** Passed to HtmlHelper.getLinks; presumably makes it skip rel="nofollow" links -- confirm against HtmlHelper. */
    private boolean respectNoFollow;
    /** Optional callback invoked once after the crawl loop has ended and the pool has terminated. */
    private Callback crawlerCallbackOnFinish;
    /** Optional consumers keyed by the value of {@code FileHelper.getFileType(url)}; a match bypasses document retrieval. */
    private Map<String, Consumer<String>> fileTypeConsumers;

    /**
     * Creates a crawler with the default settings that downloads pages through a new
     * {@link DocumentRetriever}.
     *
     * <p>Delegates to {@link #Crawler(WebDocumentRetriever)} so the default settings live in one
     * place. The retriever is now constructed before the thread pool, so a failing retriever
     * constructor no longer leaves an orphaned executor behind.
     */
    public Crawler() {
        this(new DocumentRetriever());
    }

    /**
     * Creates a crawler with the default settings that downloads pages through the given retriever.
     *
     * <p>Defaults: 10 threads, 10 minutes silent stop time, retry of failed retrievals enabled,
     * no throttling, in-domain and out-of-domain links followed, query parameters stripped,
     * no stop count.
     *
     * @param webDocumentRetriever the retriever used for all downloads
     */
    public Crawler(WebDocumentRetriever webDocumentRetriever) {
        // Threading and timing.
        this.maxThreads = 10;
        this.executor = Executors.newFixedThreadPool(this.maxThreads);
        this.threadCount = new AtomicInteger(0);
        this.silentStopTime = 10;
        this.stopCount = -1;
        this.requestThrottle = NoThrottle.INSTANCE;

        // Link selection.
        this.inDomain = true;
        this.outDomain = true;
        this.subDomain = false;
        this.respectNoFollow = false;
        this.whiteListUrlRegexps = new HashSet<>();
        this.blackListUrlRegexps = new HashSet<>();
        this.whiteListLinkDomains = new HashSet<>();

        // URL normalisation.
        this.stripQueryParams = true;
        this.urlModificationRegexps = new LinkedHashMap<>();
        this.urlAttributeModification = new HashSet<>();

        // Crawl state shared between worker threads.
        this.urlStack = Collections.synchronizedSet(new HashSet<>());
        this.visitedUrls = Collections.synchronizedSet(new HashSet<>());
        this.trackLinks = false;
        this.trackedLinks = Collections.synchronizedMap(new HashMap<>());

        // Retrieval and callbacks.
        this.retryFailedRetrievals = true;
        this.crawlerCallbackOnFinish = null;
        this.fileTypeConsumers = null;
        this.documentRetriever = webDocumentRetriever;
    }

    /**
     * @return the throttle whose {@code hold()} is called before every crawled URL
     */
    public RequestThrottle getRequestThrottle() {
        return requestThrottle;
    }

    /**
     * Replaces the throttle that is consulted before every crawled URL.
     *
     * @param requestThrottle the throttle to use from now on
     */
    public void setRequestThrottle(RequestThrottle requestThrottle) {
        // The parameter shadows the field, so the qualifier is required here.
        this.requestThrottle = requestThrottle;
    }

    /**
     * Extension hook consulted by {@code addUrlToStack} for every cleaned URL; a URL for which
     * this returns {@code false} is not queued. This implementation accepts every URL.
     *
     * @param str the cleaned URL
     * @return always {@code true}
     */
    public boolean validate(String str) {
        final boolean accepted = true;
        return accepted;
    }

    /**
     * Processes a single URL that was taken from the URL stack.
     *
     * <p>After waiting on the request throttle, the URL is either handed to the consumer
     * registered for its file type (in which case nothing else happens), or downloaded through the
     * document retriever. The links of a downloaded document, plus any out-of-domain links
     * containing one of the white-listed link domains, are offered to the URL stack.
     *
     * <p>The retriever is passed to {@link #release(WebDocumentRetriever)} in a {@code finally}
     * block, so it is released even when retrieval or link extraction throws.
     *
     * @param str the URL to crawl
     */
    protected void crawl(String str) {
        LOGGER.info("catch from stack: {}", str);
        this.requestThrottle.hold();

        // Read the map once so a concurrent setFileTypeConsumers(null) cannot cause a NullPointerException.
        Map<String, Consumer<String>> consumers = getFileTypeConsumers();
        if (consumers != null) {
            Consumer<String> consumer = consumers.get(FileHelper.getFileType(str));
            if (consumer != null) {
                consumer.accept(str);
                return;
            }
        }

        WebDocumentRetriever documentRetriever = getDocumentRetriever();
        try {
            Document webDocument = documentRetriever.getWebDocument(str);
            if (webDocument != null) {
                Set<String> links = HtmlHelper.getLinks(webDocument, webDocument.getDocumentURI(), this.inDomain,
                        this.outDomain, "", this.respectNoFollow, this.subDomain, this.urlAttributeModification);

                if (!this.whiteListLinkDomains.isEmpty()) {
                    // Out-of-domain links are collected separately and only kept when they
                    // contain one of the white-listed domain strings.
                    Set<String> outLinks = HtmlHelper.getLinks(webDocument, webDocument.getDocumentURI(), false,
                            true, "", false, this.subDomain, this.urlAttributeModification);
                    for (String domain : this.whiteListLinkDomains) {
                        List<String> matching = outLinks.stream()
                                .filter(link -> link.contains(domain))
                                .collect(Collectors.toList());
                        links.addAll(matching);
                    }
                }

                // Progress is always logged at the very start/end and otherwise only during
                // every fifth wall-clock second to keep the log volume down.
                if (this.urlStack.isEmpty() || this.visitedUrls.isEmpty()
                        || (System.currentTimeMillis() / 1000) % 5 == 0) {
                    LOGGER.info("retrieved {} links from {} || stack size: {}, visited: {}",
                            links.size(), str, this.urlStack.size(), this.visitedUrls.size());
                }
                addUrlsToStack(links, str);
            } else if (isRetryFailedRetrievals() && documentRetriever.getDownloadFilter().test(str)) {
                LOGGER.error("could not get {}, putting it back on the stack for later", str);
                // NOTE(review): getUrlFromStack() has already put this URL into visitedUrls and
                // addUrlToStack() rejects visited URLs, so this only re-queues the URL when
                // cleanUrl() changes it -- confirm whether the retry is meant to be effective.
                addUrlToStack(str, str);
            }
        } finally {
            release(documentRetriever);
        }
    }

    /**
     * Sets how long the crawl loop keeps running without any crawl task completing.
     *
     * @param i the silent stop time in minutes
     */
    public void setSilentStopTime(int i) {
        silentStopTime = i;
    }

    /**
     * Requests the crawl to stop by setting the stop count to 0: the crawl loop then ends and
     * tasks that have not started yet return without crawling.
     */
    public void stopCrawl() {
        final int stopNow = 0;
        setStopCount(stopNow);
    }

    /**
     * Runs the crawl loop on the calling thread until it ends, then waits for all workers.
     *
     * <p>The loop takes URLs from the stack and submits one task per URL to the executor. It ends
     * when the number of visited URLs reaches the stop count (unless the stop count is -1), or when
     * no task has completed for {@code silentStopTime} minutes. Afterwards the executor is shut
     * down, the method blocks until all submitted tasks have finished, and the finish callback (if
     * any) is invoked.
     *
     * <p>If the waiting thread is interrupted, the interrupt flag is restored and waiting stops.
     */
    private void startCrawl() {
        // A finished crawl leaves the pool shut down; without a fresh one every URL taken from
        // the stack below would be dropped silently.
        if (this.executor.isShutdown()) {
            this.executor = Executors.newFixedThreadPool(this.maxThreads);
        }

        final AtomicLong lastActivity = new AtomicLong(System.currentTimeMillis());
        final long silentStopMillis = TimeUnit.MINUTES.toMillis(this.silentStopTime);

        while ((this.stopCount == -1 || this.visitedUrls.size() < this.stopCount)
                && System.currentTimeMillis() - lastActivity.get() < silentStopMillis) {
            try {
                final String urlFromStack = getUrlFromStack();
                if (urlFromStack == null) {
                    // Nothing queued right now; workers may still add links.
                    ThreadHelper.deepSleep(1000);
                    continue;
                }
                if (!this.executor.isShutdown()) {
                    // A plain task is enough: the pool runs it on its own threads, so a
                    // Thread object (and its name) would never be used as a thread.
                    this.executor.submit(() -> {
                        try {
                            if (this.stopCount == 0) {
                                return; // stopCrawl() was called after this task was queued
                            }
                            crawl(urlFromStack);
                            lastActivity.set(System.currentTimeMillis());
                        } catch (Throwable th) {
                            LOGGER.error(th.getMessage(), th);
                            if (this.errorCallback != null) {
                                this.errorCallback.accept(urlFromStack);
                            }
                        }
                    });
                }
            } catch (Exception e) {
                LOGGER.error(e.getMessage(), e);
            }
        }

        this.executor.shutdown();
        LOGGER.info("waiting for all threads to finish...");
        StopWatch stopWatch = new StopWatch();
        try {
            while (!this.executor.awaitTermination(5L, TimeUnit.SECONDS)) {
                LOGGER.debug("wait crawling");
            }
        } catch (InterruptedException e) {
            // Restore the flag for callers and stop waiting; retrying would fail immediately.
            Thread.currentThread().interrupt();
            LOGGER.error(e.getMessage(), e);
        }
        LOGGER.info("...all threads finished in " + stopWatch.getTotalElapsedTimeString());

        if (this.crawlerCallbackOnFinish != null) {
            this.crawlerCallbackOnFinish.callback();
        }
    }

    /**
     * Starts a crawl from several seed URLs and blocks until the crawl has finished.
     * Any URLs still on the stack are discarded first.
     *
     * @param set the seed URLs
     * @param z   value for the in-domain flag passed to link extraction
     * @param z2  value for the out-of-domain flag passed to link extraction
     * @param z3  value for the sub-domain flag passed to link extraction
     */
    public void startCrawl(Set<String> set, boolean z, boolean z2, boolean z3) {
        // Link-following configuration for this run.
        inDomain = z;
        outDomain = z2;
        subDomain = z3;

        // Seed the stack.
        urlStack.clear();
        urlStack.addAll(set);

        startCrawl();
    }

    /**
     * Starts a crawl from a single seed URL and blocks until the crawl has finished.
     * Any URLs still on the stack are discarded first.
     *
     * @param str the seed URL
     * @param z   value for the in-domain flag passed to link extraction
     * @param z2  value for the out-of-domain flag passed to link extraction
     * @param z3  value for the sub-domain flag passed to link extraction
     */
    public void startCrawl(String str, boolean z, boolean z2, boolean z3) {
        // Link-following configuration for this run.
        inDomain = z;
        outDomain = z2;
        subDomain = z3;

        // Seed the stack.
        urlStack.clear();
        urlStack.add(str);

        startCrawl();
    }

    /**
     * Takes an arbitrary URL from the stack and marks it as visited.
     *
     * <p>The stack is created with {@code Collections.synchronizedSet}, whose iterator is only
     * safe while the caller holds the set's monitor. Because the set is also handed out by
     * {@link #getUrlStack()}, the iteration is done under that monitor in addition to this
     * method's own lock.
     *
     * @return the URL that was removed from the stack, or {@code null} if the stack is empty
     */
    private synchronized String getUrlFromStack() {
        final Set<String> stack = this.urlStack;
        final String next;
        synchronized (stack) {
            Iterator<String> it = stack.iterator();
            if (!it.hasNext()) {
                return null;
            }
            next = it.next();
            stack.remove(next);
        }
        this.visitedUrls.add(next);
        return next;
    }

    /**
     * Sets the number of visited URLs at which the crawl loop ends.
     *
     * @param i the limit; -1 disables the limit, 0 stops a running crawl
     */
    public void setStopCount(int i) {
        stopCount = i;
    }

    /**
     * @return the number of visited URLs at which the crawl loop ends, or -1 if there is no limit
     */
    public int getStopCount() {
        return stopCount;
    }

    /**
     * Compiles the given regular expression and adds it to the URL white list.
     *
     * @param str the regular expression source
     */
    public void addWhiteListRegexp(String str) {
        final Pattern compiled = Pattern.compile(str);
        whiteListUrlRegexps.add(compiled);
    }

    /**
     * Adds every regular expression of the given set to the URL white list.
     *
     * @param set the regular expression sources
     */
    public void addWhiteListRegexps(Set<String> set) {
        for (String regexp : set) {
            addWhiteListRegexp(regexp);
        }
    }

    /**
     * Compiles the given regular expression and adds it to the URL black list.
     *
     * @param str the regular expression source
     */
    public void addBlackListRegexp(String str) {
        final Pattern compiled = Pattern.compile(str);
        blackListUrlRegexps.add(compiled);
    }

    /**
     * Adds every regular expression of the given set to the URL black list.
     *
     * @param set the regular expression sources
     */
    public void addBlackListRegexps(Set<String> set) {
        for (String regexp : set) {
            addBlackListRegexp(regexp);
        }
    }

    /**
     * Adds a string to the white-listed link domains; extracted links containing it are crawled.
     *
     * @param str the string a link has to contain
     */
    public void addWhiteListLinkDomains(String str) {
        whiteListLinkDomains.add(str);
    }

    /**
     * @return the live (not copied) set of white-listed link domain strings
     */
    public Set<String> getWhiteListLinkDomains() {
        return whiteListLinkDomains;
    }

    /**
     * @return the live (not copied) pattern-to-replacement map applied when URLs are cleaned
     */
    public Map<Pattern, String> getUrlModificationRegexps() {
        return urlModificationRegexps;
    }

    /**
     * Appends all given pattern-to-replacement pairs to the URL modifications.
     *
     * @param linkedHashMap the pairs to add, in the order they should be applied
     */
    public void addUrlModificationRegexps(LinkedHashMap<Pattern, String> linkedHashMap) {
        urlModificationRegexps.putAll(linkedHashMap);
    }

    /**
     * @return the live (not copied) set of query attribute names kept when URLs are cleaned
     */
    public Set<String> getUrlAttributeModification() {
        return urlAttributeModification;
    }

    /**
     * Adds a query attribute name that is kept when query parameters are stripped from URLs.
     *
     * @param str the attribute name
     */
    public void addUrlAttributeModification(String str) {
        urlAttributeModification.add(str);
    }

    /**
     * Offers every URL of the given set to the URL stack.
     *
     * @param set the candidate URLs
     * @param str the URL of the page the candidates were found on
     */
    private synchronized void addUrlsToStack(Set<String> set, String str) {
        for (String candidate : set) {
            addUrlToStack(candidate, str);
        }
    }

    /**
     * Normalises a URL before it is compared against the visited set and queued.
     *
     * <p>Steps: session id and anchor are removed; if query stripping is enabled the whole query
     * string is dropped except for the {@code name=value} pairs of the configured attributes; and
     * finally every configured pattern/replacement pair is applied in insertion order.
     *
     * @param str the raw URL
     * @return the normalised URL, or {@code null} if the URL helpers yield {@code null}
     */
    private String cleanUrl(String str) {
        String url = UrlHelper.removeAnchors(UrlHelper.removeSessionId(str));
        if (url == null) {
            // The caller explicitly handles a null result. Whether the helpers can actually
            // return null is not visible here, so this guard is purely defensive.
            return null;
        }

        if (isStripQueryParams()) {
            StringBuilder kept = new StringBuilder();
            for (String attribute : this.urlAttributeModification) {
                Matcher matcher = PatternHelper.compileOrGet("[?&]" + attribute + "=[^&]+").matcher(url);
                if (matcher.find()) {
                    kept.append(matcher.group());
                }
            }
            url = url.replaceAll("\\?.*", "") + kept;
            // If the first kept pair was matched with a leading '&', it has to start the query.
            if (!url.contains("?")) {
                url = url.replaceFirst("&", "?");
            }
        }

        for (Map.Entry<Pattern, String> entry : this.urlModificationRegexps.entrySet()) {
            try {
                url = entry.getKey().matcher(url).replaceAll(entry.getValue());
            } catch (Exception e) {
                // Best effort: a broken replacement must not prevent the remaining ones.
                LOGGER.error("could not apply URL modification " + entry.getKey() + " to " + url, e);
            }
        }
        return url;
    }

    /**
     * Cleans a URL and puts it on the URL stack if it passes all filters.
     *
     * <p>A URL is rejected if it cannot be cleaned, is 400 characters or longer, was already
     * visited, is refused by the retriever's download filter or by {@link #validate(String)},
     * matches none of the white-list patterns (when any are configured), or matches a black-list
     * pattern. If link tracking is enabled the source URL is recorded for every queued URL.
     *
     * @param str  the candidate URL
     * @param str2 the URL of the page the candidate was found on
     */
    protected synchronized void addUrlToStack(String str, String str2) {
        final String candidate = cleanUrl(str);
        if (candidate == null || candidate.length() >= 400) {
            return;
        }
        if (this.visitedUrls.contains(candidate)) {
            return;
        }
        if (!this.documentRetriever.getDownloadFilter().test(candidate)) {
            return;
        }
        if (!validate(candidate)) {
            return;
        }

        // An empty white list accepts everything; otherwise one match is required.
        boolean accepted = this.whiteListUrlRegexps.isEmpty()
                || this.whiteListUrlRegexps.stream().anyMatch(pattern -> pattern.matcher(candidate).find());

        // The black list always wins over the white list.
        if (this.blackListUrlRegexps.stream().anyMatch(pattern -> pattern.matcher(candidate).find())) {
            accepted = false;
        }

        if (!accepted) {
            return;
        }
        this.urlStack.add(candidate);
        if (this.trackLinks) {
            this.trackedLinks.put(candidate, str2);
        }
    }

    /**
     * @return the size of the thread pool that runs the crawl tasks
     */
    public int getMaxThreads() {
        return maxThreads;
    }

    /**
     * Replaces the crawl thread pool with a new fixed pool of the given size.
     *
     * <p>The previous pool is shut down in an orderly fashion (already submitted tasks still run)
     * so its threads do not linger. The new pool is created first, so an invalid size leaves the
     * crawler unchanged.
     *
     * @param i the number of threads; must be positive
     * @throws IllegalArgumentException if {@code i} is not positive
     */
    public void setMaxThreads(int i) {
        ExecutorService replacement = Executors.newFixedThreadPool(i);
        ExecutorService previous = this.executor;
        this.maxThreads = i;
        this.executor = replacement;
        if (previous != null) {
            previous.shutdown();
        }
    }

    /**
     * Returns the current value of the thread counter. The counter is initialised to 0 and
     * nothing in this class changes it afterwards.
     *
     * @return the thread counter value
     */
    public int getThreadCount() {
        final int current = threadCount.get();
        return current;
    }

    /**
     * @return the callback invoked when a crawl has finished, or {@code null} if none is set
     */
    public Callback getCrawlerCallbackOnFinish() {
        return crawlerCallbackOnFinish;
    }

    /**
     * Sets the callback invoked once after a crawl has finished and all workers have terminated.
     *
     * @param callback the callback, or {@code null} for none
     */
    public void setCrawlerCallbackOnFinish(Callback callback) {
        crawlerCallbackOnFinish = callback;
    }

    /**
     * Registers a document consumer as retriever callback on the current document retriever.
     *
     * @param consumer the consumer to register
     */
    public void addCrawlerCallback(Consumer<Document> consumer) {
        final WebDocumentRetriever retriever = this.documentRetriever;
        retriever.addRetrieverCallback(consumer);
    }

    /**
     * @return the consumers keyed by file type, or {@code null} if none are configured
     */
    public Map<String, Consumer<String>> getFileTypeConsumers() {
        return fileTypeConsumers;
    }

    /**
     * Sets consumers that receive URLs of specific file types instead of the URL being downloaded.
     *
     * @param map consumers keyed by file type, or {@code null} to disable
     */
    public void setFileTypeConsumers(Map<String, Consumer<String>> map) {
        fileTypeConsumers = map;
    }

    /**
     * @return the retriever used to download documents
     */
    public WebDocumentRetriever getDocumentRetriever() {
        return documentRetriever;
    }

    /**
     * Replaces the retriever used to download documents.
     *
     * @param webDocumentRetriever the retriever to use from now on
     */
    public void setDocumentRetriever(WebDocumentRetriever webDocumentRetriever) {
        documentRetriever = webDocumentRetriever;
    }

    /**
     * Extension hook called by {@code crawl} with the retriever it used for a URL.
     * This implementation does nothing.
     *
     * @param webDocumentRetriever the retriever that was used
     */
    public void release(WebDocumentRetriever webDocumentRetriever) {
        // Intentionally empty.
    }

    /**
     * @return whether query parameters are stripped from URLs before they are queued
     */
    public boolean isStripQueryParams() {
        return stripQueryParams;
    }

    /**
     * @return the value of the no-follow flag that is passed to link extraction
     */
    public boolean isRespectNoFollow() {
        return respectNoFollow;
    }

    /**
     * Sets the no-follow flag that is passed to link extraction.
     *
     * @param z the new flag value
     */
    public void setRespectNoFollow(boolean z) {
        respectNoFollow = z;
    }

    /**
     * Sets whether query parameters are stripped from URLs before they are queued.
     *
     * @param z {@code true} to strip query parameters
     */
    public void setStripQueryParams(boolean z) {
        stripQueryParams = z;
    }

    /**
     * @return the live (not copied) set of URLs waiting to be crawled
     */
    public Set<String> getUrlStack() {
        return urlStack;
    }

    /**
     * Replaces the set of URLs waiting to be crawled. The given set is used as is, not copied.
     *
     * @param set the new URL stack
     */
    public void setUrlStack(Set<String> set) {
        urlStack = set;
    }

    /**
     * @return the live (not copied) set of URLs that have been taken from the stack
     */
    public Set<String> getVisitedUrls() {
        return visitedUrls;
    }

    /**
     * Replaces the set of visited URLs. The given set is used as is, not copied.
     *
     * @param set the new set of visited URLs
     */
    public void setVisitedUrls(Set<String> set) {
        visitedUrls = set;
    }

    /**
     * @return whether a URL is offered to the stack again when no document could be obtained for it
     */
    public boolean isRetryFailedRetrievals() {
        return retryFailedRetrievals;
    }

    /**
     * Sets whether a URL is offered to the stack again when no document could be obtained for it.
     *
     * @param z {@code true} to retry failed retrievals
     */
    public void setRetryFailedRetrievals(boolean z) {
        retryFailedRetrievals = z;
    }

    /**
     * @return whether the source URL is recorded for every queued URL
     */
    public boolean isTrackLinks() {
        return trackLinks;
    }

    /**
     * Sets whether the source URL is recorded for every queued URL.
     *
     * @param z {@code true} to track links
     */
    public void setTrackLinks(boolean z) {
        trackLinks = z;
    }

    /**
     * @return the live (not copied) map from queued URL to the URL it was found on
     */
    public Map<String, String> getTrackedLinks() {
        return trackedLinks;
    }

    /**
     * Replaces the map of tracked links. The given map is used as is, not copied.
     *
     * @param map the new map from queued URL to source URL
     */
    public void setTrackedLinks(Map<String, String> map) {
        trackedLinks = map;
    }

    /**
     * @return the callback that receives URLs whose crawl task failed, or {@code null} if none is set
     */
    public Consumer<String> getErrorCallback() {
        return errorCallback;
    }

    /**
     * Sets the callback that receives the URL of every crawl task that threw.
     *
     * @param consumer the callback, or {@code null} for none
     */
    public void setErrorCallback(Consumer<String> consumer) {
        errorCallback = consumer;
    }

    /**
     * Demo entry point: crawls from a fixed seed URL with a single thread, logging every
     * downloaded page, until 1000 URLs have been visited.
     *
     * @param strArr ignored
     * @throws UnknownHostException declared for compatibility; not thrown by the code visible here
     */
    public static void main(String[] strArr) throws UnknownHostException {
        Crawler crawler = new Crawler();
        crawler.addCrawlerCallback(document -> LOGGER.info("downloaded the page " + document.getDocumentURI()));
        crawler.setStopCount(1000);
        crawler.setMaxThreads(1);
        crawler.startCrawl("http://www.dmoz.org/", true, true, true);
    }
}
