package ws.palladian.retrieval;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import java.util.function.Predicate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import ws.palladian.helper.ProgressMonitor;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.functional.Predicates;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.retrieval.helper.NoThrottle;
import ws.palladian.retrieval.helper.RequestThrottle;
import ws.palladian.retrieval.search.DocumentRetrievalTrial;

/* loaded from: input_file:ws/palladian/retrieval/WebDocumentRetriever.class */
/**
 * Base class for retrievers that load web pages as DOM {@link Document}s.
 *
 * <p>Subclasses implement {@link #getWebDocument(String)}; this class adds sequential and
 * multi-threaded bulk retrieval on top of it, plus configuration holders for a download filter,
 * a request throttle, global headers, per-file-type consumers and callbacks.</p>
 */
public abstract class WebDocumentRetriever {
    private static final Logger LOGGER = LoggerFactory.getLogger(WebDocumentRetriever.class);

    /** Default size of the thread pool used by the parallel {@code getWebDocuments} methods. */
    public static final int DEFAULT_NUM_THREADS = 10;

    /** User-data key under which the originally requested URL is attached to a retrieved document. */
    public static final String ORIGINAL_REQUEST_URL = "requestUrl";

    /** Number of URLs handed to one thread pool before the pool is drained and replaced. */
    private static final int BATCH_SIZE = 10000;

    private Predicate<? super String> downloadFilter = Predicates.ALL;
    private int numThreads = DEFAULT_NUM_THREADS;
    private RequestThrottle requestThrottle = NoThrottle.INSTANCE;
    private final List<Consumer<Document>> retrieverCallbacks = new ArrayList<>();
    private Consumer<DocumentRetrievalTrial> errorCallback = null;
    private Map<String, Consumer<String>> fileTypeConsumers = new HashMap<>();
    Map<String, String> globalHeaders = new HashMap<>();

    /**
     * Retrieve the document behind the given URL.
     *
     * @param str The URL to retrieve.
     * @return The retrieved document; callers in this class treat {@code null} as "nothing retrieved".
     */
    public abstract Document getWebDocument(String str);

    /**
     * Retrieve the document behind the given URL on behalf of a given thread. This default
     * implementation ignores the thread and delegates to {@link #getWebDocument(String)}.
     *
     * @param str The URL to retrieve.
     * @param thread The thread performing the retrieval (unused here).
     * @return The result of {@link #getWebDocument(String)}.
     */
    public Document getWebDocument(String str, Thread thread) {
        return getWebDocument(str);
    }

    /**
     * Retrieve the document behind the given URL and return its inner XML as a string.
     *
     * @param str The URL to retrieve.
     * @return The inner XML of the document, or {@code null} if no document was retrieved.
     */
    public String getText(String str) {
        Document webDocument = getWebDocument(str);
        if (webDocument == null) {
            return null;
        }
        return HtmlHelper.getInnerXml(webDocument);
    }

    /** @return The map of global headers (the internal instance, not a copy). */
    public Map<String, String> getGlobalHeaders() {
        return this.globalHeaders;
    }

    /** @param map The map of global headers to use from now on. */
    public void setGlobalHeaders(Map<String, String> map) {
        this.globalHeaders = map;
    }

    /** @param predicate The filter to store as download filter. */
    public void setDownloadFilter(Predicate<String> predicate) {
        this.downloadFilter = predicate;
    }

    /** @return The currently stored download filter. */
    public Predicate<? super String> getDownloadFilter() {
        return this.downloadFilter;
    }

    /** @param i The number of threads for the pool used by parallel retrieval. */
    public void setNumThreads(int i) {
        this.numThreads = i;
    }

    /** @return The number of threads for the pool used by parallel retrieval. */
    public int getNumThreads() {
        return this.numThreads;
    }

    /** @return The throttle that is held before each request issued by this class. */
    public RequestThrottle getRequestThrottle() {
        return this.requestThrottle;
    }

    /** @param requestThrottle The throttle to hold before each request. */
    public void setRequestThrottle(RequestThrottle requestThrottle) {
        this.requestThrottle = requestThrottle;
    }

    /**
     * Retrieve the given URLs in parallel, reporting progress through a newly created
     * {@link ProgressMonitor} named "DocumentRetriever".
     *
     * @param collection The URLs to retrieve.
     * @param consumer Receives every document that was retrieved successfully.
     */
    public void getWebDocuments(Collection<String> collection, Consumer<Document> consumer) {
        getWebDocuments(collection, consumer, new ProgressMonitor(collection.size(), 1.0d, "DocumentRetriever"));
    }

    /**
     * Retrieve the given URLs in parallel. The URLs are processed in batches of {@value #BATCH_SIZE};
     * each batch gets its own fixed thread pool with {@link #getNumThreads()} threads, and the method
     * waits for a batch to finish before starting the next one.
     *
     * <p>If the calling thread is interrupted while waiting, the running batch is cancelled, the
     * interrupt flag is restored and the method returns without processing the remaining batches.</p>
     *
     * @param collection The URLs to retrieve; a snapshot is taken on entry.
     * @param consumer Receives every non-null document, called from the worker threads.
     * @param progressMonitor Incremented once per URL, may be {@code null}.
     */
    public void getWebDocuments(Collection<String> collection, final Consumer<Document> consumer, final ProgressMonitor progressMonitor) {
        List<String> urls = new ArrayList<>(collection);
        for (int offset = 0; offset < urls.size(); offset += BATCH_SIZE) {
            ExecutorService executor = Executors.newFixedThreadPool(getNumThreads());
            try {
                for (final String url : CollectionHelper.getSublist(urls, offset, BATCH_SIZE)) {
                    executor.submit(() -> retrieveAndConsume(url, consumer, progressMonitor));
                }
            } finally {
                // always stop accepting tasks, so the pool threads can terminate even if submission failed
                executor.shutdown();
            }
            LOGGER.info("waiting for all " + BATCH_SIZE + " threads to finish...");
            StopWatch stopWatch = new StopWatch();
            try {
                while (!executor.awaitTermination(5L, TimeUnit.SECONDS)) {
                    LOGGER.debug("wait crawling");
                }
            } catch (InterruptedException e) {
                LOGGER.error(e.getMessage(), e);
                // cancel outstanding work and hand the interrupt on to the caller
                executor.shutdownNow();
                Thread.currentThread().interrupt();
                return;
            }
            LOGGER.info("...all threads finished in " + stopWatch.getTotalElapsedTimeString());
        }
    }

    /**
     * Worker task for a single URL: waits for the throttle, lets a matching file-type consumer handle
     * the URL, otherwise retrieves the document and passes it to the consumer. Failures are logged
     * and the progress monitor is incremented in any case.
     */
    private void retrieveAndConsume(String url, Consumer<Document> consumer, ProgressMonitor progressMonitor) {
        Thread.currentThread().setName("Retrieving: " + url);
        try {
            getRequestThrottle().hold();
            if (!reactToFileTypeConsumer(url, getFileTypeConsumers())) {
                Document document = getWebDocument(url, Thread.currentThread());
                if (document != null) {
                    document.setUserData(ORIGINAL_REQUEST_URL, url, null);
                    consumer.accept(document);
                }
            }
        } catch (RuntimeException e) {
            // without this, the exception would vanish inside the Future returned by submit()
            LOGGER.error("Retrieving " + url + " failed: " + e.getMessage(), e);
        } finally {
            if (progressMonitor != null) {
                progressMonitor.incrementAndPrintProgress();
            }
        }
    }

    /**
     * Retrieve the given URLs one after another on the calling thread, holding the request throttle
     * before each request.
     *
     * @param collection The URLs to retrieve.
     * @return The set of results of {@link #getWebDocument(String)}; as results are added unchecked,
     *         the set contains {@code null} if a retrieval returned {@code null}.
     */
    public Set<Document> getWebDocuments(Collection<String> collection) {
        Set<Document> documents = new HashSet<>();
        for (String url : collection) {
            getRequestThrottle().hold();
            documents.add(getWebDocument(url));
        }
        return documents;
    }

    /**
     * Pass the URL to the consumer registered for its file type, if there is one.
     * NOTE: decompiler output indicates this method was originally package-private.
     *
     * @param str The URL to check.
     * @param map Consumers keyed by file type as returned by {@code FileHelper.getFileType}, may be {@code null}.
     * @return {@code true} if a consumer was found and invoked, {@code false} otherwise.
     */
    public boolean reactToFileTypeConsumer(String str, Map<String, Consumer<String>> map) {
        if (map == null) {
            return false;
        }
        Consumer<String> consumer = map.get(FileHelper.getFileType(str));
        if (consumer == null) {
            return false;
        }
        consumer.accept(str);
        return true;
    }

    /** Hook for releasing resources; does nothing in this base class. */
    public void close() {
    }

    /** @return The consumers keyed by file type (the internal instance, not a copy). */
    public Map<String, Consumer<String>> getFileTypeConsumers() {
        return this.fileTypeConsumers;
    }

    /** @param map The consumers keyed by file type to use from now on. */
    public void setFileTypeConsumers(Map<String, Consumer<String>> map) {
        this.fileTypeConsumers = map;
    }

    /**
     * Pass the given document to every registered retriever callback, in registration order.
     * NOTE: decompiler output indicates this method was originally package-private.
     *
     * @param document The document to pass to the callbacks.
     */
    public void callRetrieverCallback(Document document) {
        for (Consumer<Document> callback : this.retrieverCallbacks) {
            callback.accept(document);
        }
    }

    /** @return The registered retriever callbacks (the internal list, not a copy). */
    public List<Consumer<Document>> getRetrieverCallbacks() {
        return this.retrieverCallbacks;
    }

    /** @param consumer The callback to register. */
    public void addRetrieverCallback(Consumer<Document> consumer) {
        this.retrieverCallbacks.add(consumer);
    }

    /** @param consumer The callback to unregister. */
    public void removeRetrieverCallback(Consumer<Document> consumer) {
        this.retrieverCallbacks.remove(consumer);
    }

    /** @return The stored error callback, {@code null} if none was set. */
    public Consumer<DocumentRetrievalTrial> getErrorCallback() {
        return this.errorCallback;
    }

    /** @param consumer The error callback to store. */
    public void setErrorCallback(Consumer<DocumentRetrievalTrial> consumer) {
        this.errorCallback = consumer;
    }
}
