package ws.palladian.retrieval;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import ws.palladian.helper.ProgressMonitor;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.functional.Consumer;
import ws.palladian.helper.functional.Filter;
import ws.palladian.helper.functional.Filters;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.retrieval.FormEncodedHttpEntity;
import ws.palladian.retrieval.helper.NoThrottle;
import ws.palladian.retrieval.helper.RequestThrottle;
import ws.palladian.retrieval.parser.DocumentParser;
import ws.palladian.retrieval.parser.ParserException;
import ws.palladian.retrieval.parser.ParserFactory;
import ws.palladian.retrieval.parser.json.JsonException;
import ws.palladian.retrieval.parser.json.JsonObject;

/* loaded from: input_file:ws/palladian/retrieval/DocumentRetriever.class */
public class DocumentRetriever {
    /** Logger shared by all instances of this class. */
    private static final Logger LOGGER = LoggerFactory.getLogger(DocumentRetriever.class);
    /** Retriever that {@code getText} and {@code getDocument} use to execute HTTP requests; also the target of {@code switchAgent}. */
    private final HttpRetriever httpRetriever;
    /** Default thread count; equals the value the constructor assigns to {@code numThreads}. */
    public static final int DEFAULT_NUM_THREADS = 10;
    /** User-data key under which {@code getDocument} attaches the {@link HttpResult} to documents fetched over HTTP. */
    public static final String HTTP_RESULT_KEY = "httpResult";
    /** Size of the thread pool / number of worker threads used by the bulk retrieval methods. */
    private int numThreads;
    /** Throttle whose {@code hold()} is called before each URL is processed in the bulk document retrieval. */
    private RequestThrottle requestThrottle;
    /** Filter consulted before any retrieval; locations it does not accept are skipped. */
    private Filter<? super String> downloadFilter;
    /** Headers added to every GET request built by this class; {@code null} means none. */
    private Map<String, String> globalHeaders;
    /** Callbacks invoked with every document produced by {@code getDocument}. */
    private final List<Consumer<Document>> retrieverCallbacks;
    /** Pool of user agent strings from which {@code switchAgent} picks one at random. */
    private List<String> userAgents;

    /**
     * Creates a retriever backed by the {@link HttpRetriever} obtained from
     * {@link HttpRetrieverFactory#getHttpRetriever()}.
     */
    public DocumentRetriever() {
        this(HttpRetrieverFactory.getHttpRetriever());
    }

    /**
     * Creates a retriever that executes its HTTP requests through the given {@link HttpRetriever}.
     * The instance starts with {@link #DEFAULT_NUM_THREADS} threads, no request throttling, a
     * download filter of {@code Filters.ALL}, no global headers and no retriever callbacks.
     *
     * @param httpRetriever the retriever used for all HTTP access of this instance
     */
    public DocumentRetriever(HttpRetriever httpRetriever) {
        this.httpRetriever = httpRetriever;
        this.numThreads = DEFAULT_NUM_THREADS;
        this.requestThrottle = NoThrottle.INSTANCE;
        this.downloadFilter = Filters.ALL;
        this.globalHeaders = null;
        this.retrieverCallbacks = new ArrayList<Consumer<Document>>();
        initializeAgents();
    }

    /**
     * Retrieves the given location and parses it with the HTML parser.
     *
     * @param str URL or local file path of the document
     * @return the parsed document, or {@code null} if the location was filtered out or retrieval/parsing failed
     */
    public Document getWebDocument(String str) {
        final boolean parseAsXml = false;
        return getDocument(str, parseAsXml);
    }

    /**
     * Retrieves all given locations in parallel and passes every successfully parsed document to
     * the consumer. No file-type specific consumers are used.
     *
     * @param collection URLs or file paths to retrieve
     * @param consumer   receives each retrieved document
     */
    public void getWebDocuments(Collection<String> collection, Consumer<Document> consumer) {
        final Map<String, Consumer<String>> noFileTypeConsumers = null;
        getWebDocuments(collection, consumer, noFileTypeConsumers);
    }

    /**
     * Retrieves all given locations in parallel, reporting progress through a newly created
     * {@link ProgressMonitor} labelled "DocumentRetriever".
     *
     * @param collection URLs or file paths to retrieve
     * @param consumer   receives each retrieved document
     * @param map        optional consumers keyed by file type; may be {@code null}
     */
    public void getWebDocuments(Collection<String> collection, Consumer<Document> consumer, Map<String, Consumer<String>> map) {
        ProgressMonitor monitor = new ProgressMonitor(collection.size(), 0.5d, "DocumentRetriever");
        getWebDocuments(collection, consumer, map, monitor);
    }

    /**
     * Retrieves all given locations in parallel using a fixed pool of {@code numThreads} threads.
     * The locations are processed in batches of 10000; each batch gets its own pool, and this
     * method blocks until the batch has finished before starting the next one.
     *
     * <p>For every location the request throttle is applied first. If {@code map} contains a
     * consumer for the location's file type (as determined by {@code FileHelper.getFileType}),
     * that consumer receives the location string and no document is downloaded. Otherwise the
     * location is fetched via {@link #getWebDocument(String)} and a non-null result is passed to
     * {@code consumer}.</p>
     *
     * <p>If the calling thread is interrupted while waiting, the outstanding tasks are cancelled,
     * the interrupt flag is restored and the method returns without processing further batches.</p>
     *
     * @param collection      URLs or file paths to retrieve
     * @param consumer        receives each retrieved document
     * @param map             optional consumers keyed by file type; may be {@code null}
     * @param progressMonitor optional progress monitor, incremented once per location; may be {@code null}
     */
    public void getWebDocuments(Collection<String> collection, final Consumer<Document> consumer, final Map<String, Consumer<String>> map, final ProgressMonitor progressMonitor) {
        final int batchSize = 10000;
        List<String> urls = new ArrayList<String>(collection);
        for (int offset = 0; offset < urls.size(); offset += batchSize) {
            ExecutorService executor = Executors.newFixedThreadPool(this.numThreads);
            for (final String url : CollectionHelper.getSublist(urls, offset, batchSize)) {
                // A plain Runnable is enough: the pool supplies the threads, so wrapping the
                // work in a named Thread object had no effect.
                executor.submit(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            DocumentRetriever.this.requestThrottle.hold();
                            Consumer<String> fileTypeConsumer = null;
                            if (map != null) {
                                fileTypeConsumer = map.get(FileHelper.getFileType(url));
                            }
                            if (fileTypeConsumer != null) {
                                fileTypeConsumer.process(url);
                            } else {
                                Document webDocument = DocumentRetriever.this.getWebDocument(url);
                                if (webDocument != null) {
                                    consumer.process(webDocument);
                                }
                            }
                        } catch (RuntimeException e) {
                            // submit() would otherwise swallow the failure silently.
                            LOGGER.error(url + ", " + e.getMessage(), e);
                        } finally {
                            // Count the location even when processing failed.
                            if (progressMonitor != null) {
                                progressMonitor.incrementAndPrintProgress();
                            }
                        }
                    }
                });
            }
            executor.shutdown();
            LOGGER.info("waiting for all 10000 threads to finish...");
            StopWatch stopWatch = new StopWatch();
            try {
                while (!executor.awaitTermination(5L, TimeUnit.SECONDS)) {
                    LOGGER.debug("wait crawling");
                }
            } catch (InterruptedException e) {
                LOGGER.error(e.getMessage(), e);
                executor.shutdownNow();
                // Restore the flag so callers can observe the interruption.
                Thread.currentThread().interrupt();
                return;
            }
            LOGGER.info("...all threads finished in " + stopWatch.getTotalElapsedTimeString());
        }
    }

    /**
     * Retrieves all given locations in parallel and collects the resulting documents.
     *
     * @param collection URLs or file paths to retrieve
     * @return the set of successfully retrieved documents
     */
    public Set<Document> getWebDocuments(Collection<String> collection) {
        final Set<Document> documents = new HashSet<Document>();
        Consumer<Document> collector = new Consumer<Document>() {
            @Override
            public void process(Document document) {
                // Called from several pool threads; HashSet itself is not thread-safe.
                synchronized (documents) {
                    documents.add(document);
                }
            }
        };
        getWebDocuments(collection, collector);
        return documents;
    }

    /**
     * Retrieves the given location and parses it with the XML parser.
     *
     * @param str URL or local file path of the document
     * @return the parsed document, or {@code null} if the location was filtered out or retrieval/parsing failed
     */
    public Document getXmlDocument(String str) {
        final boolean parseAsXml = true;
        return getDocument(str, parseAsXml);
    }

    /**
     * Retrieves the text at the given location and parses it as a JSON object.
     *
     * @param str URL or local file path
     * @return the parsed object, or {@code null} if no text could be retrieved or the text is
     *         empty after trimming
     * @throws JsonException if the retrieved text cannot be parsed as a JSON object
     */
    public JsonObject getJsonObject(String str) throws JsonException {
        String rawText = getText(str);
        if (rawText == null) {
            return null;
        }
        String json = rawText.trim();
        if (json.isEmpty()) {
            return null;
        }
        return new JsonObject(json);
    }

    /**
     * Sends the given parameters as a form-encoded entity to the URL and parses the response body
     * as a JSON object. The request is executed through this instance's {@link HttpRetriever}, so
     * the retriever supplied at construction (and its configuration, e.g. the user agent set by
     * {@link #switchAgent()}) is honoured, as it is for the other retrieval methods of this class.
     *
     * @param str        URL to send the request to
     * @param map        form parameters to send
     * @param httpMethod HTTP method to use
     * @return the parsed response
     * @throws JsonException         if the response body cannot be parsed as a JSON object
     * @throws IllegalStateException if the HTTP request fails
     */
    public JsonObject getJsonObject(String str, Map<String, String> map, HttpMethod httpMethod) throws JsonException {
        HttpRequest2Builder requestBuilder = new HttpRequest2Builder(httpMethod, str);
        requestBuilder.setEntity(new FormEncodedHttpEntity.Builder().addData(map).m3create());
        try {
            HttpResult result = this.httpRetriever.execute(requestBuilder.m8create());
            return new JsonObject(result.getStringContent());
        } catch (HttpException e) {
            throw new IllegalStateException("HTTP exception while accessing " + str + ": " + e.getMessage(), e);
        }
    }

    /**
     * Sends the given parameters as a form-encoded {@code POST} request and parses the response
     * as a JSON object.
     *
     * @param str URL to send the request to
     * @param map form parameters to send
     * @return the parsed response
     * @throws JsonException if the response body cannot be parsed as a JSON object
     */
    public JsonObject getJsonObject(String str, Map<String, String> map) throws JsonException {
        final HttpMethod method = HttpMethod.POST;
        return getJsonObject(str, map, method);
    }

    /**
     * Like {@link #getJsonObject(String)}, but never throws a {@link JsonException}: a parse
     * failure is logged through the class logger and reported as {@code null}.
     *
     * @param str URL or local file path
     * @return the parsed object, or {@code null} if nothing could be retrieved or parsing failed
     */
    public JsonObject tryGetJsonObject(String str) {
        try {
            return getJsonObject(str);
        } catch (JsonException e) {
            // Route the failure through the logger instead of printing to stderr.
            LOGGER.error(str + ", " + e.getMessage(), e);
            return null;
        }
    }

    /**
     * Retrieves the content at the given location as text. Locations that do not contain
     * {@code http://} or {@code https://} are read as local files; everything else is fetched
     * with a GET request that carries the global headers, if any are set.
     *
     * @param str URL or local file path
     * @return the text, or {@code null} if the download filter rejects the location or retrieval
     *         fails (the failure is logged)
     */
    public String getText(String str) {
        if (!this.downloadFilter.accept(str)) {
            return null;
        }
        try {
            if (isFile(str)) {
                return FileHelper.readFileToString(str);
            }
            HttpRequest2Builder requestBuilder = new HttpRequest2Builder(HttpMethod.GET, str);
            if (this.globalHeaders != null) {
                requestBuilder.addHeaders(this.globalHeaders);
            }
            HttpResult result = this.httpRetriever.execute(requestBuilder.m8create());
            // Use HttpResult#getStringContent(), as getJsonObject(String, Map, HttpMethod) does,
            // instead of decoding the raw bytes with the JVM's platform-default charset.
            return result.getStringContent();
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is not lost.
            LOGGER.error(str + ", " + e.getMessage(), e);
            return null;
        }
    }

    /**
     * Retrieves the texts of all given locations using {@code numThreads} worker threads and
     * passes every non-null text to the consumer. The method blocks until the workers have
     * finished.
     *
     * <p>If the calling thread is interrupted while waiting for a worker, a warning is logged,
     * waiting continues with the next worker, and the interrupt flag is restored before the
     * method returns.</p>
     *
     * @param collection URLs or file paths to retrieve
     * @param consumer   receives each retrieved text; called from the worker threads
     */
    public void getTexts(Collection<String> collection, final Consumer<String> consumer) {
        final LinkedBlockingQueue<String> pending = new LinkedBlockingQueue<String>(collection);
        Thread[] workers = new Thread[this.numThreads];
        for (int i = 0; i < workers.length; i++) {
            workers[i] = new Thread() {
                @Override
                public void run() {
                    // poll() is the single point of truth for "is there work left"; checking
                    // size() first could race with other workers and force a pointless sleep.
                    String url;
                    while ((url = pending.poll()) != null) {
                        String text = DocumentRetriever.this.getText(url);
                        if (text != null) {
                            consumer.process(text);
                        }
                    }
                }
            };
            workers[i].start();
        }
        boolean interrupted = false;
        for (Thread worker : workers) {
            try {
                worker.join();
            } catch (InterruptedException e) {
                LOGGER.warn("Encountered InterruptedException");
                interrupted = true;
            }
        }
        if (interrupted) {
            // Restore the flag only after all joins, otherwise later joins would fail at once.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Retrieves the texts of all given locations in parallel and collects them.
     *
     * @param collection URLs or file paths to retrieve
     * @return the set of successfully retrieved texts
     */
    public Set<String> getTexts(Collection<String> collection) {
        final Set<String> texts = new HashSet<String>();
        Consumer<String> collector = new Consumer<String>() {
            @Override
            public void process(String str) {
                // Called from several worker threads; HashSet itself is not thread-safe.
                synchronized (texts) {
                    texts.add(str);
                }
            }
        };
        getTexts(collection, collector);
        return texts;
    }

    /**
     * Retrieves and parses the document at the given location. Locations that do not contain
     * {@code http://} or {@code https://} are read from the local file system; everything else is
     * fetched with a GET request carrying the global headers, if any. On success the document URI
     * is set, the {@link HttpResult} is attached under {@link #HTTP_RESULT_KEY} (HTTP only) and
     * the retriever callbacks are invoked.
     *
     * @param str location of the document; surrounding whitespace is ignored
     * @param z   {@code true} to parse with the XML parser, {@code false} for the HTML parser
     * @return the parsed document, or {@code null} if the download filter rejects the location or
     *         retrieval/parsing fails (the failure is logged)
     */
    private Document getDocument(String str, boolean z) {
        String location = str.trim();
        if (!this.downloadFilter.accept(location)) {
            return null;
        }
        Document document = null;
        BufferedInputStream fileStream = null;
        try {
            if (isFile(location)) {
                File file = new File(location);
                fileStream = new BufferedInputStream(new FileInputStream(file));
                document = parse(fileStream, z);
                document.setDocumentURI(file.toURI().toString());
            } else {
                HttpRequest2Builder requestBuilder = new HttpRequest2Builder(HttpMethod.GET, location);
                if (this.globalHeaders != null) {
                    requestBuilder.addHeaders(this.globalHeaders);
                }
                HttpResult result = this.httpRetriever.execute(requestBuilder.m8create());
                document = parse(result, z);
                document.setDocumentURI(location);
                document.setUserData(HTTP_RESULT_KEY, result, null);
            }
            callRetrieverCallback(document);
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is not lost.
            LOGGER.error(str + ", " + e.getMessage(), e);
        } finally {
            // Single close point; fileStream stays null on the HTTP path.
            FileHelper.close(new Closeable[] {fileStream});
        }
        return document;
    }

    /**
     * Decides whether a location is treated as a local file.
     *
     * @param str the location to check
     * @return {@code true} if the string contains neither {@code http://} nor {@code https://}
     */
    private static boolean isFile(String str) {
        boolean mentionsHttpScheme = str.contains("http://") || str.contains("https://");
        return !mentionsHttpScheme;
    }

    /**
     * Creates the parser for the requested document kind.
     *
     * @param z {@code true} for an XML parser, {@code false} for an HTML parser
     * @return a parser obtained from {@link ParserFactory}
     */
    private DocumentParser getParser(boolean z) {
        if (z) {
            return ParserFactory.createXmlParser();
        }
        return ParserFactory.createHtmlParser();
    }

    /**
     * Parses an HTTP result into a document.
     *
     * @param httpResult the result to parse
     * @param z          {@code true} to use the XML parser, {@code false} for the HTML parser
     * @return the parsed document
     * @throws ParserException if parsing fails
     */
    private Document parse(HttpResult httpResult, boolean z) throws ParserException {
        DocumentParser parser = getParser(z);
        return parser.parse(httpResult);
    }

    /**
     * Parses the content of a stream into a document.
     *
     * @param inputStream the stream to read from
     * @param z           {@code true} to use the XML parser, {@code false} for the HTML parser
     * @return the parsed document
     * @throws ParserException if parsing fails
     */
    private Document parse(InputStream inputStream, boolean z) throws ParserException {
        DocumentParser parser = getParser(z);
        return parser.parse(inputStream);
    }

    /**
     * Sets the number of threads used by the bulk retrieval methods.
     *
     * @param i the new thread count
     */
    public void setNumThreads(int i) {
        numThreads = i;
    }

    /**
     * @return the number of threads used by the bulk retrieval methods
     */
    public int getNumThreads() {
        return numThreads;
    }

    /**
     * Sets the filter that decides which locations are retrieved at all.
     *
     * @param filter the new download filter
     */
    public void setDownloadFilter(Filter<String> filter) {
        downloadFilter = filter;
    }

    /**
     * @return the filter that decides which locations are retrieved
     */
    public Filter<? super String> getDownloadFilter() {
        return downloadFilter;
    }

    /**
     * Passes the document to every registered retriever callback, in registration order.
     *
     * @param document the document that was just retrieved
     */
    private void callRetrieverCallback(Document document) {
        for (Consumer<Document> callback : this.retrieverCallbacks) {
            callback.process(document);
        }
    }

    /**
     * @return the registered retriever callbacks; this is the internal list itself, not a copy
     */
    public List<Consumer<Document>> getRetrieverCallbacks() {
        return retrieverCallbacks;
    }

    /**
     * Registers a callback that is invoked with every retrieved document.
     *
     * @param consumer the callback to add
     */
    public void addRetrieverCallback(Consumer<Document> consumer) {
        retrieverCallbacks.add(consumer);
    }

    /**
     * Unregisters a previously added retriever callback.
     *
     * @param consumer the callback to remove
     */
    public void removeRetrieverCallback(Consumer<Document> consumer) {
        retrieverCallbacks.remove(consumer);
    }

    /**
     * @return the headers added to every GET request of this retriever, or {@code null} if none are set
     */
    public Map<String, String> getGlobalHeaders() {
        return globalHeaders;
    }

    /**
     * Sets the headers that are added to every GET request of this retriever.
     *
     * @param map the headers, or {@code null} to send none
     */
    public void setGlobalHeaders(Map<String, String> map) {
        globalHeaders = map;
    }

    /**
     * @return the throttle applied before each location is processed in bulk document retrieval
     */
    public RequestThrottle getRequestThrottle() {
        return requestThrottle;
    }

    /**
     * Sets the throttle whose {@code hold()} is called before each location is processed in bulk
     * document retrieval.
     *
     * @param requestThrottle the throttle to use
     */
    public void setRequestThrottle(RequestThrottle requestThrottle) {
        this.requestThrottle = requestThrottle;
    }

    /**
     * Fills {@code userAgents} with the fixed pool of user agent strings from which
     * {@link #switchAgent()} chooses. The order of the entries is fixed; the retriever's own
     * {@code HttpRetriever.USER_AGENT} is the fourth entry.
     */
    private void initializeAgents() {
        String[] knownAgents = {
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.52.7 (KHTML, like Gecko) Version/5.1 Safari/534.50",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)",
            HttpRetriever.USER_AGENT,
            "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.2.15 Version/10.10",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; InfoPath.2; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET CLR 1.1.4322)",
            "Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
            "Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.34 (KHTML, like Gecko) rekonq Safari/534.34",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB6; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; OfficeLiveConnector.1.4; OfficeLivePatch.1.3)",
            "IE 7 ? Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.23) Gecko/20110920 Firefox/3.6.23 SearchToolbar/1.2",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; .NET CLR 1.1.4322; InfoPath.2; .NET CLR 3.5.21022)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET CLR 1.1.4322; Tablet PC 2.0; OfficeLiveConnector.1.3; OfficeLivePatch.1.3; MS-RTC LM 8; InfoPath.3)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; FDM; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 1.1.4322)"
        };
        List<String> agents = new ArrayList<String>();
        for (String agent : knownAgents) {
            agents.add(agent);
        }
        this.userAgents = agents;
    }

    /**
     * Sets a randomly chosen entry of the user agent pool as the user agent of the underlying
     * {@link HttpRetriever}.
     */
    public void switchAgent() {
        int randomIndex = (int) (Math.random() * this.userAgents.size());
        String agent = this.userAgents.get(randomIndex);
        this.httpRetriever.setUserAgent(agent);
    }

    /**
     * @return the {@link HttpRetriever} this instance was constructed with
     */
    public HttpRetriever getHttpRetriever() {
        return httpRetriever;
    }

    /**
     * Manual demo entry point.
     *
     * <p>NOTE(review): the JVM is terminated by {@code System.exit(0)} right after the retriever
     * is constructed, so none of the demo code following that call is ever executed.</p>
     *
     * @param strArr command line arguments (unused)
     * @throws Exception declared for convenience; nothing in the executed part throws
     */
    public static void main(String[] strArr) throws Exception {
        DocumentRetriever retriever = new DocumentRetriever();
        System.exit(0);

        // Never reached, see note above.
        Consumer<Document> uriLogger = new Consumer<Document>() {
            @Override
            public void process(Document document) {
                LOGGER.info(document.getDocumentURI());
            }
        };
        retriever.addRetrieverCallback(uriLogger);

        Set<String> urls = new HashSet<String>();
        urls.add("http://www.cinefreaks.com");
        urls.add("http://www.imdb.com");
        retriever.setNumThreads(10);

        Set<Document> documents = retriever.getWebDocuments(urls);
        CollectionHelper.print(documents);

        Document single = retriever.getWebDocument("http://www.cinefreaks.com");
        LOGGER.info(single.getDocumentURI());
    }
}
