package ws.palladian.retrieval;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:ws/palladian/retrieval/CascadingDocumentRetriever.class */
/**
 * Document retriever that tries a fixed chain of underlying retrievers until one of them
 * delivers a document that is considered "good".
 *
 * <p>The order is: plain retriever, rendering retriever pool, PhantomJsCloud, ProxyCrawl,
 * ScrapingBee. Retrievers passed as {@code null} to the constructor are skipped. A document is
 * considered good when it is non-null, its readable text is non-empty and that text contains none
 * of the configured "bad document indicator" texts.
 *
 * <p>Per retriever the class keeps three counters (failed requests, skipped requests, successful
 * requests). Via {@link #pauseFailingRetriever(String, Integer, Integer)} a retriever can be
 * configured to be skipped for a number of requests once it has failed often enough.
 *
 * <p>The counters are kept in plain {@link HashMap}s and arrays without any synchronization.
 */
public class CascadingDocumentRetriever extends JsEnabledDocumentRetriever {
    private static final Logger LOGGER = LoggerFactory.getLogger(CascadingDocumentRetriever.class);
    public static final String RETRIEVER_PLAIN = "Plain";
    public static final String RETRIEVER_RENDERING_POOL = "RenderingPool";
    public static final String RETRIEVER_PHANTOM_JS_CLOUD = "PhantomJsCloud";
    public static final String RETRIEVER_PROXY_CRAWL = "ProxyCrawl";
    public static final String RETRIEVER_SCRAPING_BEE = "ScrapingBee";

    /** Index into a request tracker array: number of failed requests. */
    private static final int TRACKER_FAILED = 0;
    /** Index into a request tracker array: number of requests skipped while the retriever is paused. */
    private static final int TRACKER_SKIPPED = 1;
    /** Index into a request tracker array: number of successful requests. */
    private static final int TRACKER_SUCCESSFUL = 2;
    /** Index into a pause configuration array: failures needed before the retriever is paused. */
    private static final int PAUSE_FAILING_THRESHOLD = 0;
    /** Index into a pause configuration array: number of requests to skip while paused. */
    private static final int PAUSE_REQUESTS_TO_SKIP = 1;

    /** Texts whose presence in a document's readable text marks the document as bad; may be null. */
    private List<String> badDocumentIndicatorTexts;
    /** Retriever name -> {failing threshold, number of requests to skip}. */
    private final Map<String, Integer[]> failingThresholdAndNumberOfRequestsToSkip = new HashMap<>();
    /** Retriever name -> {failed count, skipped count, successful count}. */
    private final Map<String, Integer[]> requestTracker = new HashMap<>();
    private final DocumentRetriever documentRetriever;
    private final RenderingDocumentRetrieverPool renderingDocumentRetrieverPool;
    private final PhantomJsDocumentRetriever cloudDocumentRetriever;
    private final ProxyCrawlDocumentRetriever cloudDocumentRetriever2;
    private final ScrapingBeeDocumentRetriever cloudDocumentRetriever3;

    /**
     * Creates the cascade. Every argument may be {@code null}, in which case that stage is
     * skipped. A request tracker entry is registered only for the retrievers that are present.
     *
     * @param documentRetriever              plain retriever (first stage)
     * @param renderingDocumentRetrieverPool pool of rendering retrievers (second stage)
     * @param phantomJsDocumentRetriever     PhantomJsCloud retriever (third stage)
     * @param proxyCrawlDocumentRetriever    ProxyCrawl retriever (fourth stage)
     * @param scrapingBeeDocumentRetriever   ScrapingBee retriever (fifth stage)
     */
    public CascadingDocumentRetriever(DocumentRetriever documentRetriever, RenderingDocumentRetrieverPool renderingDocumentRetrieverPool, PhantomJsDocumentRetriever phantomJsDocumentRetriever, ProxyCrawlDocumentRetriever proxyCrawlDocumentRetriever, ScrapingBeeDocumentRetriever scrapingBeeDocumentRetriever) {
        this.documentRetriever = documentRetriever;
        this.renderingDocumentRetrieverPool = renderingDocumentRetrieverPool;
        this.cloudDocumentRetriever = phantomJsDocumentRetriever;
        this.cloudDocumentRetriever2 = proxyCrawlDocumentRetriever;
        this.cloudDocumentRetriever3 = scrapingBeeDocumentRetriever;
        if (this.documentRetriever != null) {
            this.requestTracker.put(RETRIEVER_PLAIN, new Integer[]{0, 0, 0});
        }
        if (this.renderingDocumentRetrieverPool != null) {
            this.requestTracker.put(RETRIEVER_RENDERING_POOL, new Integer[]{0, 0, 0});
        }
        if (this.cloudDocumentRetriever != null) {
            this.requestTracker.put(RETRIEVER_PHANTOM_JS_CLOUD, new Integer[]{0, 0, 0});
        }
        if (this.cloudDocumentRetriever2 != null) {
            this.requestTracker.put(RETRIEVER_PROXY_CRAWL, new Integer[]{0, 0, 0});
        }
        if (this.cloudDocumentRetriever3 != null) {
            this.requestTracker.put(RETRIEVER_SCRAPING_BEE, new Integer[]{0, 0, 0});
        }
    }

    /**
     * @return the result of {@code CollectionHelper.getFirst} on the configured indicator texts,
     *         or {@code null} if no indicator texts have been set
     */
    public String getBadDocumentIndicatorText() {
        if (this.badDocumentIndicatorTexts == null) {
            return null;
        }
        return (String) CollectionHelper.getFirst(this.badDocumentIndicatorTexts);
    }

    /**
     * Replaces all indicator texts with the single given text.
     *
     * @param str the only indicator text to use from now on
     */
    public void setBadDocumentIndicatorText(String str) {
        // Arrays.asList alone yields a fixed-size list, which would make a later
        // addBadDocumentIndicatorText(...) throw UnsupportedOperationException.
        this.badDocumentIndicatorTexts = new ArrayList<>(Arrays.asList(str));
    }

    /**
     * @return the configured indicator texts, or a new empty list if none have been set
     *         (never {@code null})
     */
    public List<String> getBadDocumentIndicatorTexts() {
        return this.badDocumentIndicatorTexts != null ? this.badDocumentIndicatorTexts : new ArrayList<>(1);
    }

    /**
     * Replaces all indicator texts. The list is stored by reference (no copy is made), so it has
     * to be modifiable if {@link #addBadDocumentIndicatorText(String)} is going to be used.
     *
     * @param list the indicator texts; {@code null} removes all indicator texts
     */
    public void setBadDocumentIndicatorTexts(List<String> list) {
        this.badDocumentIndicatorTexts = list;
    }

    /**
     * Adds one indicator text, creating the backing list if none exists yet.
     *
     * @param str the indicator text to add
     */
    public void addBadDocumentIndicatorText(String str) {
        if (this.badDocumentIndicatorTexts == null) {
            this.badDocumentIndicatorTexts = new ArrayList<>();
        }
        this.badDocumentIndicatorTexts.add(str);
    }

    /**
     * Retrieves the document through the cascade and returns it serialized via
     * {@code HtmlHelper.getInnerXml}.
     *
     * @param str  location of the document, passed unchanged to the underlying retrievers
     * @param list receives one message per retriever that was tried; may be {@code null}
     * @return the serialized document, or {@code null} if no document was retrieved
     */
    public String getText(String str, List<String> list) {
        Document webDocument = getWebDocument(str, list, null);
        if (webDocument == null) {
            return null;
        }
        return HtmlHelper.getInnerXml(webDocument);
    }

    /**
     * Configures pausing for one retriever: once its failure counter has reached {@code num}, the
     * next {@code num2} requests skip that retriever; afterwards its failure and skip counters
     * are reset and it is tried again.
     *
     * @param str  retriever name, one of the {@code RETRIEVER_*} constants
     * @param num  number of failed requests that triggers the pause
     * @param num2 number of requests to skip while paused
     */
    public void pauseFailingRetriever(String str, Integer num, Integer num2) {
        this.failingThresholdAndNumberOfRequestsToSkip.put(str, new Integer[]{num, num2});
    }

    @Override // ws.palladian.retrieval.WebDocumentRetriever
    public Document getWebDocument(String str) {
        return getWebDocument(str, null, null);
    }

    @Override // ws.palladian.retrieval.WebDocumentRetriever
    public Document getWebDocument(String str, Thread thread) {
        return getWebDocument(str, null, thread);
    }

    /**
     * Runs the cascade: each available, non-paused retriever is tried in order until one returns
     * a good document. The document of the last retriever that was tried is returned, even if it
     * was not considered good. If the returned document is non-null the retriever callback is
     * invoked with it.
     *
     * @param str    location of the document, passed unchanged to the underlying retrievers
     * @param list   receives one message per retriever that was tried; may be {@code null}
     * @param thread if non-null, this thread is renamed to reflect the stage currently running
     * @return the document of the last retriever tried, or {@code null}
     */
    public Document getWebDocument(String str, List<String> list, Thread thread) {
        if (list == null) {
            list = new ArrayList<>();
        }
        StopWatch stopWatch = new StopWatch();
        Document document = null;
        boolean goodDocument = false;

        if (this.documentRetriever != null && shouldMakeRequest(RETRIEVER_PLAIN)) {
            renameThread(thread, "Retrieving (plain): " + str);
            document = this.documentRetriever.getWebDocument(str);
            goodDocument = isGoodDocument(document);
            recordAttempt(RETRIEVER_PLAIN, "used normal document retriever: ", goodDocument, list, stopWatch);
        }

        if (!goodDocument && this.renderingDocumentRetrieverPool != null && shouldMakeRequest(RETRIEVER_RENDERING_POOL)) {
            RenderingDocumentRetriever renderingDocumentRetriever = null;
            try {
                renameThread(thread, "Retrieving (rendering): " + str);
                renderingDocumentRetriever = (RenderingDocumentRetriever) this.renderingDocumentRetrieverPool.acquire();
                configure(renderingDocumentRetriever);
                document = renderingDocumentRetriever.getWebDocument(str);
                goodDocument = isGoodDocument(document);
                recordAttempt(RETRIEVER_RENDERING_POOL, "used rendering js retriever: ", goodDocument, list, stopWatch);
            } finally {
                // Always hand the retriever back to the pool, also when retrieval throws.
                if (renderingDocumentRetriever != null) {
                    this.renderingDocumentRetrieverPool.recycle(renderingDocumentRetriever);
                }
            }
        }

        if (!goodDocument && this.cloudDocumentRetriever != null && shouldMakeRequest(RETRIEVER_PHANTOM_JS_CLOUD)) {
            renameThread(thread, "Retrieving (phantomjs): " + str);
            configure(this.cloudDocumentRetriever);
            document = this.cloudDocumentRetriever.getWebDocument(str);
            goodDocument = isGoodDocument(document);
            recordAttempt(RETRIEVER_PHANTOM_JS_CLOUD, "used phantom js cloud document retriever: ", goodDocument, list, stopWatch);
        }

        if (!goodDocument && this.cloudDocumentRetriever2 != null && shouldMakeRequest(RETRIEVER_PROXY_CRAWL)) {
            renameThread(thread, "Retrieving (proxycrawl): " + str);
            configure(this.cloudDocumentRetriever2);
            document = this.cloudDocumentRetriever2.getWebDocument(str);
            goodDocument = isGoodDocument(document);
            recordAttempt(RETRIEVER_PROXY_CRAWL, "used proxy crawl document retriever: ", goodDocument, list, stopWatch);
        }

        if (!goodDocument && this.cloudDocumentRetriever3 != null && shouldMakeRequest(RETRIEVER_SCRAPING_BEE)) {
            renameThread(thread, "Retrieving (scrapingbee): " + str);
            configure(this.cloudDocumentRetriever3);
            document = this.cloudDocumentRetriever3.getWebDocument(str);
            goodDocument = isGoodDocument(document);
            recordAttempt(RETRIEVER_SCRAPING_BEE, "used scraping bee document retriever: ", goodDocument, list, stopWatch);
        }

        if (document != null) {
            callRetrieverCallback(document);
        }
        return document;
    }

    /**
     * Renames the given thread for diagnostics. Renaming is best effort: a failure is logged and
     * never aborts the retrieval.
     */
    private void renameThread(Thread thread, String name) {
        if (thread == null) {
            return;
        }
        try {
            thread.setName(name);
        } catch (Exception e) {
            LOGGER.error(e.getMessage(), e);
        }
    }

    /** Updates the counters of the given retriever and appends the outcome message to the result log. */
    private void recordAttempt(String retrieverName, String messagePrefix, boolean success, List<String> resultLog, StopWatch stopWatch) {
        String outcome = success ? "success" : "fail";
        updateRequestTracker(retrieverName, success);
        resultLog.add(messagePrefix + outcome + " in " + stopWatch.getElapsedTimeStringAndIncrement() + " success count: " + getSuccessfulRequestCount(retrieverName));
    }

    /**
     * Transfers this retriever's cookies, wait-for-element map, timeout and wait exception
     * callback to the given retriever after deleting its cookies.
     */
    private void configure(JsEnabledDocumentRetriever jsEnabledDocumentRetriever) {
        jsEnabledDocumentRetriever.deleteAllCookies();
        // NOTE(review): if setWaitForElementMap stores the map by reference, the target shares
        // this object's map after the first call, and this clear() would then empty our own
        // map on every later call - confirm against JsEnabledDocumentRetriever.
        jsEnabledDocumentRetriever.getWaitForElementMap().clear();
        jsEnabledDocumentRetriever.setWaitForElementMap(getWaitForElementMap());
        jsEnabledDocumentRetriever.setTimeoutSeconds(getTimeoutSeconds());
        jsEnabledDocumentRetriever.setWaitExceptionCallback(getWaitExceptionCallback());
        jsEnabledDocumentRetriever.setCookies(this.cookies);
    }

    /** Increments the success or failure counter of the given retriever; unknown names are ignored. */
    private void updateRequestTracker(String retrieverName, boolean success) {
        Integer[] tracker = this.requestTracker.get(retrieverName);
        if (tracker == null) {
            return;
        }
        int index = success ? TRACKER_SUCCESSFUL : TRACKER_FAILED;
        tracker[index] = tracker[index] + 1;
    }

    /**
     * Decides whether the given retriever should be used for the current request.
     *
     * @return {@code true} if no pause is configured, the retriever is not tracked, or its failure
     *         counter is below the threshold; otherwise {@code false} (counting the skip) until
     *         the configured number of requests has been skipped, at which point the failure and
     *         skip counters are reset and {@code true} is returned
     */
    private boolean shouldMakeRequest(String retrieverName) {
        Integer[] pauseConfig = this.failingThresholdAndNumberOfRequestsToSkip.get(retrieverName);
        if (pauseConfig == null) {
            return true;
        }
        Integer[] tracker = this.requestTracker.get(retrieverName);
        if (tracker == null) {
            return true;
        }
        if (tracker[TRACKER_FAILED] < pauseConfig[PAUSE_FAILING_THRESHOLD]) {
            return true;
        }
        if (tracker[TRACKER_SKIPPED] >= pauseConfig[PAUSE_REQUESTS_TO_SKIP]) {
            // Pause is over: start counting failures and skips from scratch.
            tracker[TRACKER_FAILED] = 0;
            tracker[TRACKER_SKIPPED] = 0;
            return true;
        }
        tracker[TRACKER_SKIPPED] = tracker[TRACKER_SKIPPED] + 1;
        return false;
    }

    /**
     * @param str retriever name, one of the {@code RETRIEVER_*} constants
     * @return the number of successful requests of that retriever, or {@code null} if the
     *         retriever is not tracked (it was not supplied to the constructor)
     */
    public Integer getSuccessfulRequestCount(String str) {
        Integer[] numArr = this.requestTracker.get(str);
        if (numArr == null || numArr.length <= TRACKER_SUCCESSFUL) {
            return null;
        }
        return numArr[TRACKER_SUCCESSFUL];
    }

    /**
     * A document is good if it is non-null, its readable text is non-empty and that text contains
     * none of the bad document indicator texts.
     */
    private boolean isGoodDocument(Document document) {
        if (document == null) {
            return false;
        }
        String readableText = HtmlHelper.documentToReadableText(document);
        if (StringHelper.containsAny(readableText, getBadDocumentIndicatorTexts())) {
            return false;
        }
        return !readableText.isEmpty();
    }

    /**
     * Sets the timeout and, if a plain retriever is present, also applies it (converted to
     * milliseconds) as the connection timeout of that retriever's HTTP retriever.
     */
    @Override // ws.palladian.retrieval.JsEnabledDocumentRetriever
    public void setTimeoutSeconds(int i) {
        super.setTimeoutSeconds(i);
        if (this.documentRetriever != null) {
            this.documentRetriever.getHttpRetriever().setConnectionTimeout((int) TimeUnit.SECONDS.toMillis(getTimeoutSeconds()));
        }
    }

    /**
     * Switches JS rendering on or off for the ProxyCrawl and ScrapingBee retrievers (where
     * present).
     *
     * @param z the new JS rendering setting
     * @return the previous setting of the ScrapingBee retriever if present, otherwise that of the
     *         ProxyCrawl retriever if present, otherwise {@code false}
     */
    public boolean renderJs(boolean z) {
        boolean previous = false;
        if (this.cloudDocumentRetriever2 != null) {
            previous = this.cloudDocumentRetriever2.isUseJsRendering();
            this.cloudDocumentRetriever2.setUseJsRendering(z);
        }
        if (this.cloudDocumentRetriever3 != null) {
            previous = this.cloudDocumentRetriever3.isUseJsRendering();
            this.cloudDocumentRetriever3.setUseJsRendering(z);
        }
        return previous;
    }
}
