package ws.palladian.extraction.content;

import java.util.Arrays;
import org.apache.commons.lang3.Validate;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import ws.palladian.core.Instance;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.HttpException;
import ws.palladian.retrieval.HttpResult;
import ws.palladian.retrieval.HttpRetriever;
import ws.palladian.retrieval.HttpRetrieverFactory;
import ws.palladian.retrieval.parser.DocumentParser;
import ws.palladian.retrieval.parser.ParserException;
import ws.palladian.retrieval.parser.ParserFactory;

/* loaded from: input_file:ws/palladian/extraction/content/ReadItLaterContentExtractor.class */
/**
 * {@link WebPageContentExtractor} that delegates extraction to the remote text service at
 * {@code text.readitlaterlist.com}: the URI of the supplied document is sent to the service and the
 * response is kept both as raw text and as a parsed document.
 *
 * <p>The result of the most recent successful {@code setDocument} call is held in plain mutable
 * fields without any synchronization, so a single instance should not be shared between threads
 * without external coordination.
 */
public class ReadItLaterContentExtractor extends WebPageContentExtractor {
    /** Request template; the placeholders are the API key and the encoded page URL, in that order. */
    private static final String API_URL_FORMAT = "http://text.readitlaterlist.com/v2/text?apikey=%s&url=%s";

    /** Heading elements probed for a title, in the order they are tried. */
    private static final String[] HEADING_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6"};

    private final String apiKey;
    private final HttpRetriever httpRetriever;
    private final DocumentParser htmlParser;

    /** Raw response body of the last successful extraction; {@code null} until then. */
    private String extractedResult;

    /** Parsed response of the last successful extraction; {@code null} until then. */
    private Document extractedDocument;

    /**
     * Creates an extractor that authenticates with the given key.
     *
     * @param str the API key sent with every request; must not be {@code null} or empty
     */
    public ReadItLaterContentExtractor(String str) {
        Validate.notEmpty(str, "apiKey must not be empty");
        this.apiKey = str;
        this.httpRetriever = HttpRetrieverFactory.getHttpRetriever();
        this.htmlParser = ParserFactory.createHtmlParser();
    }

    /**
     * Equivalent to {@code setDocument(document, true)}.
     *
     * @param document the document whose URI is submitted to the service; must not be {@code null}
     * @return this extractor
     * @throws PageContentExtractorException if the request or the parsing of the response fails
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public WebPageContentExtractor setDocument(Document document) throws PageContentExtractorException {
        return setDocument(document, true);
    }

    /**
     * Requests the extracted content for the document's URI and stores the response. Only the URI of
     * {@code document} is used; its DOM content is not read.
     *
     * @param document the document whose URI is submitted to the service; must not be {@code null}
     * @param z not used by this implementation
     * @return this extractor
     * @throws NullPointerException if {@code document} is {@code null}
     * @throws PageContentExtractorException if the HTTP request fails or the response cannot be parsed;
     *         in that case the previously stored result is left untouched
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public WebPageContentExtractor setDocument(Document document, boolean z) throws PageContentExtractorException {
        Validate.notNull(document, "document must not be null");
        final String documentURI = document.getDocumentURI();
        final String responseText;
        final Document responseDocument;
        try {
            String requestUrl = String.format(API_URL_FORMAT, this.apiKey, UrlHelper.encodeParameter(documentURI));
            HttpResult response = this.httpRetriever.httpGet(requestUrl);
            responseText = response.getStringContent();
            try {
                responseDocument = this.htmlParser.parse(response);
            } catch (ParserException e) {
                throw new PageContentExtractorException("Error when parsing the result HTML for URL \"" + documentURI + "\": " + e.getMessage(), e);
            }
        } catch (HttpException e2) {
            throw new PageContentExtractorException("Error when contacting API for URL \"" + documentURI + "\": " + e2.getMessage(), e2);
        }
        // Commit both fields together, and only after parsing succeeded, so that the raw text and the
        // parsed document always describe the same response.
        this.extractedResult = responseText;
        this.extractedDocument = responseDocument;
        return this;
    }

    /**
     * @return the parsed response document, or {@code null} if no document has been set successfully
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public Node getResultNode() {
        return this.extractedDocument;
    }

    /**
     * @return the parsed response document converted by {@link HtmlHelper#documentToReadableText}
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getResultText() {
        return HtmlHelper.documentToReadableText(this.extractedDocument);
    }

    /**
     * Derives a title from the raw response by probing {@code h1} through {@code h6} in order and
     * taking the first non-empty match.
     *
     * @return the first heading match with HTML tags stripped; if no heading matches, the (empty)
     *         last match is passed through the same stripping step
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getResultTitle() {
        // The initial value is never observed: HEADING_TAGS is non-empty, so the first iteration
        // always overwrites it.
        String heading = "";
        for (String tag : HEADING_TAGS) {
            String pattern = String.format("<%s.*?>(.*?)</%s>", tag, tag);
            heading = StringHelper.getRegexpMatch(pattern, this.extractedResult, true, false);
            if (!heading.isEmpty()) {
                break;
            }
        }
        return HtmlHelper.stripHtmlTags(heading);
    }

    /**
     * @return the fixed name {@code "ReadItLater"}
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getExtractorName() {
        return "ReadItLater";
    }

    /**
     * Manual demo: extracts one page and prints its title and text.
     *
     * @param strArr optional overrides: {@code [0]} API key, {@code [1]} page URL; missing or empty
     *        entries fall back to the built-in demo values
     */
    public static void main(String[] strArr) {
        // NOTE(review): an API key is embedded in source here. Prefer passing the key as the first
        // argument; consider removing the literal and rotating the key.
        String demoKey = argOrDefault(strArr, 0, "a62g2W68p36ema12fvTc410Td1A1Na62");
        String pageUrl = argOrDefault(strArr, 1, "http://www.bbc.co.uk/news/world-asia-17116595");
        ReadItLaterContentExtractor extractor = new ReadItLaterContentExtractor(demoKey);
        String resultText = extractor.getResultText(pageUrl);
        System.out.println("title: " + extractor.getResultTitle());
        System.out.println("text: " + resultText);
    }

    /** Returns {@code args[index]} when it is present and non-empty, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        if (args == null || index >= args.length) {
            return fallback;
        }
        String value = args[index];
        return (value == null || value.isEmpty()) ? fallback : value;
    }
}
