package ws.palladian.extraction.content;

import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.extractors.ExtractorBase;
import de.l3s.boilerpipe.sax.BoilerpipeSAXInput;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import org.apache.commons.lang3.Validate;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import ws.palladian.helper.html.HtmlHelper;

/* loaded from: input_file:ws/palladian/extraction/content/BoilerpipeContentExtractor.class */
/**
 * {@link WebPageContentExtractor} that delegates main-content extraction to a Boilerpipe
 * {@link ExtractorBase}.
 *
 * <p>A document must be supplied through one of the {@code setDocument} overloads before
 * {@link #getResultText()} or {@link #getResultTitle()} is called; until then no
 * {@link TextDocument} is held and those getters throw a {@link NullPointerException}.
 */
public class BoilerpipeContentExtractor extends WebPageContentExtractor {
    /** Boilerpipe extractor applied to every document handed to this instance. */
    private final ExtractorBase extractor;

    /** Most recently parsed document; {@code null} until a {@code setDocument} call has parsed one. */
    private TextDocument textDocument;

    /** Creates an extractor backed by {@link ArticleExtractor#INSTANCE}. */
    public BoilerpipeContentExtractor() {
        this(ArticleExtractor.INSTANCE);
    }

    /**
     * Creates an extractor backed by the given Boilerpipe extractor.
     *
     * @param extractorBase the Boilerpipe extractor to use, must not be {@code null}
     */
    public BoilerpipeContentExtractor(ExtractorBase extractorBase) {
        Validate.notNull(extractorBase, "extractor must not be null");
        this.extractor = extractorBase;
    }

    /**
     * Reads the given file and runs the extractor on its content.
     *
     * @param file the file to read
     * @param z    not used by this implementation
     * @return this instance
     * @throws PageContentExtractorException if the file cannot be opened or closed, or if parsing
     *                                       or extraction fails
     */
    public WebPageContentExtractor setDocument(File file, boolean z) throws PageContentExtractorException {
        // try-with-resources: the stream is released even when parsing or extraction throws.
        try (FileInputStream stream = new FileInputStream(file)) {
            setDocument(new InputSource(stream));
            return this;
        } catch (IOException e) {
            // Covers FileNotFoundException from opening as well as failures while closing.
            throw new PageContentExtractorException(e);
        }
    }

    /**
     * Runs the extractor on the given DOM document; equivalent to
     * {@code setDocument(document, true)}.
     *
     * @param document the DOM document to process
     * @return this instance
     * @throws PageContentExtractorException if parsing or extraction fails
     */
    public WebPageContentExtractor setDocument(Document document) throws PageContentExtractorException {
        return setDocument(document, true);
    }

    /**
     * Serialises the given DOM document with {@link HtmlHelper#xmlToString} and runs the
     * extractor on the resulting markup.
     *
     * @param document the DOM document to process
     * @param z        not used by this implementation
     * @return this instance
     * @throws PageContentExtractorException if parsing or extraction fails
     */
    public WebPageContentExtractor setDocument(Document document, boolean z) throws PageContentExtractorException {
        String markup = HtmlHelper.xmlToString(document, false);
        setDocument(new InputSource(new StringReader(markup)));
        return this;
    }

    /**
     * Parses the input source into a Boilerpipe {@link TextDocument} and applies the configured
     * extractor to it. The parsed document is stored before the extractor runs.
     *
     * @param inputSource the SAX input source to parse
     * @return this instance
     * @throws PageContentExtractorException if SAX parsing or Boilerpipe processing fails
     */
    public BoilerpipeContentExtractor setDocument(InputSource inputSource) throws PageContentExtractorException {
        try {
            this.textDocument = new BoilerpipeSAXInput(inputSource).getTextDocument();
            this.extractor.process(this.textDocument);
            return this;
        } catch (SAXException | BoilerpipeProcessingException e) {
            throw new PageContentExtractorException(e);
        }
    }

    /**
     * Not supported by this extractor.
     *
     * @throws UnsupportedOperationException always
     */
    public Node getResultNode() {
        throw new UnsupportedOperationException();
    }

    /**
     * Returns the content of the current {@link TextDocument}.
     *
     * @return the value of {@link TextDocument#getContent()}
     * @throws NullPointerException if no document has been set yet
     */
    public String getResultText() {
        return this.textDocument.getContent();
    }

    /**
     * Returns the title of the current {@link TextDocument}.
     *
     * @return the value of {@link TextDocument#getTitle()}
     * @throws NullPointerException if no document has been set yet
     */
    public String getResultTitle() {
        return this.textDocument.getTitle();
    }

    /**
     * Returns a display name that includes the simple class name of the wrapped extractor.
     *
     * @return a name of the form {@code BoilerpipeContentExtractor(<ExtractorClass>)}
     */
    public String getExtractorName() {
        return "BoilerpipeContentExtractor(" + this.extractor.getClass().getSimpleName() + ")";
    }
}
