package ws.palladian.extraction.content;

import java.io.File;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.Locale;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import ws.palladian.core.Instance;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:ws/palladian/extraction/content/ReadabilityContentExtractor.class */
public class ReadabilityContentExtractor extends WebPageContentExtractor {
    /** Name of the DOM attribute in which an element's content score is stored. */
    private static final String READABILITY_ATTR = "readability";

    private static final Logger LOGGER = LoggerFactory.getLogger(ReadabilityContentExtractor.class);

    /** Class/id fragments that mark an element as an unlikely content candidate. */
    private static final Pattern UNLIKELY_CANDIDATES_RE = Pattern.compile(
            "combx|comment|community|disqus|extra|foot|header|legal|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup",
            Pattern.CASE_INSENSITIVE);

    /** Class/id fragments that rescue an element which otherwise matches {@link #UNLIKELY_CANDIDATES_RE}. */
    private static final Pattern OK_MAYBE_ITS_A_CANDIDATE_RE = Pattern.compile(
            "and|article|body|column|main|shadow",
            Pattern.CASE_INSENSITIVE);

    /** Class/id fragments that raise an element's weight. */
    private static final Pattern POSITIVE_RE = Pattern.compile(
            "article|body|content|entry|hentry|main|page|pagination|post|text|blog|story",
            Pattern.CASE_INSENSITIVE);

    /** Class/id fragments that lower an element's weight. */
    private static final Pattern NEGATIVE_RE = Pattern.compile(
            "combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget",
            Pattern.CASE_INSENSITIVE);

    /** Opening tags whose presence inside a div keeps it a div instead of turning it into a paragraph. */
    private static final Pattern DIV_TO_P_ELEMENTS_RE = Pattern.compile(
            "<(a|blockquote|dl|div|img|ol|p|pre|table|ul)",
            Pattern.CASE_INSENSITIVE);

    /** Runs of two or more whitespace characters; collapsed to a single space when normalizing text. */
    private static final Pattern NORMALIZE_RE = Pattern.compile("\\s{2,}");

    /** URLs of the video hosts that are recognized in embedded content. */
    private static final Pattern VIDEO_RE = Pattern.compile(
            "http:\\/\\/(www\\.)?(youtube|vimeo)\\.com",
            Pattern.CASE_INSENSITIVE);

    /** The document handed to {@code setDocument}; used for title extraction. */
    private Document document;

    /** The extracted article document produced by {@code init}. */
    private Document resultNode;

    /** Heuristic switch: take class/id names into account when weighting elements. */
    private boolean weightClasses;

    /** Heuristic switch: drop elements whose class/id look like non-content. */
    private boolean stripUnlikelyCandidates;

    /** Heuristic switch: conditionally remove forms, tables, lists and divs from the result. */
    private boolean cleanConditionally;

    /** When set, the processed working copy is written to a dump file during extraction. */
    private boolean writeDump = false;

    /**
     * Sets the document to extract content from.
     *
     * <p>Delegates to {@link #setDocument(Document, boolean)}, passing {@code true} as the flag.
     *
     * @param document the document to process
     * @return this extractor
     * @throws PageContentExtractorException if the delegate fails
     */
    @Override
    public WebPageContentExtractor setDocument(Document document) throws PageContentExtractorException {
        final boolean flag = true;
        return setDocument(document, flag);
    }

    /**
     * Sets the document to extract content from and runs the extraction immediately.
     *
     * <p>All three heuristic switches are re-enabled before extraction so that every call starts from
     * the same state; {@code init} switches them off one at a time when a result is too short.
     *
     * @param document the document to process
     * @param z not read by this implementation
     * @return this extractor
     * @throws PageContentExtractorException if the document cannot be cloned for processing
     */
    @Override
    public WebPageContentExtractor setDocument(Document document, boolean z) throws PageContentExtractorException {
        stripUnlikelyCandidates = true;
        weightClasses = true;
        cleanConditionally = true;
        this.document = document;
        resultNode = init(document);
        return this;
    }

    /**
     * Returns the extracted article.
     *
     * @return the result document of the last extraction; {@code null} if none was produced
     */
    @Override
    public Node getResultNode() {
        return resultNode;
    }

    /**
     * Returns the extracted article as text.
     *
     * @return the result node rendered through {@code HtmlHelper.documentToReadableText}
     */
    @Override
    public String getResultText() {
        Node result = getResultNode();
        return HtmlHelper.documentToReadableText(result);
    }

    /**
     * Returns the article title.
     *
     * @return the title derived from the document that was passed to {@code setDocument}
     */
    @Override
    public String getResultTitle() {
        String title = getArticleTitle(document);
        return title;
    }

    /**
     * Enables or disables writing a dump file of the processed document during extraction.
     *
     * @param z {@code true} to write dumps
     */
    public void setWriteDump(boolean z) {
        writeDump = z;
    }

    /**
     * Tells whether a dump file is written during extraction.
     *
     * @return the current value of the dump switch
     */
    public boolean isWriteDump() {
        return writeDump;
    }

    /**
     * Returns the display name of this extractor.
     *
     * @return the constant string {@code "Readability"}
     */
    @Override
    public String getExtractorName() {
        final String name = "Readability";
        return name;
    }

    /**
     * Runs the extraction on a clone of the given document and post-processes the result.
     *
     * <p>If no article is found, or its text is shorter than 250 characters, one heuristic is switched
     * off (first unlikely-candidate stripping, then class weighting, then conditional cleaning) and the
     * extraction is repeated recursively on a fresh clone. Once all three are off the short result is
     * accepted as is.
     *
     * @param document the original document; it is cloned and never modified here
     * @return the article document, or {@code null} if none could be built
     * @throws PageContentExtractorException if cloning the document fails
     */
    private Document init(Document document) throws PageContentExtractorException {
        Document workingCopy = HtmlHelper.cloneDocument(document);
        if (workingCopy == null) {
            throw new PageContentExtractorException("caching the original document failed.");
        }

        Document article = grabArticle(workingCopy);

        if (isWriteDump()) {
            String dumpPath = "dumps/pageContentExtractor" + System.currentTimeMillis() + ".xml";
            HtmlHelper.writeToFile(workingCopy, new File(dumpPath));
            LOGGER.info("wrote dump to {}", dumpPath);
        }

        boolean tooShort = article == null || getInnerText(article.getDocumentElement(), false).length() < 250;
        if (tooShort) {
            // Relax exactly one heuristic per attempt, in a fixed order.
            String retryMessage = null;
            if (stripUnlikelyCandidates) {
                stripUnlikelyCandidates = false;
                retryMessage = "re-running without stripping unlikely candidates";
            } else if (weightClasses) {
                weightClasses = false;
                retryMessage = "re-running without class weigths";
            } else if (cleanConditionally) {
                cleanConditionally = false;
                retryMessage = "re-running without conditional cleaning";
            }

            if (retryMessage == null) {
                LOGGER.debug("looks like I could not parse this page for content (result looks too short)");
            } else {
                LOGGER.debug(retryMessage);
                article = init(document);
            }
        }

        if (article == null) {
            return null;
        }

        // Paragraphs that were synthesized from bare text nodes are turned back into text nodes.
        // Iterate backwards because replacing a paragraph shrinks the node list.
        NodeList paragraphs = article.getElementsByTagName("p");
        int index = paragraphs.getLength();
        while (--index >= 0) {
            Element paragraph = (Element) paragraphs.item(index);
            if (paragraph.getAttribute("style").equals("display:inline")) {
                Node text = article.createTextNode(paragraph.getTextContent());
                paragraph.getParentNode().replaceChild(text, paragraph);
            }
        }

        // Strip the bookkeeping attributes from every element of the result.
        NodeList everything = article.getElementsByTagName("*");
        for (int pos = 0; pos < everything.getLength(); pos++) {
            Element current = (Element) everything.item(pos);
            current.removeAttribute("class");
            current.removeAttribute(READABILITY_ATTR);
        }
        return article;
    }

    /**
     * Derives a readable article title from the document.
     *
     * <p>The text of the single {@code <title>} element is used as the starting point (if there is not
     * exactly one, {@code Instance.NO_CATEGORY_DUMMY} is used). It is then shortened:
     * <ul>
     * <li>if it contains {@code " | "} or {@code " - "}, the part before the last separator is taken,
     * or the part after the first separator when that leaves fewer than three words;</li>
     * <li>otherwise, if it contains {@code ": "}, the part after the last colon is taken, or the part
     * after the first colon when that leaves fewer than three words;</li>
     * <li>otherwise, if it is longer than 150 or shorter than 15 characters and the document has
     * exactly one {@code <h1>}, that heading's text is taken.</li>
     * </ul>
     * If the shortened title has four words or fewer, the unshortened title text is returned instead.
     *
     * @param document the document to read the title from
     * @return the derived title
     */
    private String getArticleTitle(Document document) {
        String currentTitle = Instance.NO_CATEGORY_DUMMY;
        String originalTitle = Instance.NO_CATEGORY_DUMMY;
        NodeList titleElements = document.getElementsByTagName("title");
        if (titleElements.getLength() == 1) {
            originalTitle = getInnerText((Element) titleElements.item(0));
            currentTitle = originalTitle;
        }

        if (Pattern.compile(" [\\|\\-] ").matcher(currentTitle).find()) {
            currentTitle = originalTitle.replaceAll("(.*)[\\|\\-] .*", "$1");
            if (currentTitle.split(" ").length < 3) {
                currentTitle = originalTitle.replaceAll("[^\\|\\-]*[\\|\\-](.*)", "$1");
            }
        } else if (currentTitle.indexOf(": ") != -1) {
            // The pattern previously required a literal ":/" and therefore never matched a
            // "Site: Headline" style title, leaving the site prefix in place.
            currentTitle = originalTitle.replaceAll(".*:(.*)", "$1");
            if (currentTitle.split(" ").length < 3) {
                currentTitle = originalTitle.replaceAll("[^:]*[:](.*)", "$1");
            }
        } else if (currentTitle.length() > 150 || currentTitle.length() < 15) {
            NodeList headings = document.getElementsByTagName("h1");
            if (headings.getLength() == 1) {
                currentTitle = getInnerText((Element) headings.item(0));
            }
        }

        currentTitle = currentTitle.trim();
        // A very short result is most likely an over-aggressive cut; fall back to the full title.
        if (currentTitle.split(" ").length <= 4) {
            currentTitle = originalTitle;
        }
        return currentTitle;
    }

    /**
     * Prepares the working document for scoring.
     *
     * <p>Asks {@code HtmlHelper.removeAll} to remove {@code script} and {@code style} element nodes and
     * comment nodes, then strips {@code style} attributes starting at the document element.
     *
     * @param document the working copy; it is modified in place
     */
    private void prepDocument(Document document) {
        for (String tagName : new String[] {"script", "style"}) {
            HtmlHelper.removeAll(document, Node.ELEMENT_NODE, tagName);
        }
        HtmlHelper.removeAll(document, Node.COMMENT_NODE);
        cleanStyles(document.getDocumentElement());
    }

    /**
     * Cleans up the assembled article content.
     *
     * <p>Removes forms (conditionally), objects, h1 headings, noscript blocks, a lone h2, iframes and
     * suspicious headers, conditionally removes tables, lists and divs, and finally drops paragraphs
     * that contain neither text nor an img/embed/object element.
     *
     * @param element the root of the article content; it is modified in place
     */
    private void prepArticle(Element element) {
        cleanConditionally(element, "form");
        for (String tagName : new String[] {"object", "h1", "noscript"}) {
            clean(element, tagName);
        }
        // A single h2 is assumed to repeat the article title, so it is dropped.
        boolean singleH2 = element.getElementsByTagName("h2").getLength() == 1;
        if (singleH2) {
            clean(element, "h2");
        }
        clean(element, "iframe");
        cleanHeaders(element);
        for (String tagName : new String[] {"table", "ul", "div"}) {
            cleanConditionally(element, tagName);
        }

        // Walk backwards: removing a paragraph shrinks the node list.
        NodeList paragraphs = element.getElementsByTagName("p");
        int index = paragraphs.getLength();
        while (--index >= 0) {
            Element paragraph = (Element) paragraphs.item(index);
            boolean hasMedia = paragraph.getElementsByTagName("img").getLength() != 0
                    || paragraph.getElementsByTagName("embed").getLength() != 0
                    || paragraph.getElementsByTagName("object").getLength() != 0;
            if (!hasMedia && getInnerText(paragraph, false).length() == 0) {
                paragraph.getParentNode().removeChild(paragraph);
            }
        }
    }

    /**
     * Gives an element its initial content score.
     *
     * <p>The score is a tag-dependent base value (+5 for div; +3 for pre, td, blockquote; -3 for
     * address, lists, list items and forms; -5 for headings and th; 0 otherwise) plus the class/id
     * weight of the element. The result is stored on the element via {@code setReadability}.
     *
     * @param element the element to initialize
     */
    private void initializeNode(Element element) {
        // Locale.ROOT keeps the comparison independent of the default locale; with e.g. a Turkish
        // default locale "DIV".toLowerCase() would not equal "div".
        String tagName = element.getTagName().toLowerCase(Locale.ROOT);
        int baseScore = 0;
        if (tagName.equals("div")) {
            baseScore = 5;
        } else if (Arrays.asList("pre", "td", "blockquote").contains(tagName)) {
            baseScore = 3;
        } else if (Arrays.asList("address", "ol", "ul", "dl", "dd", "dt", "li", "form").contains(tagName)) {
            baseScore = -3;
        } else if (Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6", "th").contains(tagName)) {
            baseScore = -5;
        }
        setReadability(element, baseScore + getClassIdWeight(element));
    }

    /**
     * Finds the main content of the document and returns it as a new document.
     *
     * <p>Steps:
     * <ol>
     * <li>Prepare the document, then walk all elements: optionally remove unlikely candidates, collect
     * {@code p} and {@code td} elements for scoring, wrap bare text inside block-containing divs in
     * inline paragraphs, and rename divs without block children to {@code p}.</li>
     * <li>Score every collected element with at least 25 characters of text and add the score to its
     * parent (fully) and grandparent (half).</li>
     * <li>Scale each candidate's score by {@code 1 - linkDensity} and pick the highest; fall back to
     * the body (renamed to div) if there is no candidate.</li>
     * <li>Copy the top candidate and qualifying siblings into a fresh html/body document and clean it
     * with {@code prepArticle}.</li>
     * </ol>
     *
     * @param document the working copy to analyse; it is modified in place
     * @return the article document, or {@code null} if there is no candidate and no body element
     */
    private Document grabArticle(Document document) {
        prepDocument(document);

        LinkedList<Element> nodesToScore = new LinkedList<Element>();
        // getElementsByTagName returns a live list: structural changes below are reflected in it.
        NodeList allElements = document.getElementsByTagName("*");
        int index = 0;
        while (index < allElements.getLength()) {
            Element node = (Element) allElements.item(index);

            if (this.stripUnlikelyCandidates) {
                String unlikelyMatch = node.getAttribute("class") + node.getAttribute("id");
                if (UNLIKELY_CANDIDATES_RE.matcher(unlikelyMatch).find()
                        && !OK_MAYBE_ITS_A_CANDIDATE_RE.matcher(unlikelyMatch).find()
                        && !node.getTagName().equalsIgnoreCase("body")) {
                    LOGGER.debug("Removing unlikely candidate - {}", unlikelyMatch);
                    node.getParentNode().removeChild(node);
                    // The removed subtree has left the live list, so the same index now addresses
                    // the next element. Do not advance, and do not process the removed node further.
                    continue;
                }
            }

            String tagName = node.getTagName();
            if (tagName.equalsIgnoreCase("p") || tagName.equalsIgnoreCase("td")) {
                nodesToScore.add(node);
            }

            if (tagName.equalsIgnoreCase("div")) {
                if (DIV_TO_P_ELEMENTS_RE.matcher(HtmlHelper.getInnerXml(node)).find()) {
                    // The div has block-level children: wrap its non-blank text nodes in paragraphs
                    // so that they take part in scoring. The new paragraphs show up later in the
                    // live list and are collected there.
                    for (int childIndex = 0; childIndex < node.getChildNodes().getLength(); childIndex++) {
                        Node child = node.getChildNodes().item(childIndex);
                        if (child.getNodeType() == Node.TEXT_NODE && child.getTextContent().trim().length() > 0) {
                            LOGGER.debug("replacing text node with a p tag with the same content.");
                            Element paragraph = document.createElement("p");
                            paragraph.setAttribute("style", "display:inline");
                            paragraph.setTextContent(child.getTextContent());
                            child.getParentNode().replaceChild(paragraph, child);
                        }
                    }
                } else {
                    LOGGER.debug("Altering div to p");
                    // renameNode may rename in place or return a replacement node; use its result.
                    // The element is collected exactly once; it is not revisited, which previously
                    // added it a second time and doubled its contribution to the scores.
                    Element renamed = (Element) document.renameNode(node, node.getNamespaceURI(), "p");
                    nodesToScore.add(renamed);
                }
            }
            index++;
        }

        // Score the collected elements and credit their parents and grandparents.
        LinkedList<Element> candidates = new LinkedList<Element>();
        for (Element node : nodesToScore) {
            Node parent = node.getParentNode();
            if (parent == null) {
                continue;
            }
            Node grandParent = parent.getParentNode();
            String innerText = getInnerText(node);
            if (innerText.length() < 25) {
                continue;
            }

            // One base point, one point per comma-separated segment, and up to three points for
            // every full 100 characters of text.
            int contentScore = (int) (1 + innerText.split(",").length
                    + Math.min(Math.floor(innerText.length() / 100.0f), 3.0d));

            if (parent.getNodeType() == Node.ELEMENT_NODE) {
                Element parentElement = (Element) parent;
                if (!hasReadability(parentElement)) {
                    initializeNode(parentElement);
                    candidates.add(parentElement);
                }
                setReadability(parentElement, getReadability(parentElement) + contentScore);
            }
            if (grandParent != null && grandParent.getNodeType() == Node.ELEMENT_NODE) {
                Element grandParentElement = (Element) grandParent;
                if (!hasReadability(grandParentElement)) {
                    initializeNode(grandParentElement);
                    candidates.add(grandParentElement);
                }
                setReadability(grandParentElement, getReadability(grandParentElement) + (contentScore / 2.0f));
            }
        }

        // Penalize link-heavy candidates and pick the best one.
        Element topCandidate = null;
        for (Element candidate : candidates) {
            float score = getReadability(candidate) * (1.0f - getLinkDensity(candidate));
            setReadability(candidate, score);
            LOGGER.debug("Candidate: {} ({}:{}) with score {}", new Object[] {candidate,
                    candidate.getAttribute("class"), candidate.getAttribute("id"), Float.valueOf(score)});
            if (topCandidate == null || score > getReadability(topCandidate)) {
                topCandidate = candidate;
            }
        }

        if (topCandidate == null) {
            LOGGER.debug("No top candidate found, using the body");
            NodeList bodies = document.getElementsByTagName("body");
            if (bodies.getLength() <= 0) {
                return null;
            }
            topCandidate = (Element) bodies.item(0);
            document.renameNode(topCandidate, topCandidate.getNamespaceURI(), "div");
        }

        // Assemble the result document: <html><body>…selected nodes…</body></html>.
        Document articleDocument = HtmlHelper.createDocument();
        Element html = articleDocument.createElementNS("http://www.w3.org/1999/xhtml", "html");
        articleDocument.appendChild(html);
        Element articleBody = articleDocument.createElement("body");
        html.appendChild(articleBody);

        float siblingScoreThreshold = Math.max(10.0f, getReadability(topCandidate) * 0.2f);
        Pattern sentenceEnd = Pattern.compile("\\.( |$)");
        NodeList siblings = topCandidate.getParentNode().getChildNodes();
        for (int siblingIndex = 0; siblingIndex < siblings.getLength(); siblingIndex++) {
            if (siblings.item(siblingIndex).getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element sibling = (Element) siblings.item(siblingIndex);
            LOGGER.debug("Looking at sibling node: {} ({}:{}) {}", new Object[] {sibling,
                    sibling.getAttribute("class"), sibling.getAttribute("id"),
                    hasReadability(sibling) ? " with score " + getReadability(sibling) : Instance.NO_CATEGORY_DUMMY});

            boolean append = sibling == topCandidate;

            // Siblings that share the top candidate's class get a bonus of 20% of its score.
            int contentBonus = 0;
            if (topCandidate.getAttribute("class").length() > 0
                    && sibling.getAttribute("class").equals(topCandidate.getAttribute("class"))) {
                contentBonus = (int) (getReadability(topCandidate) * 0.2d);
            }
            if (hasReadability(sibling) && getReadability(sibling) + contentBonus >= siblingScoreThreshold) {
                append = true;
            }

            if (sibling.getNodeName().equalsIgnoreCase("p")) {
                float linkDensity = getLinkDensity(sibling);
                String siblingText = getInnerText(sibling);
                int siblingLength = siblingText.length();
                if (siblingLength > 80 && linkDensity < 0.25d) {
                    append = true;
                } else if (siblingLength < 80 && linkDensity == 0.0f && sentenceEnd.matcher(siblingText).find()) {
                    append = true;
                }
            }

            if (append) {
                LOGGER.debug("Appending node: {}", sibling);
                Element nodeToAppend;
                if (sibling.getNodeName().equalsIgnoreCase("div") || sibling.getNodeName().equalsIgnoreCase("p")) {
                    nodeToAppend = sibling;
                } else {
                    LOGGER.debug("Altering siblingNode of {} to div.", sibling.getNodeName());
                    nodeToAppend = (Element) document.renameNode(sibling, sibling.getNamespaceURI(), "div");
                }
                articleBody.appendChild(articleDocument.importNode(nodeToAppend, true));
            }
        }

        prepArticle(articleBody);
        return articleDocument;
    }

    /**
     * Returns the trimmed text content of an element with whitespace runs collapsed.
     *
     * @param element the element to read
     * @return the normalized text
     */
    private String getInnerText(Element element) {
        final boolean normalizeSpaces = true;
        return getInnerText(element, normalizeSpaces);
    }

    /**
     * Returns the trimmed text content of an element.
     *
     * @param element the element to read
     * @param z if {@code true}, every run of two or more whitespace characters is replaced by a
     *        single space
     * @return the trimmed, optionally normalized text
     */
    private String getInnerText(Element element, boolean z) {
        String text = element.getTextContent().trim();
        return z ? NORMALIZE_RE.matcher(text).replaceAll(" ") : text;
    }

    /**
     * Recursively removes the {@code style} attribute from an element and its descendant elements.
     *
     * <p>An element without any child node is left untouched by its own call; its attribute is only
     * removed when it is reached as a child of another element.
     *
     * @param element the root of the subtree to clean
     */
    private void cleanStyles(Element element) {
        Node child = element.getFirstChild();
        if (child == null) {
            return;
        }
        element.removeAttribute("style");
        for (; child != null; child = child.getNextSibling()) {
            if (child.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element childElement = (Element) child;
            childElement.removeAttribute("style");
            cleanStyles(childElement);
        }
    }

    /**
     * Computes the share of an element's text that sits inside {@code a} elements.
     *
     * @param element the element to inspect
     * @return the length of all link text divided by the length of the element's text, or
     *         {@code 0} if the element has no text
     */
    private float getLinkDensity(Element element) {
        int textLength = getInnerText(element).length();
        if (textLength == 0) {
            return 0.0f;
        }
        NodeList links = element.getElementsByTagName("a");
        int linkLength = 0;
        for (int index = 0; index < links.getLength(); index++) {
            linkLength += getInnerText((Element) links.item(index)).length();
        }
        // Divide as float: integer division truncated every ratio below 1 to 0, which disabled
        // all link-density thresholds used by the callers.
        return (float) linkLength / textLength;
    }

    /**
     * Computes a weight from an element's {@code class} and {@code id} attributes.
     *
     * <p>For each of the two attributes that is present, 25 is subtracted if its value matches
     * {@code NEGATIVE_RE} and 25 is added if it matches {@code POSITIVE_RE}.
     *
     * @param element the element to inspect
     * @return the weight; always {@code 0} when class weighting is switched off
     */
    private int getClassIdWeight(Element element) {
        if (!this.weightClasses) {
            return 0;
        }
        int weight = 0;
        for (String attributeName : new String[] {"class", "id"}) {
            if (!element.hasAttribute(attributeName)) {
                continue;
            }
            String value = element.getAttribute(attributeName);
            if (NEGATIVE_RE.matcher(value).find()) {
                weight -= 25;
            }
            if (POSITIVE_RE.matcher(value).find()) {
                weight += 25;
            }
        }
        return weight;
    }

    /**
     * Removes all descendant elements with the given tag name.
     *
     * <p>For {@code object} and {@code embed} elements, those whose attribute values or text content
     * match {@code VIDEO_RE} are kept.
     *
     * @param element the root below which elements are removed
     * @param str the tag name of the elements to remove
     */
    private void clean(Element element, String str) {
        NodeList targets = element.getElementsByTagName(str);
        boolean isEmbed = str.equalsIgnoreCase("object") || str.equalsIgnoreCase("embed");
        // Walk backwards: removing a node shrinks the node list.
        for (int index = targets.getLength() - 1; index >= 0; index--) {
            Node target = targets.item(index);
            if (isEmbed) {
                StringBuilder attributeValues = new StringBuilder();
                int attributeCount = target.getAttributes().getLength();
                for (int attr = 0; attr < attributeCount; attr++) {
                    attributeValues.append(target.getAttributes().item(attr).getTextContent()).append('|');
                }
                // Keep recognized videos. Previously the result of these checks was discarded
                // and the element was removed regardless.
                if (VIDEO_RE.matcher(attributeValues).find() || VIDEO_RE.matcher(target.getTextContent()).find()) {
                    continue;
                }
            }
            target.getParentNode().removeChild(target);
        }
    }

    /**
     * Removes descendant elements with the given tag name when they look like non-content.
     *
     * <p>Does nothing when conditional cleaning is switched off. An element is removed if its class/id
     * weight plus its stored score is negative. Otherwise, if its text has fewer than ten commas, it is
     * removed when any of the following holds: more images than paragraphs; more than
     * {@code paragraphs + 100} list items (unless the tag is ul/ol); more inputs than a third of the
     * paragraphs; under 25 characters of text with no image or more than two; link density above 0.2
     * (weight below 25) or above 0.5 (weight of at least 25); one video embed with under 75 characters
     * of text, or more than one video embed.
     *
     * @param element the root below which elements are examined
     * @param str the tag name of the elements to examine
     */
    private void cleanConditionally(Element element, String str) {
        if (!this.cleanConditionally) {
            return;
        }
        NodeList tags = element.getElementsByTagName(str);
        // Walk backwards: removing a node shrinks the node list.
        for (int index = tags.getLength() - 1; index >= 0; index--) {
            Element node = (Element) tags.item(index);
            int weight = getClassIdWeight(node);
            float contentScore = getReadability(node);

            String scoreInfo = hasReadability(node) ? " with score " + getReadability(node) : Instance.NO_CATEGORY_DUMMY;
            LOGGER.debug("Cleaning Conditionally {} ({}:{}) {}",
                    new Object[] {node, node.getAttribute("class"), node.getAttribute("id"), scoreInfo});

            if (weight + contentScore < 0.0f) {
                node.getParentNode().removeChild(node);
                continue;
            }
            // Text with many commas is taken to be real prose and is left alone.
            if (StringHelper.countOccurrences(node.getTextContent(), ",") >= 10) {
                continue;
            }

            int paragraphCount = node.getElementsByTagName("p").getLength();
            int imageCount = node.getElementsByTagName("img").getLength();
            int listItemCount = node.getElementsByTagName("li").getLength() - 100;
            int inputCount = node.getElementsByTagName("input").getLength();

            int videoEmbedCount = 0;
            NodeList embeds = node.getElementsByTagName("embed");
            for (int embedIndex = 0; embedIndex < embeds.getLength(); embedIndex++) {
                Element embed = (Element) embeds.item(embedIndex);
                if (VIDEO_RE.matcher(embed.getAttribute("src")).find()) {
                    videoEmbedCount++;
                }
            }

            float linkDensity = getLinkDensity(node);
            int contentLength = getInnerText(node).length();
            boolean isList = str.equalsIgnoreCase("ul") || str.equalsIgnoreCase("ol");

            boolean remove = imageCount > paragraphCount
                    || (listItemCount > paragraphCount && !isList)
                    || inputCount > Math.floor(paragraphCount / 3.0d)
                    || (contentLength < 25 && (imageCount == 0 || imageCount > 2))
                    || (weight < 25 && linkDensity > 0.2d)
                    || (weight >= 25 && linkDensity > 0.5d)
                    || (videoEmbedCount == 1 && contentLength < 75)
                    || videoEmbedCount > 1;

            if (remove) {
                node.getParentNode().removeChild(node);
            }
        }
    }

    /**
     * Removes h1 to h6 headings that have a negative class/id weight or a link density above 0.33.
     *
     * @param element the root below which headings are examined
     */
    private void cleanHeaders(Element element) {
        for (int level = 1; level <= 6; level++) {
            NodeList headers = element.getElementsByTagName("h" + level);
            // Walk backwards: removing a node shrinks the node list.
            int index = headers.getLength();
            while (--index >= 0) {
                Element header = (Element) headers.item(index);
                boolean suspicious = getClassIdWeight(header) < 0 || getLinkDensity(header) > 0.33d;
                if (suspicious) {
                    header.getParentNode().removeChild(header);
                }
            }
        }
    }

    /**
     * Stores a content score on an element as the string value of its readability attribute.
     *
     * @param element the element to annotate
     * @param f the score to store
     */
    private void setReadability(Element element, float f) {
        String encoded = String.valueOf(f);
        element.setAttribute(READABILITY_ATTR, encoded);
    }

    /**
     * Tells whether a content score has been stored on an element.
     *
     * @param element the element to inspect
     * @return {@code true} if the readability attribute is present
     */
    private boolean hasReadability(Element element) {
        boolean scored = element.hasAttribute(READABILITY_ATTR);
        return scored;
    }

    /**
     * Reads the content score stored on an element.
     *
     * @param element the element to inspect
     * @return the parsed score, or {@code 0} if the element carries no readability attribute
     */
    private float getReadability(Element element) {
        return hasReadability(element) ? Float.parseFloat(element.getAttribute(READABILITY_ATTR)) : 0.0f;
    }

    /**
     * Demonstrates the intended call sequence: set a document location, then query text and title.
     * The results are discarded.
     *
     * @throws Exception if setting the document fails
     */
    public static void usageExample() throws Exception {
        ReadabilityContentExtractor extractor = new ReadabilityContentExtractor();
        final String location = "http://www.wired.com/gadgetlab/2010/05/iphone-4g-ads/";
        extractor.setDocument(location, true);
        extractor.getResultText();
        extractor.getResultTitle();
    }
}
