package ws.palladian.extraction.content;

import java.awt.image.BufferedImage;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import ws.palladian.core.Instance;
import ws.palladian.extraction.date.PageDateType;
import ws.palladian.extraction.date.WebPageDateEvaluator;
import ws.palladian.extraction.multimedia.ImageHandler;
import ws.palladian.extraction.token.Tokenizer;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.constants.Language;
import ws.palladian.helper.date.ExtractedDate;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.DocumentRetriever;
import ws.palladian.retrieval.ImageSizeComparator;
import ws.palladian.retrieval.PageAnalyzer;
import ws.palladian.retrieval.XPathSet;
import ws.palladian.retrieval.parser.json.JsonArray;
import ws.palladian.retrieval.parser.json.JsonException;
import ws.palladian.retrieval.resources.BasicWebImage;
import ws.palladian.retrieval.resources.WebImage;

/* loaded from: input_file:ws/palladian/extraction/content/PalladianContentExtractor.class */
public class PalladianContentExtractor extends WebPageContentExtractor {
    /** Class-wide SLF4J logger. */
    private static final Logger LOGGER = LoggerFactory.getLogger(PalladianContentExtractor.class);
    /** Attribute values (class / id / itemprop) that hint at the main content node; populated in the static initializer. */
    private static final List<String> MAIN_NODE_HINTS = new ArrayList();
    /** The document currently being processed. */
    private Document document;
    /** The node selected as main content by the last parse run; {@code null} until one was found. */
    private Node resultNode;
    /** The node found via {@link #MAIN_NODE_HINTS}, if any; preferred as the starting point for the image search. */
    private Node outerResultNode;
    /** Sentences tokenized from the cleaned document text. */
    private List<String> sentences = new ArrayList();
    /** Readable text of every comment section that was stripped from the document. */
    private List<String> comments = new ArrayList();
    // NOTE(review): Instance.NO_CATEGORY_DUMMY is presumably the empty string (decompiler constant substitution) - confirm.
    /** Serialized markup of the result node. */
    private String mainContentHtml = Instance.NO_CATEGORY_DUMMY;
    /** Text of the extracted main content. */
    private String mainContentText = Instance.NO_CATEGORY_DUMMY;
    /** Text of the whole document, captured before the DOM is cleaned. */
    private String fullTextContent = Instance.NO_CATEGORY_DUMMY;
    /** Presumably the reference size (in pixels) against which percentage image dimensions are resolved - confirm. */
    private static final int DEFAULT_IMAGE_CONTAINER_SIZE = 500;
    /** Cache of the images found by the image search; {@code null} means "not computed yet". */
    private List<WebImage> imageUrls;

    /**
     * Installs the given document, resets all previously extracted state and runs the extraction right away.
     *
     * @param document the document to analyze
     * @return this extractor, to allow call chaining
     * @throws PageContentExtractorException declared for the extraction step
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public PalladianContentExtractor setDocument(Document document) throws PageContentExtractorException {
        // Reset first, then analyze the freshly installed document.
        setDocumentOnly(document);
        parseDocument();
        return this;
    }

    /**
     * Installs the given document and clears every cached extraction result without parsing anything.
     *
     * @param document the document to install
     * @return this extractor, to allow call chaining
     * @throws PageContentExtractorException declared by the signature; nothing in this method throws it
     */
    public PalladianContentExtractor setDocumentOnly(Document document) throws PageContentExtractorException {
        this.document = document;

        // Drop node references and the image cache of the previous document.
        resultNode = null;
        outerResultNode = null;
        imageUrls = null;

        // Fresh, empty containers for the textual results.
        sentences = new ArrayList();
        comments = new ArrayList();
        mainContentText = Instance.NO_CATEGORY_DUMMY;
        mainContentHtml = Instance.NO_CATEGORY_DUMMY;
        fullTextContent = Instance.NO_CATEGORY_DUMMY;
        return this;
    }

    /**
     * @return the document this extractor currently works on (may be {@code null} if none was set)
     */
    public Document getDocument() {
        return document;
    }

    /**
     * @return the sentences tokenized from the cleaned document; the internal list is returned, not a copy
     */
    public List<String> getSentences() {
        return sentences;
    }

    /**
     * @return the readable text of each comment section removed from the document; the internal list is returned, not a copy
     */
    public List<String> getComments() {
        return comments;
    }

    /**
     * Normalizes an XPath before it is evaluated: text() steps and the html/body prefix are stripped, triple
     * slashes are collapsed, an empty result falls back to {@code //body} and a trailing {@code //} is cut off.
     *
     * @param str the raw XPath
     * @return the normalized XPath
     */
    private String cleanXPath(String str) {
        String cleaned = str.replaceAll("/text(\\[.*?\\])?", "/");
        cleaned = cleaned.replace("html/body", Instance.NO_CATEGORY_DUMMY);
        cleaned = cleaned.replace("xhtml:html/xhtml:body", Instance.NO_CATEGORY_DUMMY);
        cleaned = cleaned.replace("///", "//");

        // Nothing meaningful left: address the body element instead.
        if (cleaned.isEmpty() || "//".equals(cleaned)) {
            cleaned = "//body";
        }

        // A dangling descendant axis at the end is dropped.
        return cleaned.endsWith("//") ? cleaned.substring(0, cleaned.length() - 2) : cleaned;
    }

    /** Matches each line of at most 40 characters (blank lines included); MULTILINE makes ^ and $ anchor per line. */
    private static final Pattern SHORT_LINE_PATTERN = Pattern.compile("^.{0,40}$", Pattern.MULTILINE);

    /**
     * Returns the text of the whole document after some cosmetic cleanup: tabs are removed, lines of at most
     * 40 characters are blanked out and runs of blank lines are collapsed to a single empty line.
     * <p>
     * The cleaned text is written back to the internal field, so later calls operate on the already cleaned text.
     *
     * @return the cleaned full text of the document
     */
    public String getEntireTextContent() {
        String text = this.fullTextContent.replaceAll("(\t)+", Instance.NO_CATEGORY_DUMMY);
        text = SHORT_LINE_PATTERN.matcher(text).replaceAll("\n");
        // Whitespace-only gaps first, then any remaining run of newlines, shrink to one blank line.
        text = text.replaceAll("\n(\\s)+\n", "\n\n");
        text = text.replaceAll("(\n){2,}", "\n\n");
        this.fullTextContent = text;
        return text;
    }

    /**
     * Locates the main content of the current document and fills {@code resultNode}, {@code outerResultNode},
     * {@code fullTextContent}, {@code sentences}, {@code mainContentHtml} and {@code mainContentText}.
     * <p>
     * Outline: (1) look for a node matching one of the main node hints, (2) capture the full text, clean the DOM
     * and tokenize sentences, (3) collect the XPaths under which the sentences occur and pick the most frequent
     * one (restricted to the hint node if there is one), (4) resolve the chosen XPath to a node and derive HTML and
     * text from it, falling back to the full document text when too little text was found.
     *
     * @throws PageContentExtractorException declared by the signature
     */
    private void parseDocument() throws PageContentExtractorException {
        // z: when true, the XPath of the hint node (str2) is used directly as the content XPath and the
        // headline-sibling text collection further down is skipped.
        boolean z = false;
        // str: XPath that is finally cleaned and resolved to the result node.
        String str = Instance.NO_CATEGORY_DUMMY;
        // str2: XPath of the hint node; stays at its initial value when no hint matched.
        String str2 = Instance.NO_CATEGORY_DUMMY;
        this.resultNode = getMainContentNodeWithHints();
        // i: number of substantial direct text children of the hint node.
        int i = 0;
        if (this.resultNode != null) {
            str2 = XPathHelper.addXhtmlNsToXPath(getDocument(), PageAnalyzer.constructXPath(this.resultNode));
            str = str2;
            i = countDirectTextNodes();
            LOGGER.debug("direct text nodes: " + i);
            this.outerResultNode = this.resultNode;
        }
        // The full text is captured BEFORE cleanDom() strips comments, headers, footers, scripts, ...
        this.fullTextContent = HtmlHelper.documentToText(this.document);
        cleanDom();
        // Sentences are taken from the cleaned document.
        this.sentences = Tokenizer.getSentences(HtmlHelper.documentToText(this.document), true);
        XPathSet xPathSet = new XPathSet();
        // Each distinct sentence contributes the XPaths at which it is found in the document.
        Iterator it = new HashSet(this.sentences).iterator();
        while (it.hasNext()) {
            Iterator it2 = PageAnalyzer.constructAllXPaths(getDocument(), (String) it.next()).iterator();
            while (it2.hasNext()) {
                xPathSet.add(PageAnalyzer.removeXPathIndicesFromLastCountNode((String) it2.next()));
            }
        }
        // NOTE(review): xPathMap is obtained before entries are removed below; whether it is a live view or a
        // snapshot of the set is not visible here - confirm against XPathSet.
        Map xPathMap = xPathSet.getXPathMap();
        String highestCountXPath = xPathSet.getHighestCountXPath();
        int countOfXPath = xPathSet.getCountOfXPath(highestCountXPath);
        HashSet hashSet = new HashSet();
        if (!str2.isEmpty()) {
            // A hint node exists: discard every XPath that does not start with the hint node's XPath.
            for (Map.Entry entry : xPathMap.entrySet()) {
                if (!((String) entry.getKey()).startsWith(str2)) {
                    hashSet.add(entry.getKey());
                }
            }
            Iterator it3 = hashSet.iterator();
            while (it3.hasNext()) {
                xPathSet.remove((String) it3.next());
            }
            if (xPathSet.isEmpty()) {
                // No sentence XPath lies below the hint node: use the hint node itself.
                z = true;
            } else {
                highestCountXPath = xPathSet.getHighestCountXPath();
                countOfXPath = xPathSet.getCountOfXPath(highestCountXPath);
                // More than three substantial direct text children: use the hint node itself.
                if (i > 3) {
                    z = true;
                }
            }
        }
        String str3 = highestCountXPath;
        if (z) {
            str = str2;
        } else {
            // Among the XPaths whose count equals the highest count, prefer a shorter one.
            for (Map.Entry entry2 : xPathMap.entrySet()) {
                if (((String) entry2.getKey()).length() < str3.length() && ((Integer) entry2.getValue()).intValue() == countOfXPath) {
                    str3 = (String) entry2.getKey();
                }
            }
        }
        if (str3.isEmpty()) {
            z = true;
        }
        String findLastBoxSection = PageAnalyzer.findLastBoxSection(str3);
        if (!z) {
            str = XPathHelper.getParentXPath(findLastBoxSection);
        }
        String cleanXPath = cleanXPath(str);
        this.resultNode = XPathHelper.getXhtmlNode(getDocument(), cleanXPath);
        if (this.resultNode == null) {
            // Retry with path steps carrying a prefix that does not start with 'x' collapsed to "//" ...
            String replaceAll = cleanXPath.replaceAll("/[^x].*?:.*?/", "//");
            this.resultNode = XPathHelper.getXhtmlNode(getDocument(), replaceAll);
            if (this.resultNode == null) {
                // ... and once more with the XHTML namespace added to that XPath.
                this.resultNode = XPathHelper.getXhtmlNode(getDocument(), XPathHelper.addXhtmlNsToXPath(replaceAll));
                if (this.resultNode == null) {
                    // No node could be resolved at all: the whole document text is the result.
                    this.mainContentText = this.fullTextContent;
                    return;
                }
            }
        }
        if (!z) {
            // Collect the text of the content nodes together with sibling headlines, lists and quotes.
            String addHeadlineSiblings = addHeadlineSiblings(findLastBoxSection);
            StringBuilder sb = new StringBuilder();
            Iterator it4 = XPathHelper.getXhtmlNodes(getDocument(), addHeadlineSiblings).iterator();
            while (it4.hasNext()) {
                String textContent = ((Node) it4.next()).getTextContent();
                if (!textContent.isEmpty()) {
                    sb.append(textContent).append("\n\n");
                }
            }
            this.mainContentText = sb.toString();
        }
        this.mainContentHtml = HtmlHelper.xmlToString(this.resultNode, true);
        // Fewer than 100 characters so far: fall back to the readable text of the result node ...
        if (this.mainContentText.trim().length() < 100) {
            this.mainContentText = HtmlHelper.documentToReadableText(this.resultNode);
        }
        // ... and, if that is still too short, to the text of the whole document.
        if (this.mainContentText.trim().length() < 100) {
            this.mainContentText = this.fullTextContent;
        }
    }

    /**
     * Counts the direct text children of the current result node whose trimmed text is longer than 20 characters
     * and does not start with an HTML comment opener.
     *
     * @return the number of such text nodes
     */
    private int countDirectTextNodes() {
        int count = 0;
        for (Node textNode : XPathHelper.getXhtmlNodes(this.resultNode, "./text()")) {
            String text = textNode.getTextContent().trim();
            boolean tooShort = text.length() <= 20;
            if (tooShort || text.startsWith("<!--")) {
                continue;
            }
            count++;
        }
        return count;
    }

    /**
     * Strips content that is not part of the main text from the document: comment sections first, then everything
     * below header, navigation, footer and sidebar containers as well as style, script and iframe elements.
     */
    private void cleanDom() {
        removeCommentNodes();

        String[] removalXPaths = {
            "//header//*",
            "//nav//*",
            "//div[translate(@id,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')= 'head']//*",
            "//div[translate(@id,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')= 'pageheader']//*",
            "//div[translate(@id,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')= 'header']//*",
            "//footer//*",
            "//div[translate(@id,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')= 'foot']//*",
            "//div[translate(@id,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')= 'footer']//*",
            "//div[translate(@id,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')= 'pagefooter']//*",
            "//div[translate(@id,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')= 'sidebar']//*",
            "//*[(self::xhtml:style) or (self::xhtml:script) or (self::xhtml:iframe)]"
        };

        // Gather every match first; nothing is detached until all XPaths have been evaluated.
        List<Node> unwanted = new ArrayList<Node>();
        for (String xPath : removalXPaths) {
            unwanted.addAll(XPathHelper.getXhtmlNodes(this.document, xPath));
        }

        for (Node candidate : unwanted) {
            if (candidate == null) {
                continue;
            }
            Node parent = candidate.getParentNode();
            if (parent != null) {
                parent.removeChild(candidate);
            }
        }
    }

    /**
     * Detaches user comment sections (matched by class / id conventions such as "comment", "comments" or
     * "disqus_thread") from the document and keeps their readable text in the comments list.
     */
    private void removeCommentNodes() {
        String commentXPath = "//*[(self::xhtml:div) or (self::xhtml:p) or (self::xhtml:section) or (self::xhtml:ol) or (self::xhtml:ul) or (self::xhtml:li)][@class='comment' or contains(@class,'comment ') or contains(@class,' comment') or contains(@class,'comments ') or contains(@class,' comments') or contains(@id,'comments') or @id='disqus_thread']";
        for (Node commentNode : XPathHelper.getXhtmlNodes(this.document, commentXPath)) {
            // Remember the text before the node leaves the tree.
            String commentText = HtmlHelper.documentToReadableText(commentNode);
            this.comments.add(commentText);
            Node parent = commentNode.getParentNode();
            parent.removeChild(commentNode);
        }
    }

    /**
     * Tries to find the main content node by well-known attribute values. For every hint, div / p / span elements
     * whose class, itemprop or id matches the hint are looked up; the first hint yielding a usable node wins.
     * If a hint matches several elements, the parent of the first match is taken instead of the match itself.
     *
     * @return the hinted main content node, or {@code null} if no hint matched
     */
    private Node getMainContentNodeWithHints() {
        for (String hint : MAIN_NODE_HINTS) {
            List<Node> matches = Collections.emptyList();
            try {
                matches = XPathHelper.getXhtmlNodes(getDocument(), "//*[(self::xhtml:div) or (self::xhtml:p) or (self::xhtml:span)][@class='" + hint + "' or contains(@class,'" + hint + " ') or contains(@class,' " + hint + "') or @itemprop='" + hint + "' or @id='" + hint + "']");
            } catch (Exception e) {
                // Best effort: a failing lookup for one hint must not abort the search, but it is reported
                // through the logger rather than printed to stderr.
                LOGGER.warn("could not evaluate main node hint: " + hint, e);
            }
            if (matches.isEmpty()) {
                continue;
            }
            Node candidate = matches.get(0);
            if (matches.size() > 1) {
                // Several matches: assume their common container is the content node.
                candidate = candidate.getParentNode();
            }
            if (candidate != null) {
                LOGGER.debug("found main node with hint: " + hint);
                return candidate;
            }
        }
        return null;
    }

    /**
     * Widens an XPath so that, besides its last step, sibling headlines (h1-h6), spans, lists and block quotes
     * are selected as well. The last step is replaced by a wildcard step with a {@code self::} predicate.
     * <p>
     * This is best effort: if the rewrite fails, the failure is logged and the XPath is returned unchanged.
     * Note that the last step is used as a regular expression as-is, so a step containing regex meta characters
     * may not match itself and the XPath then stays as it was.
     *
     * @param str the XPath to widen
     * @return the widened XPath, or {@code str} itself if it could not be rewritten
     */
    private String addHeadlineSiblings(String str) {
        try {
            String[] steps = str.split("/");
            String lastStep = steps[steps.length - 1];
            String prefix = lastStep.contains("xhtml") ? "xhtml:" : Instance.NO_CATEGORY_DUMMY;

            StringBuilder widened = new StringBuilder("*[(self::").append(lastStep).append(")");
            String[] siblingTags = {"h1", "h2", "h3", "h4", "h5", "h6", "span", "ul", "ol", "blockquote"};
            for (String tag : siblingTags) {
                widened.append(" or (self::").append(prefix).append(tag).append(")");
            }
            widened.append("]");

            str = str.replaceAll(lastStep + "$", widened.toString());
        } catch (RuntimeException e) {
            // Deliberately non-fatal, but no longer silent.
            LOGGER.debug("could not add headline siblings to xpath: " + str, e);
        }
        return str;
    }

    /**
     * Returns those images of the content whose file type equals the given one, ignoring case.
     *
     * @param str the wanted file type, e.g. "jpg"
     * @return a new list with the matching images
     */
    public List<WebImage> getImages(String str) {
        List<WebImage> matching = new ArrayList<WebImage>();
        String wantedType = str.toLowerCase();
        for (WebImage image : getImages()) {
            String actualType = image.getFileType().toLowerCase();
            if (!actualType.equalsIgnoreCase(wantedType)) {
                continue;
            }
            matching.add(image);
        }
        return matching;
    }

    /**
     * Filters the given list in place. An image is kept if its width or height is negative, or if both its width
     * and height are positive and strictly larger than the given minimums.
     *
     * @param list the images to filter; modified in place
     * @param i    the minimum width (exclusive)
     * @param i2   the minimum height (exclusive)
     */
    public void filterBySize(List<WebImage> list, int i, int i2) {
        List<WebImage> kept = new ArrayList<WebImage>();
        for (WebImage image : list) {
            boolean negativeDimension = image.getWidth() < 0 || image.getHeight() < 0;
            if (negativeDimension) {
                kept.add(image);
                continue;
            }
            boolean wideEnough = image.getWidth() > 0 && image.getWidth() > i;
            if (wideEnough && image.getHeight() > 0 && image.getHeight() > i2) {
                kept.add(image);
            }
        }
        list.clear();
        list.addAll(kept);
    }

    /**
     * Filters the given list in place, keeping only images whose file type equals (ignoring case) one of the
     * given types. Each image is kept at most once, even when several of the given types match it
     * (e.g. "jpg" and "JPG").
     *
     * @param list   the images to filter; modified in place
     * @param strArr the accepted file types; with no types given the list ends up empty
     */
    public void filterByFileType(List<WebImage> list, String... strArr) {
        List<WebImage> kept = new ArrayList<WebImage>();
        for (WebImage image : list) {
            for (String fileType : strArr) {
                if (image.getFileType().equalsIgnoreCase(fileType)) {
                    kept.add(image);
                    // Stop at the first matching type so the image is not added again for a later one.
                    break;
                }
            }
        }
        list.clear();
        list.addAll(kept);
    }

    /**
     * Filters the given list in place, dropping every image whose URL contains the given text.
     * A {@code null} or empty text keeps all images.
     *
     * @param list the images to filter; modified in place
     * @param str  the text an image URL must not contain
     */
    public void filterByName(List<WebImage> list, String str) {
        boolean nothingToExclude = str == null || str.isEmpty();
        List<WebImage> kept = new ArrayList<WebImage>();
        for (WebImage image : list) {
            if (nothingToExclude || !image.getImageUrl().contains(str)) {
                kept.add(image);
            }
        }
        list.clear();
        list.addAll(kept);
    }

    /**
     * Returns the images of the main content. The search starts at the hint node if one was found, otherwise at
     * the result node.
     *
     * @return the images found (served from the cache once it has been filled)
     */
    public List<WebImage> getImages() {
        Node startNode = this.outerResultNode != null ? this.outerResultNode : this.resultNode;
        return getImages(startNode, getDocument(), new HashSet<Node>());
    }

    /**
     * Returns the images found at or around the given node of the current document, excluding nothing.
     *
     * @param node the node at which the image search starts
     * @return the images found (served from the cache once it has been filled)
     */
    public List<WebImage> getImages(Node node) {
        Collection<Node> noExclusions = new HashSet<Node>();
        return getImages(node, this.document, noExclusions);
    }

    /**
     * Returns the images found at or around the given node, selecting all img elements below it.
     *
     * @param node       the node at which the image search starts
     * @param document   the document the node belongs to
     * @param collection image nodes to skip
     * @return the images found (served from the cache once it has been filled)
     */
    public List<WebImage> getImages(Node node, Document document, Collection<Node> collection) {
        String allImagesXPath = ".//xhtml:img";
        return getImages(node, document, allImagesXPath, collection);
    }

    /**
     * Collects the images selected by the given XPath, starting at the given node and walking up the ancestors
     * until at least one image node is found. Images inside header / footer containers and nodes contained in
     * {@code collection} are skipped. Relative image URLs are resolved against the document's base href if
     * present, otherwise against the document URI. Width and height are read from the attributes of the same
     * name or, if neither is present, from the inline style.
     * <p>
     * NOTE: the result is cached. Once the cache is filled, every later call returns the cached list regardless
     * of its arguments, until a new document is set.
     *
     * @param node       the node at which the image search starts; {@code null} yields an empty list
     * @param document   the document the node belongs to
     * @param str        the XPath selecting image nodes relative to the start node
     * @param collection image nodes to skip
     * @return the (cached) list of images
     */
    public List<WebImage> getImages(Node node, Document document, String str, Collection<Node> collection) {
        if (this.imageUrls != null) {
            return this.imageUrls;
        }
        this.imageUrls = new ArrayList<WebImage>();
        if (node == null) {
            return this.imageUrls;
        }
        String baseHref = XPathHelper.getXhtmlNodeTextContent(document, "//head/base/@href");

        // Widen the search scope ancestor by ancestor until some image node shows up.
        List<Node> imageNodes = new ArrayList<Node>();
        while (imageNodes.isEmpty() && node != null) {
            imageNodes = XPathHelper.getXhtmlNodes(node, str);
            node = node.getParentNode();
        }

        // Images located in page headers / footers are never content images.
        List<Node> headerFooterImages = XPathHelper.getXhtmlNodes(document, "//header//img");
        headerFooterImages.addAll(XPathHelper.getXhtmlNodes(document, "//div[@id='header']//img"));
        headerFooterImages.addAll(XPathHelper.getXhtmlNodes(document, "//footer//img"));
        headerFooterImages.addAll(XPathHelper.getXhtmlNodes(document, "//div[@id='footer']//img"));
        imageNodes.removeAll(headerFooterImages);

        for (Node imageNode : imageNodes) {
            try {
                if (collection.contains(imageNode)) {
                    continue;
                }
                NamedNodeMap attributes = imageNode.getAttributes();
                BasicWebImage.Builder builder = new BasicWebImage.Builder();

                Node srcAttribute = attributes.getNamedItem("src");
                if (srcAttribute == null) {
                    // Checked explicitly instead of relying on the NullPointerException handler below.
                    LOGGER.debug("an image has not all necessary attributes");
                    continue;
                }
                String imageUrl = srcAttribute.getTextContent();
                if (!imageUrl.startsWith("http")) {
                    imageUrl = baseHref.isEmpty() ? UrlHelper.makeFullUrl(document.getDocumentURI(), (String) null, imageUrl) : UrlHelper.makeFullUrl(baseHref, (String) null, imageUrl);
                }
                builder.setImageUrl(imageUrl);
                builder.setFileType(FileHelper.getFileType(imageUrl));

                Node altAttribute = attributes.getNamedItem("alt");
                if (altAttribute != null) {
                    builder.setSummary(altAttribute.getTextContent());
                }
                Node titleAttribute = attributes.getNamedItem("title");
                if (titleAttribute != null) {
                    builder.setTitle(titleAttribute.getTextContent());
                }

                boolean sizeFromAttributes = false;
                Node widthAttribute = attributes.getNamedItem("width");
                if (widthAttribute != null) {
                    builder.setWidth(getImageSize(widthAttribute.getTextContent()));
                    sizeFromAttributes = true;
                }
                Node heightAttribute = attributes.getNamedItem("height");
                if (heightAttribute != null) {
                    builder.setHeight(getImageSize(heightAttribute.getTextContent()));
                    sizeFromAttributes = true;
                }

                // Only without any size attribute is the inline style consulted.
                Node styleAttribute = sizeFromAttributes ? null : attributes.getNamedItem("style");
                if (styleAttribute != null) {
                    String style = styleAttribute.getTextContent();
                    String styleWidth = StringHelper.getSubstringBetween(style, "width:", "px").trim();
                    String styleHeight = StringHelper.getSubstringBetween(style, "height:", "px").trim();
                    if (!styleWidth.isEmpty()) {
                        builder.setWidth((int) MathHelper.parseStringNumber(styleWidth));
                    }
                    if (!styleHeight.isEmpty()) {
                        builder.setHeight((int) MathHelper.parseStringNumber(styleHeight));
                    }
                }
                this.imageUrls.add(builder.create());
            } catch (NullPointerException e) {
                // Kept as a safety net for nodes lacking attributes altogether; such images are skipped.
                LOGGER.debug("an image has not all necessary attributes");
            } catch (NumberFormatException e2) {
                // An unparsable size skips the image.
                LOGGER.debug(e2.getMessage());
            }
        }
        return this.imageUrls;
    }

    /**
     * Parses the value of a width / height attribute. A percentage is resolved against
     * {@link #DEFAULT_IMAGE_CONTAINER_SIZE}; otherwise the value is read as a pixel count with an optional
     * "px" suffix.
     *
     * @param str the attribute value, e.g. "120", "120px" or "50%"
     * @return the size in pixels
     * @throws NumberFormatException if the remaining text is not an integer
     */
    private int getImageSize(String str) throws NumberFormatException {
        // NOTE(review): String.replace is literal, so this only strips the two-character sequence ",*";
        // possibly a regular expression was intended - confirm before changing.
        String value = str.replace(",*", Instance.NO_CATEGORY_DUMMY);
        if (value.contains("%")) {
            int percent = Integer.parseInt(StringHelper.trim(value.replace("%", Instance.NO_CATEGORY_DUMMY)));
            return (int) (0.01d * percent * DEFAULT_IMAGE_CONTAINER_SIZE);
        }
        return Integer.parseInt(StringHelper.trim(value.replace("px", Instance.NO_CATEGORY_DUMMY)));
    }

    /**
     * @return the node selected as main content, or {@code null} if none has been determined
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public Node getResultNode() {
        return resultNode;
    }

    /**
     * @return the serialized markup of the main content node
     */
    public String getMainContentHtml() {
        return mainContentHtml;
    }

    /**
     * @return the text of the extracted main content
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getResultText() {
        return mainContentText;
    }

    /**
     * Concatenates all sentences, appending a single space after each one (including the last).
     *
     * @return the sentences as one string
     */
    public String getSentencesString() {
        StringBuilder joined = new StringBuilder();
        for (String sentence : getSentences()) {
            joined.append(sentence);
            joined.append(" ");
        }
        return joined.toString();
    }

    /**
     * Determines the title of the page without excluding any part of the document.
     *
     * @return the title
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getResultTitle() {
        Collection<String> noExclusions = new HashSet<String>();
        return getResultTitle(noExclusions);
    }

    /**
     * Determines the title of the page. Preference order: an h1 headline outside of header / footer elements,
     * then the title element (cut at the first '|'), then the first 20 words of the main content text.
     * <p>
     * Without exclusions the last h1 is used, with exclusions the first remaining one.
     *
     * @param collection XPaths of sections whose h1 headlines must be ignored
     * @return the title
     */
    public String getResultTitle(Collection<String> collection) {
        List<Node> headlines = XPathHelper.getXhtmlNodes(getDocument(), "//h1[not(ancestor::header) and not(ancestor::footer)]");
        for (String excludedSection : collection) {
            headlines.removeAll(XPathHelper.getXhtmlNodes(getDocument(), excludedSection + "//h1"));
        }

        Node headline;
        if (collection.isEmpty()) {
            headline = (Node) CollectionHelper.getLast(headlines);
        } else {
            headline = (Node) CollectionHelper.getFirst(headlines);
        }

        String title = headline == null ? Instance.NO_CATEGORY_DUMMY : StringHelper.clean(headline.getTextContent());
        if (!title.isEmpty()) {
            return title;
        }

        // No usable headline: fall back to the title element or the beginning of the content.
        Node titleNode = XPathHelper.getXhtmlNode(getDocument(), "//title");
        if (titleNode != null) {
            return titleNode.getTextContent().replaceAll("\\|.*", Instance.NO_CATEGORY_DUMMY).trim();
        }
        return StringHelper.getFirstWords(this.mainContentText, 20);
    }

    /**
     * @return the fixed display name of this extractor
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getExtractorName() {
        return "Palladian";
    }

    /**
     * Completes missing image dimensions. Every content image whose width or height is zero is loaded and, if
     * loading succeeds, replaced by a copy carrying the real dimensions. The image cache is replaced by the
     * resulting list.
     */
    public void analyzeImages() {
        List<WebImage> analyzed = new ArrayList<WebImage>();
        for (WebImage image : getImages()) {
            boolean sizeKnown = image.getWidth() != 0 && image.getHeight() != 0;
            BufferedImage loaded = sizeKnown ? null : ImageHandler.load(image.getUrl());
            if (loaded == null) {
                // Size already known, or the image could not be loaded: keep it untouched.
                analyzed.add(image);
                continue;
            }
            BasicWebImage.Builder builder = new BasicWebImage.Builder();
            builder.setWebImage(image);
            builder.setWidth(loaded.getWidth());
            builder.setHeight(loaded.getHeight());
            analyzed.add(builder.create());
        }
        this.imageUrls = analyzed;
    }

    /**
     * Asks the WebKnox author API for the author of the current document and returns the "name" of the first
     * entry of the JSON array it answers with.
     * <p>
     * NOTE(review): the document URI and the API key are appended to the request URL as they are (no URL
     * encoding) and the request goes over plain http - confirm that this is acceptable.
     *
     * @param str the WebKnox API key
     * @return the author name, or {@code Instance.NO_CATEGORY_DUMMY} if the service gave no usable answer
     */
    public String getAuthorName(String str) {
        String requestUrl = "http://webknox.com/api/webpage/author?url=" + getDocument().getDocumentURI() + "&language=en&apiKey=" + str;
        String response = new DocumentRetriever().getText(requestUrl);
        if (response == null || response.isEmpty()) {
            return Instance.NO_CATEGORY_DUMMY;
        }
        try {
            return new JsonArray(response).getJsonObject(0).getString("name");
        } catch (JsonException e) {
            // Still non-fatal, but the reason is no longer lost. The request URL is not logged: it holds the key.
            LOGGER.debug("could not parse author lookup response: " + e.getMessage(), e);
            return Instance.NO_CATEGORY_DUMMY;
        }
    }

    /**
     * Delegates to {@link WebPageDateEvaluator} to pick the best publish date of the current document.
     *
     * @return the date returned by the evaluator
     */
    public ExtractedDate getPublishDate() {
        ExtractedDate publishDate = WebPageDateEvaluator.getBestDate(this.document, PageDateType.PUBLISH);
        return publishDate;
    }

    /**
     * Detects the language of the current document. Language declarations in the markup (lang attributes and
     * language meta tags) are tried first; a declared code shorter than six characters is resolved through its
     * part before '-' or ':'. Otherwise the language is guessed from the ending of the document's domain.
     *
     * @return the detected language, or {@code null} if neither the markup nor the domain gives a hint
     */
    public Language detectLanguage() {
        String markup = HtmlHelper.getInnerXml(getDocument()).toLowerCase();

        // Markers opening a language declaration, in order of preference; each value ends at the next quote.
        String[] openingMarkers = {
            " lang=\"",
            " xml:lang=\"",
            " xmlu00003alang=\"",
            "<meta name=\"content-language\" content=\"",
            "<meta name=\"language\" content=\""
        };
        String declared = StringHelper.getSubstringBetween(markup, openingMarkers[0], "\"");
        for (int m = 1; m < openingMarkers.length; m++) {
            if (!declared.isEmpty()) {
                break;
            }
            declared = StringHelper.getSubstringBetween(markup, openingMarkers[m], "\"");
        }
        if (declared != null && !declared.isEmpty() && declared.length() < 6) {
            return Language.getByIso6391(declared.split("[-:]")[0]);
        }

        // No usable declaration: guess from the domain ending.
        String domain = UrlHelper.getDomain(getDocument().getDocumentURI());
        String[] domainEndings = {
            ".de", ".at",
            ".fr",
            ".es",
            ".it",
            ".co.uk", ".ac.uk", ".ac.za", ".ie", ".co.nz", ".co.za", ".au", ".ca", ".us",
            ".pl",
            ".dk",
            ".co.jp",
            ".pt",
            ".nl",
            ".ru",
            ".no",
            ".sk",
            ".fi",
            ".gr",
            ".co.il",
            ".vn",
            ".se"
        };
        Language[] languages = {
            Language.GERMAN, Language.GERMAN,
            Language.FRENCH,
            Language.SPANISH,
            Language.ITALIAN,
            Language.ENGLISH, Language.ENGLISH, Language.ENGLISH, Language.ENGLISH, Language.ENGLISH, Language.ENGLISH, Language.ENGLISH, Language.ENGLISH, Language.ENGLISH,
            Language.POLISH,
            Language.DANISH,
            Language.JAPANESE,
            Language.PORTUGUESE,
            Language.DUTCH,
            Language.RUSSIAN,
            Language.NORWEGIAN,
            Language.SLOVAK,
            Language.FINNISH,
            Language.GREEK,
            Language.HEBREW,
            Language.VIETNAMESE,
            Language.SWEDISH
        };
        for (int d = 0; d < domainEndings.length; d++) {
            if (domain.endsWith(domainEndings[d])) {
                return languages[d];
            }
        }
        return null;
    }

    /**
     * Determines the most prominent image of the page. Checked in this order: the og:image meta tag, an element
     * marked with itemprop "image" or id "photo", an img with a "main-photo" / "main-image" class, and finally
     * the jpeg / png / jpg images found in the included sections (or the whole document).
     * <p>
     * In the last step the first image is taken; if its size is below 10000, images with "icon" in the URL and
     * images not passing the 50x50 size filter are dropped and the first image after sorting with
     * {@link ImageSizeComparator} is taken instead.
     *
     * @param collection  XPaths of the sections to search for images; {@code null} or empty searches the document
     * @param collection2 XPaths of sections whose images must be ignored; may be {@code null}
     * @return the dominant image, or {@code null} if none was found
     */
    public WebImage getDominantImage(Collection<String> collection, Collection<String> collection2) {
        // 1) Explicit Open Graph image.
        Node openGraphImage = XPathHelper.getXhtmlNode(getDocument(), "//meta[@property='og:image']//@content");
        if (openGraphImage != null) {
            return new BasicWebImage.Builder().setImageUrl(openGraphImage.getTextContent().trim()).create();
        }

        // Image nodes of the excluded sections.
        List<Node> excludedImageNodes = new ArrayList<Node>();
        if (collection2 != null && !collection2.isEmpty()) {
            for (String excludedSection : collection2) {
                excludedImageNodes.addAll(XPathHelper.getXhtmlNodes(getDocument(), excludedSection + "//img"));
            }
        }

        // 2) Element marked as image / photo.
        Node markedNode = XPathHelper.getXhtmlNode(getDocument(), "//*[(translate(@itemprop,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')= 'image' or translate(@id,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')= 'photo') and not(ancestor::header) and not(ancestor::footer)]");
        if (markedNode != null && !excludedImageNodes.contains(markedNode)) {
            Node markedSrc = XPathHelper.getXhtmlNode(markedNode, ".//@src");
            if (markedSrc != null) {
                return new BasicWebImage.Builder().setImageUrl(UrlHelper.makeFullUrl(getDocument().getDocumentURI(), (String) null, markedSrc.getTextContent().trim())).create();
            }
        }

        // 3) Image carrying a "main" class.
        Node mainImageNode = XPathHelper.getXhtmlNode(getDocument(), "//img[(contains(@class,'main-photo') or contains(@class,'main-image')) and not(ancestor::header) and not(ancestor::footer)]");
        if (mainImageNode != null && !excludedImageNodes.contains(mainImageNode)) {
            Node mainSrc = XPathHelper.getXhtmlNode(mainImageNode, ".//@src");
            if (mainSrc != null) {
                return new BasicWebImage.Builder().setImageUrl(UrlHelper.makeFullUrl(getDocument().getDocumentURI(), (String) null, mainSrc.getTextContent().trim())).create();
            }
        }

        // 4) Generic image search.
        List<WebImage> candidates = new ArrayList<WebImage>();
        Document searchRoot = getDocument();
        if (collection == null || collection.isEmpty()) {
            candidates.addAll(getImages(searchRoot, getDocument(), ".//img[not(ancestor::header) and not(ancestor::footer) and not(ancestor::a[contains(@href,'index') or @href=''])]", excludedImageNodes));
        } else {
            for (String includedSection : collection) {
                candidates.addAll(getImages(XPathHelper.getXhtmlNode(getDocument(), includedSection), getDocument(), ".//img[not(ancestor::header) and not(ancestor::footer) and not(ancestor::a[contains(@href,'index') or @href=''])]", excludedImageNodes));
            }
        }
        filterByFileType(candidates, "jpeg", "png", "jpg");
        if (candidates.isEmpty()) {
            return null;
        }

        // De-duplicate by URL.
        Map<String, WebImage> imagesByUrl = new HashMap<String, WebImage>();
        for (WebImage candidate : candidates) {
            imagesByUrl.put(candidate.getImageUrl(), candidate);
        }
        List<WebImage> uniqueImages = new ArrayList<WebImage>(imagesByUrl.values());

        WebImage dominant = (WebImage) CollectionHelper.getFirst(uniqueImages);
        if (dominant != null && dominant.getSize() < 10000) {
            filterByName(uniqueImages, "icon");
            filterBySize(uniqueImages, 50, 50);
            Collections.sort(uniqueImages, new ImageSizeComparator());
            dominant = (WebImage) CollectionHelper.getFirst(uniqueImages);
        }
        return dominant;
    }

    /**
     * Determines the most prominent image of the page, searching the whole document and excluding nothing.
     *
     * @return the dominant image, or {@code null} if none was found
     */
    public WebImage getDominantImage() {
        Collection<String> includeXPaths = null;
        Collection<String> excludeXPaths = null;
        return getDominantImage(includeXPaths, excludeXPaths);
    }

    /**
     * Manual smoke test: downloads a sample page, prints its images and detected language, then exits.
     *
     * @param strArr command line arguments (unused)
     * @throws PageContentExtractorException declared for the extraction calls
     */
    public static void main(String[] strArr) throws PageContentExtractorException {
        PalladianContentExtractor recipeExtractor = new PalladianContentExtractor();
        recipeExtractor.setDocument(new DocumentRetriever().getWebDocument("http://janeshealthykitchen.com/instant-red-sauce/"));
        Language language = recipeExtractor.detectLanguage();
        CollectionHelper.print(recipeExtractor.getImages());
        System.out.println(language);
        System.exit(0);

        // The JVM terminates above, so the second example below never runs.
        PalladianContentExtractor newsExtractor = new PalladianContentExtractor();
        newsExtractor.setDocument("http://www.voanews.com/content/russia-urges-nations-to-take-active-role-in-the-middle-east-93610219/169955.html");
        System.out.println("Title: " + newsExtractor.getResultTitle());
        System.out.println("Result Text: " + newsExtractor.getResultText());
        System.out.println("Comments: ");
        CollectionHelper.print(newsExtractor.getComments());
        System.out.println("Full Text: " + newsExtractor.getEntireTextContent());
    }

    // Registers the attribute values that mark a main content node; they are tried in exactly this order.
    static {
        Collections.addAll(
                MAIN_NODE_HINTS,
                "articleText",
                "article_body",
                "article-body",
                "articleBody",
                "hfeed",
                "st_text_c");
    }
}
