package ws.palladian.extraction.date.getter;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.Text;
import ws.palladian.extraction.date.KeyWords;
import ws.palladian.extraction.date.comparators.ContentDateComparator;
import ws.palladian.extraction.date.comparators.DateComparator;
import ws.palladian.extraction.date.dates.ContentDate;
import ws.palladian.extraction.date.dates.MetaDate;
import ws.palladian.extraction.date.dates.UrlDate;
import ws.palladian.extraction.date.helper.DateExtractionHelper;
import ws.palladian.helper.constants.DateFormat;
import ws.palladian.helper.constants.RegExp;
import ws.palladian.helper.date.DateExactness;
import ws.palladian.helper.date.DateParser;
import ws.palladian.helper.date.ExtractedDate;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:ws/palladian/extraction/date/getter/ContentDateGetter.class */
/**
 * Finds dates in the text nodes of a document and annotates each hit with
 * features: its position in the tag and in the document text, the enclosing
 * tag, a nearby keyword, and how it relates to the other content dates, the
 * meta dates and the URL dates of the same document.
 */
public class ContentDateGetter extends TechniqueDateGetter<ContentDate> {
    /** Largest character offset, before or after a date, that is probed for a keyword start. */
    private static final int MAX_KEYWORD_DISTANCE = 150;

    /** Whitespace count between keyword and date at or above which the key-diff score is 0. */
    private static final int MAX_KEYWORD_GAP = 30;

    /** Number of digits passed to {@code MathHelper.round} for all feature values. */
    private static final int FEATURE_PRECISION = 3;

    /** Character written over text that has already been consumed by a match. */
    private static final char MASK_CHAR = 'x';

    private final MetaDateGetter metaDateGetter = new MetaDateGetter();
    private final UrlDateGetter urlDateGetter = new UrlDateGetter();

    /**
     * Extracts all content dates of the document and fills in their features.
     *
     * @param document the document to scan
     * @return the dates found in the document's text nodes; empty if none
     */
    @Override // ws.palladian.extraction.date.getter.TechniqueDateGetter
    public List<ContentDate> getDates(Document document) {
        List<ContentDate> contentDates = getContentDates(document);
        setFeatures(contentDates, document);
        return contentDates;
    }

    /**
     * Sets the features that depend on the complete set of dates: relative
     * size, ordinal position by document position and by age, presence among
     * meta and URL dates, share of same-day dates, and distances to the
     * neighbouring dates in both orderings.
     *
     * @param dates    all content dates of the document; modified in place
     * @param document the document the dates were extracted from
     */
    private void setFeatures(List<ContentDate> dates, Document document) {
        if (dates.isEmpty()) {
            // Nothing to annotate, so the meta/URL extraction below is not needed.
            return;
        }

        // Only dates with a known document position take part in the position ordering.
        List<ContentDate> byPosition = new ArrayList<>();
        for (ContentDate date : dates) {
            if (date.get(ContentDate.DATEPOS_IN_DOC) != -1) {
                byPosition.add(date);
            }
        }
        List<ContentDate> byAge = new ArrayList<>(dates);
        Collections.sort(byPosition, ContentDateComparator.INSTANCE);
        Collections.sort(byAge, new DateComparator());

        List<MetaDate> metaDates = this.metaDateGetter.getDates(document);
        List<UrlDate> urlDates = this.urlDateGetter.getDates(document.getDocumentURI());

        int total = dates.size();
        for (ContentDate date : dates) {
            int posIndex = byPosition.indexOf(date);
            int ageIndex = byAge.indexOf(date);

            date.setRelSize(1.0d / total);
            // With no positioned date at all the quotient would be 0/0; use 0 instead of NaN.
            date.setOrdDocPos(MathHelper.round(
                    byPosition.isEmpty() ? 0.0d : (posIndex + 1.0d) / byPosition.size(), FEATURE_PRECISION));
            date.setOrdAgePos(MathHelper.round((ageIndex + 1.0d) / total, FEATURE_PRECISION));

            if (DateExtractionHelper.countDates(date, metaDates, DateExactness.DAY) > 0) {
                date.setInMetaDates(true);
            }
            if (DateExtractionHelper.countDates(date, urlDates, DateExactness.DAY) > 0) {
                date.setInUrl(true);
            }

            // Cast before dividing so the ratio is not truncated by integer division.
            double sameDayShare =
                    (double) (DateExtractionHelper.countDates(date, dates, DateExactness.DAY) + 1) / total;
            date.setRelCntSame(MathHelper.round(sameDayShare, FEATURE_PRECISION));

            // NOTE(review): posIndex is -1 for a date without document position; the
            // "after" branch still fires for it when byPosition is non-empty. Kept as is.
            if (posIndex > 0) {
                date.setDistPosBefore(date.get(ContentDate.DATEPOS_IN_DOC)
                        - byPosition.get(posIndex - 1).get(ContentDate.DATEPOS_IN_DOC));
            }
            if (posIndex < byPosition.size() - 1) {
                date.setDistPosAfter(byPosition.get(posIndex + 1).get(ContentDate.DATEPOS_IN_DOC)
                        - date.get(ContentDate.DATEPOS_IN_DOC));
            }

            if (ageIndex > 0) {
                date.setDistAgeBefore(Math.round(
                        date.getDifference((ExtractedDate) byAge.get(ageIndex - 1), TimeUnit.HOURS)));
            }
            if (ageIndex < byAge.size() - 1) {
                date.setDistAgeAfter(Math.round(
                        date.getDifference((ExtractedDate) byAge.get(ageIndex + 1), TimeUnit.HOURS)));
            }
        }
    }

    /**
     * Collects the dates of every text node whose parent is neither a comment
     * nor a {@code script}/{@code style} element.
     *
     * @param document the document to scan
     * @return the dates found; empty if the document has no text nodes
     */
    private List<ContentDate> getContentDates(Document document) {
        List<ContentDate> dates = new ArrayList<>();
        List<Node> textNodes = XPathHelper.getNodes(document, "//text()");
        if (textNodes.isEmpty()) {
            return dates;
        }

        // Readable text of the body, normalized the same way as the node texts,
        // so that node texts can be located in it.
        String docText = StringHelper.removeDoubleWhitespaces(
                replaceHtmlSymbols(HtmlHelper.documentToReadableText(XPathHelper.getXhtmlNode(document, "//body"))));
        Map<Integer, String> keywords = findContentKeywords(docText);

        for (Node node : textNodes) {
            if (node.getNodeType() != Node.TEXT_NODE) {
                continue;
            }
            Node parent = node.getParentNode();
            String parentName = parent.getNodeName();
            // equalsIgnoreCase keeps the tag check independent of the default locale.
            boolean skippedTag = "script".equalsIgnoreCase(parentName) || "style".equalsIgnoreCase(parentName);
            if (parent.getNodeType() != Node.COMMENT_NODE && !skippedTag) {
                dates.addAll(checkTextNode((Text) node, docText, keywords));
            }
        }
        return dates;
    }

    /**
     * Extracts the dates of a single text node and sets the features that can
     * be derived from the node, its surrounding elements and the document text.
     *
     * @param textNode the text node to scan
     * @param doc      the normalized text of the whole document
     * @param keywords keyword start positions in {@code doc}, mapped to the keyword
     * @return the dates found in the node; empty if none
     */
    private List<ContentDate> checkTextNode(Text textNode, String doc, Map<Integer, String> keywords) {
        String text = replaceHtmlSymbols(textNode.getNodeValue());
        List<ContentDate> dates = findAllDates(text);
        if (dates.isEmpty()) {
            return dates;
        }

        // "tag" is the element directly around the text; "parent" is the nearest
        // ancestor that HtmlHelper does not consider a simple element. The two
        // differ only when the text sits inside simple elements.
        Node tag = textNode.getParentNode();
        Node parent = tag;
        while (HtmlHelper.isSimpleElement(parent)) {
            parent = parent.getParentNode();
        }
        boolean hasEnclosingParent = tag != parent;

        // The lookups below do not depend on the individual date, so do them once.
        boolean hasStructureDate = StructureDateGetter.getDate(tag) != null
                || (hasEnclosingParent && StructureDateGetter.getDate(parent) != null);
        String nodeKeyword = getNodeKeyword(tag);
        if (nodeKeyword == null && hasEnclosingParent) {
            nodeKeyword = getNodeKeyword(parent);
        }
        // NOTE(review): priority 3 is treated as "also look for a keyword in the
        // surrounding text"; the meaning of the value is defined in KeyWords.
        boolean searchSurroundingText = nodeKeyword == null || KeyWords.getKeywordPriority(nodeKeyword) == 3;

        String tagName = tag.getNodeName();
        boolean simpleTag = HtmlHelper.isSimpleElement(tag);
        boolean headlineTag = HtmlHelper.isHeadlineTag(tag);
        int textPos = doc.indexOf(text);

        for (ContentDate date : dates) {
            date.setHasStructureDate(hasStructureDate);
            date.setTag(tagName);
            date.setSimpleTag(simpleTag);
            date.setHTag(headlineTag);

            if (textPos != -1) {
                int absDocPos = textPos + date.get(ContentDate.DATEPOS_IN_TAGTEXT);
                date.setAbsDocPos(absDocPos);
                // Cast before dividing; int / int would truncate the ratio to 0.
                date.setRelDocPos(MathHelper.round((double) absDocPos / doc.length(), FEATURE_PRECISION));
            }

            String keyword = nodeKeyword;
            if (keyword != null) {
                date.setKeyLoc(1);
            }
            if (searchSurroundingText) {
                setClosestKeyword(date, doc, keywords);
                if (date.getKeyword() != null) {
                    date.setKeyLoc(2);
                    keyword = date.getKeyword();
                }
            }
            if (keyword != null) {
                date.setKeyword(keyword);
                date.setKeywordPriority(KeyWords.getKeywordPriority(keyword));
            }
        }
        return dates;
    }

    /**
     * Locates every occurrence of the body content keywords in the lower-cased
     * text. Each occurrence is masked once recorded, so a keyword listed later
     * cannot match inside text already claimed by an earlier one.
     *
     * @param text the document text
     * @return start index of each occurrence (in the lower-cased text) mapped to the keyword
     */
    private Map<Integer, String> findContentKeywords(String text) {
        Map<Integer, String> keywordsByPosition = new HashMap<>();
        StringBuilder remaining = new StringBuilder(text.toLowerCase(Locale.ROOT));
        for (String keyword : KeyWords.BODY_CONTENT_KEYWORDS_ALL) {
            if (keyword.isEmpty()) {
                // An empty keyword matches everywhere and would never advance.
                continue;
            }
            int position = remaining.indexOf(keyword);
            while (position != -1) {
                keywordsByPosition.put(position, keyword);
                int end = position + keyword.length();
                mask(remaining, position, end);
                position = remaining.indexOf(keyword, end);
            }
        }
        return keywordsByPosition;
    }

    /**
     * Assigns the keyword that starts closest to the date (probing outwards up
     * to {@link #MAX_KEYWORD_DISTANCE} characters, the position before the date
     * first) and a key-diff score that shrinks with the number of whitespaces
     * between keyword and date. Does nothing if the date has no document
     * position or no keyword is found.
     *
     * @param date     the date to annotate
     * @param text     the document text
     * @param keywords keyword start positions mapped to the keyword
     */
    private void setClosestKeyword(ContentDate date, String text, Map<Integer, String> keywords) {
        int datePos = date.get(ContentDate.DATEPOS_IN_DOC);
        if (datePos < 0) {
            return;
        }

        String keyword = null;
        int gapStart = 0;
        int gapEnd = 0;
        for (int distance = 1; distance <= MAX_KEYWORD_DISTANCE; distance++) {
            int posBefore = datePos - distance;
            String before = keywords.get(posBefore);
            if (before != null) {
                // Gap runs from the end of the keyword to the start of the date.
                keyword = before;
                gapStart = posBefore + before.length();
                gapEnd = datePos;
                break;
            }
            int posAfter = datePos + distance;
            String after = keywords.get(posAfter);
            if (after != null) {
                // Gap runs from the end of the date to the start of the keyword.
                keyword = after;
                gapStart = datePos + date.getDateString().length();
                gapEnd = posAfter;
                break;
            }
        }
        if (keyword == null) {
            return;
        }

        date.setKeyword(keyword);
        int whitespaces = -1;
        // The upper bound check guards against keyword positions that lie beyond
        // the text, since positions were computed on the lower-cased copy.
        if (gapEnd > gapStart && gapEnd <= text.length()) {
            whitespaces = StringHelper.countWhitespaces(text.substring(gapStart, gapEnd));
        }
        if (whitespaces == -1 || whitespaces >= MAX_KEYWORD_GAP) {
            date.setKeyDiff(0.0d);
        } else {
            date.setKeyDiff(1.0d - MathHelper.round(whitespaces / (double) MAX_KEYWORD_GAP, FEATURE_PRECISION));
        }
    }

    /**
     * Looks for a body content keyword in the serialized form of the node.
     *
     * @param node the node to inspect
     * @return the result of {@code KeyWords.searchKeyword}; callers treat {@code null} as "none found"
     */
    private static String getNodeKeyword(Node node) {
        return KeyWords.searchKeyword(HtmlHelper.xmlToString(node), KeyWords.BODY_CONTENT_KEYWORDS_ALL);
    }

    /**
     * Finds all dates in the text by trying every known date format in order.
     * A match is ignored when a digit directly precedes or follows it. Each
     * accepted match is masked so that later formats cannot match it again.
     *
     * @param str the text to scan
     * @return the dates found, in the order they were matched; empty if none
     */
    static List<ContentDate> findAllDates(String str) {
        List<ContentDate> dates = new ArrayList<>();
        StringBuilder masked = new StringBuilder(str);
        for (DateFormat format : RegExp.ALL_DATE_FORMATS) {
            // Match against a snapshot; masking keeps the length, so offsets stay valid.
            Matcher matcher = format.getPattern().matcher(masked.toString());
            while (matcher.find()) {
                int start = matcher.start();
                int end = matcher.end();
                boolean digitBefore = start > 0 && Character.isDigit(masked.charAt(start - 1));
                boolean digitAfter = end < masked.length() && Character.isDigit(masked.charAt(end));
                if (digitBefore || digitAfter) {
                    // The match is a fragment of a longer digit sequence.
                    continue;
                }
                String dateString = matcher.group();
                ContentDate date = new ContentDate(DateParser.parseDate(dateString, format));
                // Search from the match start so an earlier, rejected occurrence is not picked up.
                date.setTagPos(masked.indexOf(date.getDateString(), start));
                // Mask exactly the matched span, literally (the date string is not a regex).
                mask(masked, start, end);
                dates.add(date);
            }
        }
        return dates;
    }

    /**
     * Overwrites the characters in {@code [start, end)} with {@link #MASK_CHAR}.
     */
    private static void mask(StringBuilder text, int start, int end) {
        for (int i = start; i < end; i++) {
            text.setCharAt(i, MASK_CHAR);
        }
    }

    /**
     * Normalizes text for date matching: unescapes HTML entities, replaces
     * protected spaces, collapses double whitespaces and then turns leftover
     * entity literals, line breaks, tabs and {@code " ,"} into single spaces.
     *
     * @param str the raw text
     * @return the normalized text
     */
    private static String replaceHtmlSymbols(String str) {
        String result = StringEscapeUtils.unescapeHtml4(str);
        result = StringHelper.replaceProtectedSpace(result);
        result = StringHelper.removeDoubleWhitespaces(result);
        // Entity literals can only be left over here if the input was escaped twice.
        result = result.replace("&#8203;", " ");
        result = result.replace("\n", " ");
        result = result.replace("&#09;", " ");
        result = result.replace("\t", " ");
        result = result.replace(" ,", " ");
        return result;
    }
}
