package ws.palladian.preprocessing.segmentation;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import ws.palladian.helper.UrlHelper;

/* loaded from: input_file:ws/palladian/preprocessing/segmentation/PageSegmenterHelper.class */
public class PageSegmenterHelper {
    /**
     * Copies at most {@code i} entries of {@code map} into a new map sorted by key.
     *
     * <p>The entries taken are the first ones in the iteration order of {@code map};
     * the returned map orders them by the natural ordering of their keys, so its
     * iteration order may differ from that of the source.
     *
     * @param map the source map; a {@code null} value causes a {@link NullPointerException}
     * @param i the maximum number of entries to copy; zero or a negative value yields an empty map
     * @return a new, independent {@link TreeMap} holding the selected entries
     */
    public static Map<String, Integer> limitMap(Map<String, Integer> map, int i) {
        int limit = Math.min(map.size(), i);
        Map<String, Integer> limited = new TreeMap<String, Integer>();
        if (limit <= 0) {
            return limited;
        }
        // Single pass over the entries instead of re-creating key/value arrays per index.
        int copied = 0;
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            limited.put(entry.getKey(), entry.getValue());
            copied++;
            if (copied >= limit) {
                break;
            }
        }
        return limited;
    }

    /**
     * Counts how many ancestors a DOM node has.
     *
     * @param node the node to inspect; may be {@code null}
     * @return the number of parent links followed until a node without a parent is
     *         reached; {@code 0} for {@code null} or for a node that has no parent
     */
    public static int getNodeLevel(Node node) {
        if (node == null) {
            return 0;
        }
        int depth = 0;
        // Walk up the parent chain, counting one level per ancestor.
        for (Node ancestor = node.getParentNode(); ancestor != null; ancestor = ancestor.getParentNode()) {
            depth++;
        }
        return depth;
    }

    /**
     * Creates a new, empty DOM document and places a deep copy of the given node
     * into it as the document's child.
     *
     * <p>The node is cast to {@link Element} first, so any other node type results
     * in a {@link ClassCastException}. The node itself is imported (copied), not
     * moved, so it stays in its original document.
     *
     * @param node the element to copy
     * @return the newly created document containing the imported copy
     * @throws IllegalStateException if no document builder can be configured
     */
    public static Document transformNodeToDocument(Node node) {
        Element source = (Element) node;
        Document target;
        try {
            target = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        } catch (ParserConfigurationException e) {
            throw new IllegalStateException("Encountered ParserConfigurationException: " + e.getMessage(), e);
        }
        // "true" requests a deep import: the element together with its whole subtree.
        Node imported = target.importNode(source, true);
        target.appendChild(imported);
        return target;
    }

    /**
     * Derives a short label from a URL.
     *
     * <p>The cleaned form of the URL's domain is removed from the cleaned URL, every
     * character that is not a word character, digit, German umlaut, {@code +},
     * {@code -} or space is replaced by {@code _}, and the text following the first
     * underscore is returned. When the sanitized text is longer than three characters
     * and contains more than one underscore, the label stops at the next underscore.
     *
     * <p>NOTE(review): the exact output of {@code UrlHelper.getDomain} and
     * {@code UrlHelper.getCleanUrl} is not visible here; presumably the remainder is
     * the URL's path, making the label its first path segment. Confirm against
     * {@code UrlHelper}.
     *
     * @param str the URL to derive a label from
     * @return the extracted label; may be empty
     */
    public static String getLabelOfURL(String str) {
        String domain = UrlHelper.getDomain(str);
        String cleanUrl = UrlHelper.getCleanUrl(str);
        String remainder = cleanUrl.replace(UrlHelper.getCleanUrl(domain), "");
        String sanitized = remainder.replaceAll("[[^\\w\\däüöÄÜÖ\\+\\- ]]", "_");

        int firstUnderscore = sanitized.indexOf('_');
        int labelStart = firstUnderscore + 1;
        if (sanitized.length() <= 3 || firstUnderscore == sanitized.lastIndexOf('_')) {
            // Short text, or zero/one underscore: everything after the first underscore.
            return sanitized.substring(labelStart);
        }

        // Look for the closing underscore from index 2 as before, but never at or before
        // the first underscore; that would give an end index smaller than the start index.
        int labelEnd = sanitized.indexOf('_', Math.max(2, labelStart));
        if (labelEnd < 0) {
            // No further underscore: the label runs to the end instead of failing on -1.
            labelEnd = sanitized.length();
        }
        return sanitized.substring(labelStart, labelEnd);
    }

    /**
     * Lists every markup tag found in the given text, in order of appearance.
     *
     * <p>A tag is the shortest run of characters from {@code <} to the next {@code >};
     * the run may span line breaks. If a tag contains a space character, everything
     * from the first space onward is dropped and {@code >} is appended, so
     * {@code <a href="x">} becomes {@code <a>}. Closing tags and repeated tags are
     * all included.
     *
     * @param str the text to scan; a {@code null} value causes a {@link NullPointerException}
     * @return a new mutable list of the tags found; empty if there are none
     */
    public static List<String> listTags(String str) {
        List<String> tags = new ArrayList<String>();
        Matcher matcher = Pattern.compile("(\\<.*?>)", Pattern.CASE_INSENSITIVE | Pattern.DOTALL).matcher(str);
        while (matcher.find()) {
            String tag = matcher.group();
            int firstSpace = tag.indexOf(' ');
            if (firstSpace >= 0) {
                // Strip attributes: keep only "<name" and close the tag again.
                tag = tag.substring(0, firstSpace) + ">";
            }
            tags.add(tag);
        }
        return tags;
    }
}
