package ws.palladian.classification.page;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import org.apache.log4j.Logger;
import org.w3c.dom.Node;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.date.DateHelper;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.retrieval.DocumentRetriever;

/* loaded from: input_file:ws/palladian/classification/page/DataHelper.class */
final class DataHelper {
    private static final String XML_PART_NAME = "part";
    private static final String URL_PART_NAME = "urls";

    DataHelper() {
    }

    public void breakODPFile() {
        String readLine;
        try {
            FileReader fileReader = new FileReader("data/temp/odp/content.rdf.u8");
            BufferedReader bufferedReader = new BufferedReader(fileReader);
            int i = 1;
            StringBuilder sb = new StringBuilder();
            boolean z = false;
            int i2 = 0;
            do {
                readLine = bufferedReader.readLine();
                if (readLine == null) {
                    break;
                }
                sb.append(readLine).append("\n");
                if ((i > 0 && i % 100000 == 0) || z) {
                    z = true;
                    if (readLine.endsWith("</Topic>") || readLine.endsWith("</ExternalPage>")) {
                        sb.append("</RDF>");
                        int i3 = i2;
                        i2++;
                        FileHelper.writeToFile("data/temp/odp/part" + i3 + ".xml", sb);
                        System.out.println("save file number " + i2);
                        sb = new StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\"?><RDF xmlns:r=\"http://www.w3.org/TR/RDF/\" xmlns:d=\"http://purl.org/dc/elements/1.0/\" xmlns=\"http://dmoz.org/rdf/\">\n");
                        z = false;
                    }
                }
                i++;
            } while (readLine != null);
            FileHelper.writeToFile("data/temp/odp/part" + i2 + ".xml", sb);
            fileReader.close();
            bufferedReader.close();
        } catch (FileNotFoundException e) {
            Logger.getRootLogger().error(e.getMessage());
        } catch (IOException e2) {
            Logger.getRootLogger().error(e2.getMessage());
        } catch (OutOfMemoryError e3) {
            Logger.getRootLogger().error(e3.getMessage());
        }
    }

    public void mergeURLFiles() {
        try {
            FileWriter fileWriter = new FileWriter("data/temp/odp/list.txt");
            for (File file : FileHelper.getFiles("data/temp/odp/", URL_PART_NAME)) {
                fileWriter.write(FileHelper.readFileToString("data/temp/odp/" + file.getName()));
                fileWriter.flush();
            }
            fileWriter.close();
        } catch (IOException e) {
            Logger.getRootLogger().error(e.getMessage());
        }
    }

    public void parseODP(String str) {
        long currentTimeMillis = System.currentTimeMillis();
        StringBuilder sb = new StringBuilder();
        DocumentRetriever documentRetriever = new DocumentRetriever();
        boolean z = str.equalsIgnoreCase("english") ? false : true;
        int i = 1;
        int i2 = 0;
        int i3 = 0;
        File[] files = FileHelper.getFiles("data/temp/odp/", XML_PART_NAME);
        for (File file : files) {
            for (Node node : XPathHelper.getNodes(documentRetriever.getWebDocument("data/temp/odp/" + file.getName()), "//TOPIC")) {
                if (node.getAttributes().getNamedItem("r:id") != null) {
                    String textContent = node.getAttributes().getNamedItem("r:id").getTextContent();
                    String[] split = textContent.split("/");
                    if (split.length >= 2 && ((!split[1].equalsIgnoreCase("world") && !z) || (z && split.length > 2 && split[2].equalsIgnoreCase(str)))) {
                        String substring = textContent.substring(4);
                        if (substring.startsWith("World")) {
                            substring = substring.substring(6);
                            if (substring.toLowerCase().startsWith(str.toLowerCase())) {
                                if (substring.length() > str.length() + 1) {
                                    substring = substring.substring(str.length() + 1);
                                }
                            }
                        }
                        Node nextSibling = node.getNextSibling();
                        while (true) {
                            nextSibling = nextSibling.getNextSibling();
                            if (nextSibling != null && (nextSibling.getNodeType() == 3 || nextSibling.getNodeName().equals("LINK"))) {
                                if (nextSibling.getNodeType() != 3 && nextSibling.getAttributes().getNamedItem("r:resource") != null) {
                                    sb.append(UrlHelper.getCleanUrl(nextSibling.getAttributes().getNamedItem("r:resource").getTextContent())).append(" ").append(substring).append("\n");
                                    i2++;
                                    if (i2 > 0 && i2 % 50000 == 0) {
                                        int i4 = i3;
                                        i3++;
                                        FileHelper.writeToFile("data/temp/odp/urls" + i4 + ".txt", sb);
                                        sb = new StringBuilder();
                                    }
                                }
                            }
                        }
                    }
                }
            }
            System.out.println("loaded document number " + i + " of " + files.length + " / " + i2 + " lines");
            i++;
        }
        int i5 = i3;
        int i6 = i3 + 1;
        FileHelper.writeToFile("data/temp/odp/urls" + i5 + ".txt", sb);
        mergeURLFiles();
        DateHelper.formatDuration(currentTimeMillis);
    }

    public void createRandomSample(String str, int i) {
        int numberOfLines = FileHelper.getNumberOfLines("data/temp/odp/" + str);
        StringBuilder sb = new StringBuilder();
        HashSet hashSet = new HashSet();
        while (hashSet.size() < i) {
            hashSet.add(Integer.valueOf(((int) (Math.random() * numberOfLines)) + 1));
        }
        final Object[] objArr = {sb, hashSet};
        FileHelper.performActionOnEveryLine("data/temp/odp/" + str, new LineAction() { // from class: ws.palladian.classification.page.DataHelper.1
            public void performAction(String str2, int i2) {
                if (((HashSet) objArr[1]).contains(Integer.valueOf(i2))) {
                    ((StringBuilder) objArr[0]).append(str2).append("\n");
                }
            }
        });
        FileHelper.writeToFile("data/temp/odp/" + FileHelper.appendToFileName(str, "_sample" + i), sb);
    }

    public void cleanup(boolean z) {
        if (z) {
            for (File file : FileHelper.getFiles("data/temp/odp/", XML_PART_NAME)) {
                file.delete();
            }
        }
        for (File file2 : FileHelper.getFiles("data/temp/odp/", URL_PART_NAME)) {
            file2.delete();
        }
    }

    public static void main(String[] strArr) {
        DataHelper dataHelper = new DataHelper();
        dataHelper.createRandomSample("list_german.txt", 20000);
        dataHelper.cleanup(false);
    }
}
