package ws.palladian.retrieval.feeds.evaluation;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;
import org.apache.commons.configuration.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.helper.ConfigHolder;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.persistence.DatabaseManagerFactory;
import ws.palladian.retrieval.feeds.Feed;
import ws.palladian.retrieval.feeds.FeedReader;
import ws.palladian.retrieval.feeds.FeedReaderSettings;
import ws.palladian.retrieval.feeds.persistence.FeedDatabase;
import ws.palladian.retrieval.feeds.persistence.FeedStore;
import ws.palladian.retrieval.feeds.updates.MavStrategyDatasetCreation;

/* loaded from: input_file:ws/palladian/retrieval/feeds/evaluation/DatasetCreator.class */
public class DatasetCreator {
    /** Logger shared by all static and instance methods of this class. */
    private static final Logger LOGGER = LoggerFactory.getLogger(DatasetCreator.class);

    /**
     * Root folder of the feed-post dataset: {@code data/datasets/feedPosts/}, assembled with the platform's
     * {@code file.separator} and ending with a separator.
     */
    public static final String DATASET_PATH = "data" + System.getProperty("file.separator") + "datasets" + System.getProperty("file.separator") + "feedPosts" + System.getProperty("file.separator");

    /**
     * Number of file handles budgeted per FeedReader thread; the system-limit check's messages ask for at least
     * 20 times more file handles than threads.
     */
    public static final int FILE_HANDLES_PER_TASK = 20;

    /**
     * NOTE(review): presumably the default for the {@code feedReader.checkSystemLimitations} setting - this
     * constant is not referenced inside this class, confirm against callers.
     */
    public static final boolean CHECK_SYSTEM_LIMITATIONS_DEFAULT = true;

    /** Thirteen zeros. NOTE(review): presumably written in place of a missing timestamp - not used in this class. */
    public static final String NO_TIMESTAMP = "0000000000000";

    /** Quoted marker text. NOTE(review): presumably written when an item has no title - not used in this class. */
    public static final String NO_TITLE_REPLACEMENT = "\"###NO_TITLE###\"";

    /** Quoted marker text. NOTE(review): presumably written when an item has no link - not used in this class. */
    public static final String NO_LINK_REPLACEMENT = "\"###NO_LINK###\"";

    /**
     * Creates a dataset creator and immediately runs the system-limitation check, which may log warnings or
     * terminate the JVM (see {@code detectSystemLimitations()}).
     */
    public DatasetCreator() {
        super();
        this.detectSystemLimitations();
    }

    /**
     * Cleans every feed history file found directly in {@link #DATASET_PATH} and writes the result to the
     * {@code clean} sub folder under the same file name.
     * <p>
     * Per file (only regular files between 20 bytes and 25 MiB are processed):
     * <ol>
     * <li>the raw text is normalised with a chain of regular expressions (tabs removed, line breaks inside
     * entries removed, semicolons inside quoted http links replaced by colons);</li>
     * <li>lines starting with {@code MISS} are kept unless {@code z} is {@code true};</li>
     * <li>lines ending with {@code ;1} are dropped;</li>
     * <li>lines without a quoted section, with an unparsable timestamp, or with a timestamp in the future or
     * before 946684800000 (2000-01-01 UTC in epoch milliseconds) are skipped;</li>
     * <li>timestamps below 1000000000000 are left-padded with one {@code 0};</li>
     * <li>entries whose quoted section was already seen in the same file are dropped;</li>
     * <li>the surviving lines are written in descending natural order, one per line.</li>
     * </ol>
     * Finally, empty files in the {@code clean} folder are deleted.
     *
     * @param z if {@code true}, lines starting with {@code MISS} are removed from the cleaned output
     */
    public static void cleanUp(boolean z) {
        StopWatch stopWatch = new StopWatch();
        // Build the target folder with the platform separator, like DATASET_PATH itself; a hard-coded
        // Windows-style backslash produces wrong file names on every other platform.
        String cleanPath = DATASET_PATH + "clean" + System.getProperty("file.separator");
        File[] files = FileHelper.getFiles(DATASET_PATH);
        int totalFiles = files.length;
        int deletedFiles = 0;
        int fileCounter = 0;
        for (File file : files) {
            fileCounter++;
            if (file.isDirectory() || file.length() < 20 || file.length() > 26214400) {
                continue;
            }
            String cleanFile = cleanPath + file.getName();

            String raw = FileHelper.tryReadFileToString(file);
            if (raw == null) {
                // NOTE(review): presumably the helper returns null when reading fails - confirm.
                LOGGER.warn("could not read file " + file.getName() + " skip cleaning this file");
                continue;
            }
            String normalised = raw.replaceAll("(\t)+", "")
                    .replaceAll("\"(\n)+", "\"")
                    .replaceAll("(\n)+\"", "\"")
                    .replaceAll("(\n)(?!((.*?\\d;\")|(.*?MISS;)))", "")
                    .replaceAll("(?<=\"http([^\"]){0,200});(?=(.)+\")", ":");
            FileHelper.writeToFile(cleanFile, normalised);

            // Quoted sections already seen in this file. The text itself is stored (not its hash code) so that
            // two different entries with colliding hash codes are both kept.
            HashSet<String> seenContents = new HashSet<String>();
            TreeSet<String> keptLines = new TreeSet<String>();
            for (String line : FileHelper.readFileToArray(cleanFile)) {
                if (line.startsWith("MISS")) {
                    if (!z) {
                        keptLines.add(line);
                    }
                    continue;
                }
                if (line.endsWith(";1")) {
                    continue;
                }
                int contentStart = line.indexOf(";\"");
                int contentEnd = line.lastIndexOf("\";");
                // The third condition catches lines where the closing quote precedes the opening one, which
                // would otherwise make substring() throw.
                if (contentStart == -1 || contentEnd == -1 || contentEnd < contentStart + 2) {
                    LOGGER.warn("bad format in file " + file.getName() + " skip cleaning this entry");
                    continue;
                }
                long timestamp;
                try {
                    timestamp = Long.parseLong(line.substring(0, line.indexOf(";")));
                } catch (NumberFormatException e) {
                    // One malformed line must not abort the clean-up of all remaining files.
                    LOGGER.warn("bad format in file " + file.getName() + " skip cleaning this entry");
                    continue;
                }
                if (timestamp > System.currentTimeMillis() || timestamp < 946684800000L) {
                    LOGGER.info("timestamp " + timestamp + " is invalid, skip cleaning this entry");
                    continue;
                }
                // Extracted before padding: the padding only touches the timestamp prefix, never the content.
                String content = line.substring(contentStart + 2, contentEnd);
                if (timestamp < 1000000000000L) {
                    line = line.replaceFirst(String.valueOf(timestamp), "0" + timestamp);
                }
                if (seenContents.add(content)) {
                    keptLines.add(line);
                }
            }

            // Descending order, one entry per line.
            StringBuilder cleaned = new StringBuilder();
            for (String line : keptLines.descendingSet()) {
                cleaned.append(line).append("\n");
            }
            FileHelper.writeToFile(cleanFile, cleaned);

            if (fileCounter % 500 == 0) {
                LOGGER.info(MathHelper.round((100.0d * fileCounter) / totalFiles, 2) + "% of the files cleansed");
            }
        }

        // Files that ended up without any surviving entry are removed again.
        for (File cleanedFile : FileHelper.getFiles(cleanPath)) {
            if (cleanedFile.length() == 0 && cleanedFile.delete()) {
                deletedFiles++;
            }
        }
        LOGGER.info("finished in " + stopWatch.getElapsedTimeString() + ", deleted " + deletedFiles + " files");
    }

    /**
     * Concatenates all cleaned feed history files from the {@code clean} sub folder of {@link #DATASET_PATH}
     * into a single {@code all.csv} in the same folder.
     * <p>
     * Every line is prefixed with the feed id, i.e. the part of the source file name in front of the first
     * underscore, followed by a semicolon. Directories and files whose name contains no underscore (this
     * includes {@code all.csv} itself, which already exists when the folder is listed) are skipped. I/O problems
     * are logged; the method never throws because of them.
     */
    public static void combineFeedHistories() {
        StopWatch stopWatch = new StopWatch();
        // Same platform-dependent separator as DATASET_PATH instead of a hard-coded backslash.
        String cleanPath = DATASET_PATH + "clean" + System.getProperty("file.separator");
        String combinedPath = cleanPath + "all.csv";

        FileWriter fileWriter;
        try {
            fileWriter = new FileWriter(combinedPath);
        } catch (IOException e) {
            // Without a writer there is nothing to do; continuing would only end in a NullPointerException.
            LOGGER.error("could not open " + combinedPath + ", " + e.getMessage());
            return;
        }

        try {
            File[] files = FileHelper.getFiles(cleanPath);
            int processed = 0;
            for (File file : files) {
                String fileName = file.getName();
                int idEnd = fileName.indexOf("_");
                if (!file.isDirectory() && idEnd != -1) {
                    String feedId = fileName.substring(0, idEnd);
                    try {
                        for (String line : FileHelper.readFileToArray(file)) {
                            fileWriter.write(feedId + ";");
                            fileWriter.write(line);
                            // cleanUp() appends "\n" to every line it gets from readFileToArray, so the lines
                            // carry no terminator of their own; without this all rows would run together.
                            fileWriter.write("\n");
                        }
                        fileWriter.flush();
                    } catch (IOException e) {
                        LOGGER.error(file + ", " + e.getMessage());
                    }
                }
                processed++;
                // Floating point division; integer division would truncate the percentage.
                LOGGER.info("percent done: " + MathHelper.round((100.0d * processed) / files.length, 2));
            }
        } finally {
            try {
                fileWriter.close();
            } catch (IOException e) {
                LOGGER.error(e.getMessage());
            }
        }
        LOGGER.info("all files combined to all.csv in " + stopWatch.getElapsedTimeString());
    }

    /**
     * Renumbers the files in {@code data/temp/feedPosts/}: the numeric id in front of the first underscore of
     * each file name is increased by 97650. Each file is copied to its new name and the original is deleted
     * afterwards. Directories are ignored.
     */
    public static void renewFileIDs() {
        File[] candidates = FileHelper.getFiles("data/temp/feedPosts/");
        for (File candidate : candidates) {
            if (candidate.isDirectory()) {
                continue;
            }
            String oldName = candidate.getName();
            int underscoreAt = oldName.indexOf("_");
            int shiftedId = Integer.parseInt(oldName.substring(0, underscoreAt)) + 97650;
            // Keep everything from the underscore on, only the leading id changes.
            String newName = shiftedId + oldName.substring(underscoreAt);
            LOGGER.info(newName);
            FileHelper.copyFile("data/temp/feedPosts/" + oldName, "data/temp/feedPosts/" + newName);
            FileHelper.delete("data/temp/feedPosts/" + oldName);
        }
    }

    /**
     * Wires up a {@link FeedReader} for dataset creation and starts it: the feed store is created from the
     * global configuration, the benchmark policy is set to 0, and the reader is configured with a
     * {@link MavStrategyDatasetCreation}(0, 360) update strategy and a {@code DatasetProcessingAction} on the
     * store.
     */
    public void createDataset() {
        FeedStore feedStore = DatabaseManagerFactory.create(FeedDatabase.class, ConfigHolder.getInstance().getConfig());
        FeedReaderEvaluator.setBenchmarkPolicy(0);

        MavStrategyDatasetCreation updateStrategy = new MavStrategyDatasetCreation(0, 360);
        DatasetProcessingAction processingAction = new DatasetProcessingAction(feedStore);

        FeedReaderSettings.Builder settingsBuilder = new FeedReaderSettings.Builder();
        settingsBuilder.setStore(feedStore);
        settingsBuilder.setAction(processingAction);
        settingsBuilder.setUpdateStrategy(updateStrategy);

        // NOTE(review): "m31create" looks like a decompiler-generated name - confirm the real method name on
        // FeedReaderSettings.Builder.
        FeedReader reader = new FeedReader(settingsBuilder.m31create());
        LOGGER.debug("start reading feeds");
        reader.start();
    }

    /**
     * Builds the path of a feed's csv file: {@code <folder of feed>/<id>_<name>.csv}.
     *
     * @param i   the feed id
     * @param str the (already file-name-safe) feed name
     * @return the folder returned by {@link #getFolderPath(int)} followed by the file name
     */
    public static String getCSVFilePath(int i, String str) {
        StringBuilder path = new StringBuilder();
        path.append(getFolderPath(i));
        path.append(i).append("_").append(str).append(".csv");
        return path.toString();
    }

    /**
     * Builds the folder of a feed inside the dataset: {@code DATASET_PATH/<slice>/<id>/}, using the platform's
     * {@code file.separator} and ending with a separator.
     *
     * @param i the feed id
     * @return the folder path for that feed
     */
    public static String getFolderPath(int i) {
        String separator = System.getProperty("file.separator");
        StringBuilder folder = new StringBuilder(DATASET_PATH);
        folder.append(getSlice(i)).append(separator);
        folder.append(i).append(separator);
        return folder.toString();
    }

    /**
     * Returns the slice a feed id belongs to, i.e. the id divided by 1000 and rounded towards negative
     * infinity (ids 0..999 are slice 0, 1000..1999 slice 1, -1000..-1 slice -1).
     *
     * @param i the feed id
     * @return the slice number
     */
    public static int getSlice(int i) {
        int slice = i / 1000;
        // Integer division truncates towards zero; step one down for negative, non-multiple ids to get floor.
        if (i % 1000 < 0) {
            slice--;
        }
        return slice;
    }

    /**
     * Derives a file-name-safe name from a feed URL: the first match of {@code http://www.} and then the first
     * match of {@code www.} are removed, and the rest is passed to
     * {@code StringHelper.makeSafeName(..., 30)}.
     * <p>
     * NOTE(review): both patterns are regular expressions, so the dot matches any character and the match is
     * not anchored to the start. Left as is because existing dataset file names depend on the current output.
     *
     * @param str the feed URL
     * @return the safe name produced by {@code StringHelper.makeSafeName}
     */
    public static String getSafeFeedName(String str) {
        String withoutScheme = str.replaceFirst("http://www.", "");
        String withoutWww = withoutScheme.replaceFirst("www.", "");
        return StringHelper.makeSafeName(withoutWww, 30);
    }

    /**
     * Makes sure the csv file of the given feed exists, creating the file and its parent folders if needed.
     *
     * @param feed the feed whose id and URL determine the csv path
     * @return {@code true} if the file already exists, otherwise the result of
     *         {@code FileHelper.createDirectoriesAndFile}
     */
    public static boolean createDirectoriesAndCSV(Feed feed) {
        int feedId = feed.getId();
        String csvPath = getCSVFilePath(feedId, getSafeFeedName(feed.getFeedUrl()));
        if (FileHelper.fileExists(csvPath)) {
            return true;
        }
        return FileHelper.createDirectoriesAndFile(csvPath);
    }

    /** Troubleshooting text appended to every file-handle related error message. */
    private static final String FILE_HANDLE_HELP = "Make sure you have at least 20 times more file handles than FeedReader-threads.\nCheck palladian.properties > feedReader.threadPoolSize to get number of threads.\nRun ulimit -n in a terminal to see the current soft limit of file descriptors for one session.\nRun cat /proc/sys/fs/file-max to display maximum number of open file descriptors.\nTo increase the number of file descriptors, modify /etc/security/limits.conf (su required), add\n<username> soft nofile <minimum-required-size>\n<username> hard nofile <minimum-required-size>+1024\nexamplefeeduser soft nofile 31744\nfeeduser hard nofile 32768\nRestart your system afterwards or find out which process needs to be restartet to let the changes take effect.\nSee http://www.cyberciti.biz/faq/linux-increase-the-maximum-number-of-open-files/ for more details.";

    /**
     * Checks whether the process may open enough file descriptors for the configured number of FeedReader
     * threads ({@link #FILE_HANDLES_PER_TASK} per thread).
     * <p>
     * The check is skipped when {@code feedReader.checkSystemLimitations} is {@code false} and on non-Linux
     * systems. On Linux the soft limit is read with {@code ulimit -n}. If the limit cannot be determined an
     * error is logged and the method returns; if it is lower than required, an error is logged and the JVM is
     * terminated with exit code -1.
     */
    private void detectSystemLimitations() {
        Configuration config = ConfigHolder.getInstance().getConfig();
        boolean checkLimitations = true;
        if (config != null) {
            checkLimitations = config.getBoolean("feedReader.checkSystemLimitations", true);
        }
        if (!checkLimitations) {
            LOGGER.warn("You skipped checking system for limitations. Good luck!");
            return;
        }
        if (!System.getProperty("os.name").equalsIgnoreCase("linux")) {
            LOGGER.info("It seems that you are running ths application on a non-linux machine. Make sure you have enough file descriptors :)");
            return;
        }

        String ulimitOutput = readFileDescriptorLimit();
        int fileDescriptors = 0;
        if (!ulimitOutput.equals("")) {
            LOGGER.info("ulimit -n: " + ulimitOutput);
            if (ulimitOutput.equalsIgnoreCase("unlimited")) {
                // No soft limit at all: nothing to verify.
                LOGGER.info("Number of file descriptors is unlimited.");
                return;
            }
            try {
                fileDescriptors = Integer.parseInt(ulimitOutput);
            } catch (NumberFormatException e) {
                LOGGER.error("Could not process number of available file handles: " + e.getLocalizedMessage() + "\n" + FILE_HANDLE_HELP);
            }
        }
        if (fileDescriptors <= 0) {
            LOGGER.error("Illegal number of file descriptors: " + fileDescriptors + ". " + FILE_HANDLE_HELP);
            return;
        }

        // Without a configuration the reader's default thread count applies, not zero.
        int numThreads = FeedReaderSettings.DEFAULT_NUM_THREADS;
        if (config != null) {
            numThreads = config.getInteger("feedReader.threadPoolSize", Integer.valueOf(FeedReaderSettings.DEFAULT_NUM_THREADS)).intValue();
        }
        // long arithmetic so that an absurdly large thread count cannot overflow into a passing check
        long requiredDescriptors = (long) numThreads * FILE_HANDLES_PER_TASK;
        // Abort only when MORE handles are required than are available.
        if (requiredDescriptors > fileDescriptors) {
            LOGGER.error("More file handles required! \nthreadPoolSize=" + numThreads + ", available file descriptors=" + fileDescriptors + ", minimum required file descriptors would be " + requiredDescriptors + "\n" + FILE_HANDLE_HELP);
            System.exit(-1);
        }
    }

    /**
     * Runs {@code ulimit -n} in a shell and returns its trimmed standard output, or an empty string if the
     * command could not be executed.
     */
    private static String readFileDescriptorLimit() {
        StringBuilder output = new StringBuilder();
        BufferedReader reader = null;
        try {
            // ulimit is a shell builtin, so it has to be passed to the shell via -c; "/bin/sh ulimit -n" would
            // make sh look for a script file named "ulimit".
            Process process = Runtime.getRuntime().exec(new String[] {"/bin/sh", "-c", "ulimit -n"});
            reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
            String line = reader.readLine();
            while (line != null) {
                output.append(line);
                line = reader.readLine();
            }
        } catch (IOException e) {
            LOGGER.error("Could not get number of available file handles: " + e.getLocalizedMessage());
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing sensible can be done if closing the pipe fails
                }
            }
        }
        return output.toString().trim();
    }

    /**
     * Command line entry point: creates a {@link DatasetCreator} (which runs the system-limitation check) and
     * starts the dataset creation.
     *
     * @param strArr command line arguments, not used
     */
    public static void main(String[] strArr) {
        DatasetCreator creator = new DatasetCreator();
        creator.createDataset();
    }
}
