package ws.palladian.classification;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.classification.text.evaluation.Dataset;
import ws.palladian.classification.utils.ClassificationUtils;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.Bag;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.helper.math.MathHelper;

/* loaded from: input_file:ws/palladian/classification/DatasetManager.class */
public final class DatasetManager {
    private static final Logger LOGGER = LoggerFactory.getLogger(DatasetManager.class);

    /** Not meant to be instantiated: this final class only exposes static helper methods. */
    private DatasetManager() {
    }

    /**
     * Creates an index file for the dataset folder without restricting the classes.
     * <p>
     * Delegates to {@link #createIndex(String, String[])} with a {@code null} class filter; the
     * file name returned by that overload is discarded.
     *
     * @param str path of the dataset root folder
     * @throws IOException propagated from the two-argument overload
     */
    public static void createIndex(String str) throws IOException {
        createIndex(str, null);
    }

    /**
     * Writes an index file that lists the files found in the sub-folders of a dataset root.
     * <p>
     * Every entry of the root folder that is not a regular file is treated as one class folder,
     * named by {@code FileHelper.getFolderName}. For each non-directory file inside such a folder
     * one line {@code <folder>/<fileName> <folder>} is written. The index is created inside the
     * root folder and is named {@code index.txt}, or {@code index_<Arrays.toString(strArr)>.txt}
     * when a class filter is supplied.
     *
     * @param str path of the dataset root folder (a trailing slash is added if missing)
     * @param strArr folder names to include, or {@code null} to include every folder; folders not
     *            contained in the filter are skipped and logged
     * @return the name (not the full path) of the index file that was written
     * @throws IOException if the index file cannot be created or written
     */
    public static String createIndex(String str, String[] strArr) throws IOException {
        StopWatch stopWatch = new StopWatch();
        String rootPath = FileHelper.addTrailingSlash(str);

        // Base name is "index"; a class filter is appended so filtered indices do not overwrite the full one.
        String indexName = "index";
        if (strArr != null) {
            indexName += "_" + Arrays.toString(strArr);
        }
        indexName += ".txt";

        // Wrap the filter once instead of once per folder.
        List<String> includedClasses = strArr == null ? null : Arrays.asList(strArr);

        // try-with-resources closes (and thereby flushes) the writer even when listing or writing fails.
        try (FileWriter indexWriter = new FileWriter(rootPath + indexName)) {
            for (File entry : FileHelper.getFilesAndDirectories(rootPath)) {
                if (entry.isFile()) {
                    // Regular files directly in the root (including the index itself) are not classes.
                    continue;
                }
                String folderName = FileHelper.getFolderName(entry.getPath());
                if (includedClasses != null && !includedClasses.contains(folderName)) {
                    LOGGER.info("skip class {}", folderName);
                    continue;
                }
                for (File classFile : FileHelper.getFiles(entry.getPath())) {
                    if (classFile.isDirectory()) {
                        continue;
                    }
                    indexWriter.write(folderName + "/" + classFile.getName() + " " + folderName);
                    indexWriter.write("\n");
                }
            }
        }

        LOGGER.info("index file created in {}", stopWatch.getElapsedTimeString());
        return indexName;
    }

    /**
     * Creates an excerpt of an index file that contains at most {@code i} lines per class.
     * <p>
     * Each line is split with {@code str2} (interpreted as a regular expression by
     * {@link String#split(String)}); the last field is taken as the class. Lines with fewer than
     * two fields are ignored. The excerpt path is derived with
     * {@code FileHelper.appendToFileName(str, "_ipc" + i)} and the finished excerpt is passed to
     * {@code FileHelper.shuffleLines}.
     * <p>
     * A failure to write a single line is logged and that line is left out; it is not propagated,
     * because the line callback cannot throw checked exceptions.
     *
     * @param str path of the source index file
     * @param str2 field separator (regular expression)
     * @param i maximum number of lines to keep per class
     * @return path of the excerpt file
     * @throws IOException if the excerpt file cannot be opened or closed
     */
    public static String createIndexExcerpt(String str, final String str2, final int i) throws IOException {
        StopWatch stopWatch = new StopWatch();
        final String excerptPath = FileHelper.appendToFileName(str, "_ipc" + i);
        final Bag<String> keptPerClass = Bag.create();

        // try-with-resources guarantees the writer is closed even if line processing throws.
        try (FileWriter excerptWriter = new FileWriter(excerptPath)) {
            FileHelper.performActionOnEveryLine(str, new LineAction() {
                @Override
                public void performAction(String line, int lineNumber) {
                    String[] fields = line.split(str2);
                    if (fields.length < 2) {
                        return;
                    }
                    String className = fields[fields.length - 1];
                    if (keptPerClass.count(className) >= i) {
                        return;
                    }
                    try {
                        excerptWriter.write(line + "\n");
                        // Count only lines that really made it into the excerpt.
                        keptPerClass.add(className);
                    } catch (IOException e) {
                        LOGGER.error("could not write line to " + excerptPath, e);
                    }
                }
            });
        }

        FileHelper.shuffleLines(excerptPath);
        LOGGER.info("index excerpt file created in {}", stopWatch.getElapsedTimeString());
        return excerptPath;
    }

    /**
     * Creates an index excerpt in which no class has more lines than the entry returned by
     * {@code getMin()} on the class distribution of the index (presumably the least frequent
     * class — confirm against {@code Bag}).
     * <p>
     * When the distribution has no minimum entry, an excerpt with zero lines per class is requested.
     *
     * @param str path of the source index file
     * @param str2 field separator used in the index file
     * @return path of the excerpt file created by {@link #createIndexExcerpt(String, String, int)}
     * @throws IOException propagated from {@link #createIndexExcerpt(String, String, int)}
     */
    public static String createBalancedIndex(String str, String str2) throws IOException {
        Dataset indexDataset = new Dataset();
        indexDataset.setSeparationString(str2);
        indexDataset.setPath(str);

        Map.Entry<?, ?> smallest = calculateClassDistribution(indexDataset).getMin();

        int linesPerClass = 0;
        if (smallest != null) {
            linesPerClass = ((Integer) smallest.getValue()).intValue();
        }
        return createIndexExcerpt(str, str2, linesPerClass);
    }

    /**
     * Creates an excerpt of an index file from randomly selected line numbers.
     * <p>
     * The line numbers come from {@code MathHelper.createRandomNumbers(i, 0, numberOfLines)}. A
     * line is copied when its line number (as reported by the line callback) is in that set; if
     * the set is empty, every line is copied. Lines that yield fewer than two fields when split
     * with {@code str2} are always skipped. The excerpt path is derived with
     * {@code FileHelper.appendToFileName(str, "_random" + i)}.
     * <p>
     * A failure to write a single line is logged and that line is left out; it is not propagated,
     * because the line callback cannot throw checked exceptions.
     *
     * @param str path of the source index file
     * @param str2 field separator (regular expression, see {@link String#split(String)})
     * @param i number of random line numbers to request
     * @return path of the excerpt file
     * @throws IOException if the excerpt file cannot be opened or closed
     */
    public static String createIndexExcerptRandom(String str, final String str2, int i) throws IOException {
        StopWatch stopWatch = new StopWatch();
        final String excerptPath = FileHelper.appendToFileName(str, "_random" + i);
        final Set<Integer> selectedLines = MathHelper.createRandomNumbers(i, 0, FileHelper.getNumberOfLines(str));

        // try-with-resources guarantees the writer is closed even if line processing throws.
        try (FileWriter excerptWriter = new FileWriter(excerptPath)) {
            FileHelper.performActionOnEveryLine(str, new LineAction() {
                @Override
                public void performAction(String line, int lineNumber) {
                    if (line.split(str2).length < 2) {
                        return;
                    }
                    // An empty selection means "take everything".
                    boolean selected = selectedLines.isEmpty() || selectedLines.contains(Integer.valueOf(lineNumber));
                    if (!selected) {
                        return;
                    }
                    try {
                        excerptWriter.write(line + "\n");
                    } catch (IOException e) {
                        LOGGER.error("could not write line to " + excerptPath, e);
                    }
                }
            });
        }

        LOGGER.info("index excerpt file created in {}", stopWatch.getElapsedTimeString());
        return excerptPath;
    }

    /**
     * Splits the lines of a dataset file into {@code i} pairs of files for cross validation.
     * <p>
     * If {@code i2} is greater than zero, the split is performed on an excerpt created by
     * {@link #createIndexExcerpt(String, String, int)} with {@code i2} lines per class; the given
     * dataset object itself is not modified. For fold {@code k} (1-based) the contiguous block of
     * {@code lines / i} lines starting at {@code (k - 1) * (lines / i)} is written to
     * {@code <rootPath><name>_crossValidation_training<k>.txt}, and all remaining lines are written
     * to {@code <rootPath><name>_crossValidation_test<k>.txt}.
     *
     * @param dataset dataset whose file is split
     * @param i number of folds; zero causes an {@link ArithmeticException}
     * @param i2 lines per class for the optional excerpt, or a value &lt;= 0 to use the file as is
     * @return one {@code {trainingPath, testPath}} pair per fold
     * @throws IOException propagated from {@link #createIndexExcerpt(String, String, int)}
     */
    public static List<String[]> splitForCrossValidation(Dataset dataset, int i, int i2) throws IOException {
        List<String[]> foldFiles = new ArrayList<String[]>();

        Dataset source = dataset;
        if (i2 > 0) {
            String excerptPath = createIndexExcerpt(dataset.getPath(), dataset.getSeparationString(), i2);
            source = new Dataset(dataset);
            source.setPath(excerptPath);
        }

        List<String> lines = FileHelper.readFileToArray(source.getPath());
        int foldSize = lines.size() / i;

        for (int fold = 1; fold <= i; fold++) {
            int blockStart = (fold - 1) * foldSize;
            int blockEnd = fold * foldSize;

            StringBuilder block = new StringBuilder();
            StringBuilder rest = new StringBuilder();
            int lineIndex = 0;
            for (String line : lines) {
                boolean insideBlock = lineIndex >= blockStart && lineIndex < blockEnd;
                StringBuilder target = insideBlock ? block : rest;
                target.append(line).append("\n");
                lineIndex++;
            }

            String trainingPath = source.getRootPath() + source.getName() + "_crossValidation_training" + fold + ".txt";
            String testPath = source.getRootPath() + source.getName() + "_crossValidation_test" + fold + ".txt";
            FileHelper.writeToFile(trainingPath, block);
            FileHelper.writeToFile(testPath, rest);
            foldFiles.add(new String[]{trainingPath, testPath});
        }
        return foldFiles;
    }

    /**
     * Splits an index file into two files, using a single space as the field separator.
     * <p>
     * Delegates to {@link #splitIndex(String, int, String)}.
     *
     * @param str path of the index file
     * @param i percentage passed on to the three-argument overload
     * @return the two paths returned by the three-argument overload
     * @throws IOException propagated from the three-argument overload
     */
    public static String[] splitIndex(String str, int i) throws IOException {
        return splitIndex(str, i, " ");
    }

    /**
     * Splits an index file into two files so that, per class, roughly {@code i} percent of the
     * entries end up in the first file and the rest in the second.
     * <p>
     * Each line is split with {@code str2}; the first field is the entry and the second field is
     * its class. Entries are de-duplicated per class. For every class the first
     * {@code (int) (entries * i / 100.0)} entries in set iteration order go to the
     * {@code _split1} file, all others to the {@code _split2} file. Lines with fewer than two
     * fields (for example blank lines) are skipped.
     * <p>
     * Note that {@code str2} is used as a regular expression for splitting but written back
     * literally, so it should be a plain separator string.
     *
     * @param str path of the index file
     * @param i percentage of each class that goes into the first file
     * @param str2 field separator
     * @return {@code {firstPath, secondPath}}
     * @throws IOException if one of the output files cannot be created or written
     */
    public static String[] splitIndex(String str, int i, String str2) throws IOException {
        StopWatch stopWatch = new StopWatch();
        String firstPath = FileHelper.appendToFileName(str, "_split1");
        String secondPath = FileHelper.appendToFileName(str, "_split2");

        // Both writers are closed (and flushed) automatically, also when reading or writing fails.
        try (FileWriter firstWriter = new FileWriter(firstPath);
                FileWriter secondWriter = new FileWriter(secondPath)) {

            Map<String, Set<String>> entriesByClass = new HashMap<>();
            for (String line : FileHelper.readFileToArray(str)) {
                String[] fields = line.split(str2);
                if (fields.length < 2) {
                    // No class field: nothing sensible to assign, and fields[1] would be out of bounds.
                    continue;
                }
                Set<String> entries = entriesByClass.get(fields[1]);
                if (entries == null) {
                    entries = new HashSet<>();
                    entriesByClass.put(fields[1], entries);
                }
                entries.add(fields[0]);
            }

            for (Map.Entry<String, Set<String>> classEntries : entriesByClass.entrySet()) {
                String className = classEntries.getKey();
                Set<String> entries = classEntries.getValue();
                int firstShare = (int) ((entries.size() * i) / 100.0d);
                int writtenToFirst = 0;
                for (String entry : entries) {
                    FileWriter target;
                    if (writtenToFirst < firstShare) {
                        target = firstWriter;
                        writtenToFirst++;
                    } else {
                        target = secondWriter;
                    }
                    target.write(entry);
                    target.write(str2);
                    target.write(className);
                    target.write("\n");
                }
            }
        }

        LOGGER.info("file {} splitted in {}", str, stopWatch.getElapsedTimeString());
        return new String[]{firstPath, secondPath};
    }

    /**
     * Splits an index file into one file per "part".
     * <p>
     * Each non-empty line is split at single spaces. The part key is the text after the last
     * {@code "_"} of the second field; the first field is collected (de-duplicated) under that key.
     * For every key a file is written whose name is derived with
     * {@code FileHelper.appendToFileName(str, "_part" + <text after the last "part" in the key>)}.
     * Each output line is {@code <entry> <text of the entry before its first "/">}.
     * <p>
     * Lines without a second field are skipped. Entries are expected to contain a {@code "/"};
     * an entry without one causes a {@link StringIndexOutOfBoundsException}.
     *
     * @param str path of the index file
     * @throws IOException if a part file cannot be created or written
     */
    public static void splitIndexParts(String str) throws IOException {
        StopWatch stopWatch = new StopWatch();

        Map<String, Set<String>> entriesByPart = new HashMap<>();
        for (String line : FileHelper.readFileToArray(str)) {
            if (line.length() == 0) {
                continue;
            }
            String[] fields = line.split(" ");
            if (fields.length < 2) {
                // No second field to derive the part key from; fields[1] would be out of bounds.
                continue;
            }
            String partKey = fields[1].substring(fields[1].lastIndexOf("_") + 1);
            Set<String> entries = entriesByPart.get(partKey);
            if (entries == null) {
                entries = new HashSet<>();
                entriesByPart.put(partKey, entries);
            }
            entries.add(fields[0]);
        }

        for (Map.Entry<String, Set<String>> part : entriesByPart.entrySet()) {
            String partKey = part.getKey();
            String suffix = "_part" + partKey.substring(partKey.lastIndexOf("part") + 4);
            // The writer is closed per part, also when writing one of its entries fails.
            try (FileWriter partWriter = new FileWriter(FileHelper.appendToFileName(str, suffix))) {
                for (String entry : part.getValue()) {
                    String className = entry.substring(0, entry.indexOf("/"));
                    partWriter.write(entry);
                    partWriter.write(" ");
                    partWriter.write(className);
                    partWriter.write("\n");
                }
            }
        }

        LOGGER.info("file {} splitted in {}", str, stopWatch.getElapsedTimeString());
    }

    /**
     * Deletes empty (zero-length) files from the second level of a dataset folder.
     * <p>
     * For every entry returned by {@code FileHelper.getFiles(str)}, the entries returned by
     * {@code FileHelper.getFiles} for that entry's path are inspected; each one that is not a
     * directory and has length zero is deleted. Only successful deletions are counted in the final
     * log message; a failed deletion is logged as a warning.
     *
     * @param str path of the dataset root folder
     */
    public static void cleanDataset(String str) {
        StopWatch stopWatch = new StopWatch();
        LOGGER.info("cleaning the dataset...");

        int deletedCount = 0;
        for (File entry : FileHelper.getFiles(str)) {
            for (File candidate : FileHelper.getFiles(entry.getPath())) {
                if (candidate.isDirectory() || candidate.length() != 0) {
                    continue;
                }
                // File.delete() reports failure via its return value rather than an exception.
                if (candidate.delete()) {
                    deletedCount++;
                } else {
                    LOGGER.warn("could not delete empty file {}", candidate.getPath());
                }
            }
        }

        LOGGER.info("dataset cleansed ({} files deleted) in {}", deletedCount, stopWatch.getElapsedTimeString());
    }

    /**
     * Randomly splits the lines of a single-file dataset into a training and a test file.
     * <p>
     * The lines are shuffled; the first {@code (int) (share * lines)} of them are written to
     * {@code <rootPath>training.csv} and the remaining ones to {@code <rootPath>test.csv}. A value
     * of {@code d} greater than 1 is interpreted as a percentage and divided by 100. Datasets for
     * which {@code isFirstFieldLink()} is true are not split; a warning is logged instead.
     *
     * @param dataset dataset whose file is split
     * @param d share of lines for the training file, either as a fraction or as a percentage
     */
    public static void splitDataset(Dataset dataset, double d) {
        if (dataset.isFirstFieldLink()) {
            LOGGER.warn("can only split datasets which consist of one file");
            return;
        }

        double trainingShare = d > 1.0d ? d / 100.0d : d;

        List<String> lines = FileHelper.readFileToArray(dataset.getPath());
        Collections.shuffle(lines);

        int total = lines.size();
        // Clamp so that out-of-range shares put everything into one of the two files.
        int cut = Math.max(0, Math.min((int) (trainingShare * total), total));

        ArrayList<String> trainingLines = new ArrayList<String>(lines.subList(0, cut));
        ArrayList<String> testLines = new ArrayList<String>(lines.subList(cut, total));

        FileHelper.writeToFile(dataset.getRootPath() + "training.csv", trainingLines);
        FileHelper.writeToFile(dataset.getRootPath() + "test.csv", testLines);
    }

    /**
     * Counts how often each class occurs in the dataset file, without writing the result to disk.
     * <p>
     * Delegates to {@link #calculateClassDistribution(Dataset, String)} with a {@code null}
     * output path.
     *
     * @param dataset dataset whose file is analysed
     * @return the bag returned by the two-argument overload
     */
    public static Bag<String> calculateClassDistribution(Dataset dataset) {
        return calculateClassDistribution(dataset, null);
    }

    /**
     * Counts how often each class occurs in the dataset file and optionally writes the counts to a
     * file.
     * <p>
     * Each line is split with the dataset's separation string; the last field is taken as the
     * class and added to the bag. Lines with fewer than two fields are ignored. If {@code str} is
     * not {@code null}, one line {@code <class><DEFAULT_SEPARATOR><count>} is written to that path
     * for every element delivered by the bag's iterator.
     *
     * @param dataset dataset whose file is analysed
     * @param str path of the file to write the counts to, or {@code null} to write nothing
     * @return bag holding the class of every counted line
     */
    public static Bag<String> calculateClassDistribution(final Dataset dataset, String str) {
        final Bag<String> classCounts = Bag.create();
        // Read the separator from the dataset that was passed in, once, before iterating.
        final String separator = dataset.getSeparationString();

        FileHelper.performActionOnEveryLine(dataset.getPath(), new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                String[] fields = line.split(separator);
                if (fields.length >= 2) {
                    classCounts.add(fields[fields.length - 1]);
                }
            }
        });

        if (str != null) {
            StringBuilder csv = new StringBuilder();
            Iterator<String> classes = classCounts.iterator();
            while (classes.hasNext()) {
                String className = classes.next();
                csv.append(className).append(ClassificationUtils.DEFAULT_SEPARATOR).append(classCounts.count(className)).append("\n");
            }
            FileHelper.writeToFile(str, csv);
        }
        return classCounts;
    }

    /**
     * Counts the distinct classes that occur in the dataset file.
     * <p>
     * Each line is split with the dataset's separation string; the last field is taken as the
     * class. Lines with fewer than two fields are ignored.
     *
     * @param dataset dataset whose file is analysed
     * @return number of distinct values found in the last field
     */
    public static int countClasses(final Dataset dataset) {
        final Set<String> classes = new HashSet<>();
        // Read the separator from the dataset that was passed in, once, before iterating.
        final String separator = dataset.getSeparationString();

        FileHelper.performActionOnEveryLine(dataset.getPath(), new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                String[] fields = line.split(separator);
                if (fields.length >= 2) {
                    classes.add(fields[fields.length - 1]);
                }
            }
        });
        return classes.size();
    }

    /**
     * Creates a copy of the dataset file that only contains lines whose category occurs at least
     * {@code i} times.
     * <p>
     * The category of a line is its last field after splitting with the dataset's separation
     * string. Kept lines are reduced to {@code <firstField><separator><category>}. Lines with fewer
     * than two fields carry no category and are dropped. The result is written to the path derived
     * with {@code FileHelper.appendToFileName(path, "_filtered")}.
     * <p>
     * Side effect: the full class distribution is written to {@code data/distributionFull.csv}.
     *
     * @param dataset dataset to filter; it is not modified
     * @param i minimum number of occurrences a category needs in order to be kept
     * @return a new dataset with the same name and separation string, pointing to the filtered file
     */
    public static Dataset filterLowFrequencyCategories(Dataset dataset, int i) {
        String separator = dataset.getSeparationString();
        Dataset filtered = new Dataset();
        filtered.setSeparationString(separator);
        filtered.setName(dataset.getName());

        Bag<String> distribution = calculateClassDistribution(dataset, "data/distributionFull.csv");

        // An exact-match set: a delimiter-joined string would also match blank or '#'-containing categories.
        Set<String> frequentCategories = new HashSet<>();
        for (Map.Entry<?, ?> categoryCount : distribution.unique()) {
            if (((Integer) categoryCount.getValue()).intValue() >= i) {
                frequentCategories.add((String) categoryCount.getKey());
            }
        }

        List<String> keptLines = new ArrayList<>();
        for (String line : FileHelper.readFileToArray(dataset.getPath())) {
            String[] fields = line.split(separator);
            if (fields.length < 2) {
                continue;
            }
            String category = fields[fields.length - 1];
            if (frequentCategories.contains(category)) {
                keptLines.add(fields[0] + separator + category);
            }
        }

        String filteredPath = FileHelper.appendToFileName(dataset.getPath(), "_filtered");
        FileHelper.writeToFile(filteredPath, keptLines);
        filtered.setPath(filteredPath);
        return filtered;
    }

    /**
     * Ad-hoc entry point with hard-coded paths: creates a balanced index for one fixed index file
     * (space-separated) and then exits.
     *
     * @param strArr command line arguments; not used
     * @throws IOException propagated from {@link #createBalancedIndex(String, String)}
     */
    public static void main(String[] strArr) throws IOException {
        createBalancedIndex("H:\\PalladianData\\Datasets\\LanguageDatasets\\Microblogging35Languages\\languageDocumentIndex.txt", " ");
        // The JVM is terminated here, so none of the calls below are ever executed.
        System.exit(0);
        cleanDataset("data/datasets/ner/www_test2/");
        createIndex("data/datasets/ner/www_test2/");
        splitIndex("data/datasets/ner/www_test2/index.txt", 50);
    }
}
