package ws.palladian.extraction.entity.dataset;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.core.Annotation;
import ws.palladian.extraction.entity.Annotations;
import ws.palladian.extraction.entity.FileFormatParser;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;

/* loaded from: input_file:ws/palladian/extraction/entity/dataset/DatasetProcessor.class */
/**
 * Helper for cutting dataset files into smaller files and for building tagged training
 * subsets from a seed list.
 *
 * <p>All methods read their input through {@link FileHelper} and write their results next to
 * the input file (same directory as returned by {@code FileHelper.getFilePath}).
 */
public class DatasetProcessor {
    private static final Logger LOGGER = LoggerFactory.getLogger(DatasetProcessor.class);

    /** Marker that separates documents in the files processed by {@link #createSubsets}. */
    private static final String DOCUMENT_SEPARATOR = "=-DOCSTART-";

    /**
     * Writes a series of files that each contain the first {@code n} documents of the input
     * file, for {@code n = i, i + i3, i + 2 * i3, ...} while {@code n <= i2}.
     *
     * <p>The input is split at {@code str2}; the chunk before the first separator is always
     * dropped. Every kept document is written with the separator prepended. Each output file is
     * named {@code <dir><fileName>_sep_<n>.txt}.
     *
     * @param str path of the file to split
     * @param str2 document separator; it is passed to {@link String#split(String)} and is
     *        therefore interpreted as a regular expression when splitting, but written back
     *        literally
     * @param i smallest number of documents per output file
     * @param i2 largest number of documents per output file (inclusive)
     * @param i3 increment between two consecutive document counts
     * @return the paths of all files written, in the order they were created
     * @throws IllegalArgumentException if {@code i <= i2} and {@code i3} is not positive, because
     *         the iteration would never terminate
     */
    public List<String> splitFile(String str, String str2, int i, int i2, int i3) {
        requireTerminatingRange(i, i2, i3);

        StopWatch stopWatch = new StopWatch();
        List<String> writtenPaths = new ArrayList<>();
        String fileName = FileHelper.getFileName(str);
        String[] documents = FileHelper.tryReadFileToString(str).split(str2);

        for (int documentCount = i; documentCount <= i2; documentCount += i3) {
            StringBuilder content = new StringBuilder();
            // Index 0 holds whatever precedes the first separator and is skipped on purpose.
            int lastIndex = Math.min(documentCount, documents.length - 1);
            for (int index = 1; index <= lastIndex; index++) {
                content.append(str2);
                content.append(documents[index]);
            }

            String targetPath = FileHelper.getFilePath(str) + fileName + "_sep_" + documentCount + ".txt";
            FileHelper.writeToFile(targetPath, content);
            writtenPaths.add(targetPath);
        }

        LOGGER.info("split file {} in {}", str, stopWatch.getElapsedTimeString());
        return writtenPaths;
    }

    /**
     * Distributes the documents of a file alternately over two output files.
     *
     * <p>The input is split at {@code str2}; the chunk before the first separator is dropped.
     * Chunks with an even index go to {@code <dir><fileName>_part1.txt}, chunks with an odd
     * index go to {@code <dir><fileName>_part2.txt}. Every document is written with the
     * separator prepended.
     *
     * @param str path of the file to split
     * @param str2 document separator; interpreted as a regular expression by
     *        {@link String#split(String)} but written back literally
     */
    public void splitFileByDocuments(String str, String str2) {
        StopWatch stopWatch = new StopWatch();
        String fileName = FileHelper.getFileName(str);
        String[] documents = FileHelper.tryReadFileToString(str).split(str2);
        StringBuilder evenDocuments = new StringBuilder();
        StringBuilder oddDocuments = new StringBuilder();

        // Start at 1: index 0 is the text in front of the first separator.
        for (int index = 1; index < documents.length; index++) {
            StringBuilder target = index % 2 == 0 ? evenDocuments : oddDocuments;
            target.append(str2);
            target.append(documents[index]);
        }

        FileHelper.writeToFile(FileHelper.getFilePath(str) + fileName + "_part1.txt", evenDocuments);
        FileHelper.writeToFile(FileHelper.getFilePath(str) + fileName + "_part2.txt", oddDocuments);
        LOGGER.info("split file {} in {}", str, stopWatch.getElapsedTimeString());
    }

    /**
     * Creates one tagged dataset per seed count.
     *
     * <p>The input file is first converted to column format ({@code columnInput.txt} in the same
     * directory). Then, for every seed count between {@code i} and {@code i2}, the seed
     * annotations are loaded from that column file, every document of the input (split at
     * {@code =-DOCSTART-}) is stripped of its HTML tags and each occurrence of a seed value is
     * wrapped in {@code <TAG>...</TAG>}. Documents that contain at least one such tag are written
     * to {@code newDataset<count>.txt}, which is finally converted to column format in place.
     *
     * <p>If the current seed count is exactly 1, the step is reduced by one (only when it is
     * greater than 1, so that it can never become zero). The reduction is permanent: with
     * {@code i = 1}, {@code i3 = 10} the seed counts are 1, 10, 19, 28, ...
     * NOTE(review): this looks like it was meant to produce 1, 10, 20, ... - confirm the
     * intended sequence before relying on it.
     *
     * @param str path of the XML-style tagged dataset
     * @param i smallest seed count
     * @param i2 largest seed count (inclusive)
     * @param i3 increment between two consecutive seed counts
     * @throws IllegalArgumentException if {@code i <= i2} and {@code i3} is not positive, because
     *         the iteration would never terminate
     */
    public void createSubsets(String str, int i, int i2, int i3) {
        requireTerminatingRange(i, i2, i3);

        StopWatch stopWatch = new StopWatch();
        String[] documents = FileHelper.tryReadFileToString(str).split(DOCUMENT_SEPARATOR);
        String columnPath = FileHelper.getFilePath(str) + "columnInput.txt";
        FileFormatParser.xmlToColumn(str, columnPath, "\t");

        int step = i3;
        for (int seedCount = i; seedCount <= i2; seedCount += step) {
            // Never let the adjustment drop the step to zero, otherwise the loop cannot advance.
            if (seedCount == 1 && step > 1) {
                step--;
            }

            Annotations<Annotation> seeds = FileFormatParser.getSeedAnnotations(columnPath, seedCount);
            StringBuilder dataset = new StringBuilder();
            for (String document : documents) {
                String text = HtmlHelper.stripHtmlTags(document);
                boolean tagged = false;
                for (Annotation seed : seeds) {
                    String tag = seed.getTag().toUpperCase();
                    String openingTag = "<" + tag + ">";
                    text = text.replace(seed.getValue(), openingTag + seed.getValue() + "</" + tag + ">");
                    // The check runs on the accumulated text, so a tag inserted for an earlier
                    // seed with the same tag name also counts.
                    if (text.contains(openingTag)) {
                        tagged = true;
                    }
                }
                if (tagged) {
                    dataset.append(DOCUMENT_SEPARATOR);
                    dataset.append(text);
                }
            }

            String subsetPath = FileHelper.getFilePath(str) + "/newDataset" + seedCount + ".txt";
            FileHelper.writeToFile(subsetPath, dataset);
            FileFormatParser.xmlToColumn(subsetPath, subsetPath, "\t");
            // The column conversion tears the document marker into three rows; glue it back
            // together. The search text is a plain literal, so no regular expression is needed.
            String repaired = FileHelper.tryReadFileToString(subsetPath)
                    .replace("=-\tO\nDOCSTART\tO\n-\tO", "=-DOCSTART-\tO");
            FileHelper.writeToFile(subsetPath, repaired);
        }

        LOGGER.info("processing the dataset took {}", stopWatch.getElapsedTimeString());
    }

    /**
     * Rejects ranges that would be iterated forever: a non-empty range with a step that does
     * not move forward.
     */
    private static void requireTerminatingRange(int start, int end, int step) {
        if (start <= end && step <= 0) {
            throw new IllegalArgumentException("step must be positive but was " + step);
        }
    }

    /**
     * Converts a hard-coded column file to XML format and creates subsets for 1 to 50 seeds
     * from the result.
     *
     * @param strArr ignored
     */
    public static void main(String[] strArr) {
        DatasetProcessor datasetProcessor = new DatasetProcessor();
        FileFormatParser.columnToXml("data/temp/autoGeneratedDataConll/seedsTest50.txt", "data/temp/autoGeneratedDataConll/seedsTest50_temp.txt", "\t");
        datasetProcessor.createSubsets("data/temp/autoGeneratedDataConll/seedsTest50_temp.txt", 1, 50, 10);
    }
}
