package ws.palladian.extraction.entity;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.Validate;
import ws.palladian.core.Annotation;
import ws.palladian.core.ImmutableAnnotation;
import ws.palladian.core.Instance;
import ws.palladian.extraction.token.Tokenizer;
import ws.palladian.helper.collection.Bag;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:ws/palladian/extraction/entity/FileFormatParser.class */
public final class FileFormatParser {

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:ws/palladian/extraction/entity/FileFormatParser$ColumnToXmlAction.class */
    /**
     * Line action that turns column-formatted lines ({@code token<separator>tag}) into inline XML
     * written to a {@link Writer}. Consecutive tokens carrying the same tag are wrapped in a single
     * element; the tag {@code o} (case-insensitive) marks tokens outside of any element.
     *
     * <p>An open element is closed when the tag changes or an empty line is seen. There is no
     * end-of-input hook in this class, so an element still open after the last line is not closed
     * by this action.
     */
    public static final class ColumnToXmlAction extends LineAction {
        /** Regular expression used to split a line into its columns. */
        private final String columnSeparator;
        /** Destination of the generated text. */
        private final Writer writer;
        /** Tag of the last token that was written; starts as "o" (no element open). */
        String currentTag = "o";
        /** True while nothing has been written since the last emitted line break. */
        boolean previousLineBreak = true;
        /** True until the first line has been handed to {@link #performAction(String, int)}. */
        boolean atBeginning = true;

        private ColumnToXmlAction(String str, Writer writer) {
            this.columnSeparator = str;
            this.writer = writer;
        }

        /**
         * Converts one input line.
         *
         * @param str the line content
         * @param i the line number supplied by the caller; values of 1 or less suppress separating
         *        spaces and closing tags
         * @throws IllegalStateException if writing to the underlying writer fails
         */
        public void performAction(String str, int i) {
            try {
                String[] columns = str.split(this.columnSeparator);
                boolean firstLine = this.atBeginning;
                this.atBeginning = false;
                if (firstLine && columns.length < 2) {
                    // A very first line without a tag column is dropped silently.
                    return;
                }
                if (str.contains("=-DOCSTART-")) {
                    return;
                }
                if (columns.length < 2) {
                    // Only completely empty lines produce output: they end the open element (if
                    // any) and emit a line break. Other malformed lines are ignored.
                    if (str.isEmpty()) {
                        if (i > 1 && !this.currentTag.equalsIgnoreCase("o")) {
                            writeClosingTag();
                            this.currentTag = "o";
                        }
                        this.writer.write("\n");
                        this.previousLineBreak = true;
                    }
                    return;
                }

                String token = columns[0];
                String tag = columns[1];
                boolean openedTag = false;
                if (!this.currentTag.equalsIgnoreCase(tag)) {
                    if (i > 1 && !this.currentTag.equalsIgnoreCase("o")) {
                        writeClosingTag();
                    }
                    if (!tag.equalsIgnoreCase("o")) {
                        if (i > 1 && !this.previousLineBreak) {
                            this.writer.write(" ");
                        }
                        this.writer.write("<");
                        this.writer.write(tag);
                        this.writer.write(">");
                        openedTag = true;
                    }
                }
                this.currentTag = tag;

                // Tokens starting with a letter, digit or bracket are separated from the preceding
                // token by a space, unless a tag was just opened or a new line was just started.
                if (token.length() > 0
                        && (Character.isLetterOrDigit(token.charAt(0)) || StringHelper.isBracket(token.charAt(0)))
                        && !openedTag
                        && i > 1
                        && !this.previousLineBreak) {
                    this.writer.write(" ");
                }
                this.writer.write(token);
                this.previousLineBreak = false;
            } catch (IOException e) {
                throw new IllegalStateException("Could not write", e);
            }
        }

        /** Writes the closing tag for {@link #currentTag}. */
        private void writeClosingTag() throws IOException {
            this.writer.write("</");
            this.writer.write(this.currentTag);
            this.writer.write(">");
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:ws/palladian/extraction/entity/FileFormatParser$XmlToColumnAction.class */
    /**
     * Line action that converts a line of inline-XML-tagged text into column format: one
     * {@code token<separator>tag} row per token, followed by an empty row after each input line.
     * Tokens outside of any element get the tag {@code O}.
     */
    public static final class XmlToColumnAction extends LineAction {
        /** Text written between the token and its tag. */
        private final String columnSeparator;
        /** Destination of the generated rows. */
        private final Writer writer;

        private XmlToColumnAction(String str, Writer writer) {
            this.columnSeparator = str;
            this.writer = writer;
        }

        /**
         * Tokenizes the line and writes one row per non-tag token.
         *
         * @param str the line content
         * @param i the line number (not used by this action)
         * @throws IllegalStateException if writing to the underlying writer fails
         */
        public void performAction(String str, int i) {
            try {
                String activeTag = "O";
                for (String token : Tokenizer.tokenize(str)) {
                    if (token.startsWith("</")) {
                        // A closing tag ends the current element.
                        activeTag = "O";
                        continue;
                    }
                    if (token.startsWith("<")) {
                        // An opening tag: remember its name for the following tokens.
                        activeTag = StringHelper.getSubstringBetween(token, "<", ">");
                        continue;
                    }
                    this.writer.write(token);
                    this.writer.write(this.columnSeparator);
                    this.writer.write(activeTag);
                    this.writer.write('\n');
                }
                // Empty row marking the end of the input line.
                this.writer.write('\n');
            } catch (IOException e) {
                throw new IllegalStateException("Could not write", e);
            }
        }
    }

    /** Not instantiable: this final class only offers static conversion methods. */
    private FileFormatParser() {
    }

    /**
     * Collects the distinct tags used in a two-column file.
     *
     * <p>Empty lines and lines that do not split into exactly two columns are ignored, as is the
     * tag {@code O} (compared case-insensitively).
     *
     * @param str path of the column file; must not be empty
     * @param str2 regular expression separating the columns; must not be empty
     * @return the set of tags found in the second column
     */
    public static Set<String> getTagsFromColumnFile(String str, final String str2) {
        Validate.notEmpty(str, "trainingFilePath must not be empty", new Object[0]);
        Validate.notEmpty(str2, "separator must not be empty", new Object[0]);
        // Properly parameterized instead of a raw HashSet, so no unchecked conversion on return.
        final Set<String> tags = new HashSet<>();
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            public void performAction(String str3, int i) {
                if (str3.isEmpty()) {
                    return;
                }
                String[] columns = str3.split(str2);
                if (columns.length != 2) {
                    return;
                }
                String tag = columns[1];
                if (!"O".equalsIgnoreCase(tag)) {
                    tags.add(tag);
                }
            }
        });
        return tags;
    }

    /**
     * Reads a file and replaces every opening or closing tag ({@code <...>} / {@code </...>}) with
     * {@code Instance.NO_CATEGORY_DUMMY} (presumably the empty string, which would strip the tags;
     * verify against that constant).
     *
     * @param str path of the tagged file
     * @return the file content after the tag replacement
     */
    private static String getTextFromXML(String str) {
        String taggedText = FileHelper.tryReadFileToString(str);
        return taggedText.replaceAll("</?[^>]+>", Instance.NO_CATEGORY_DUMMY);
    }

    /**
     * Returns the text of a tagged file with the tagging removed.
     *
     * <p>XML files are handled directly. Column files (tab-separated) are first converted to XML in
     * a temporary file, which is removed again before this method returns.
     *
     * @param str path of the tagged file
     * @param taggingFormat format of the file; only {@code XML} and {@code COLUMN} are supported
     * @return the text as produced by the tag replacement of {@code getTextFromXML}
     * @throws IllegalArgumentException if the format is neither {@code XML} nor {@code COLUMN}
     */
    public static String getText(String str, TaggingFormat taggingFormat) {
        if (taggingFormat.equals(TaggingFormat.XML)) {
            return getTextFromXML(str);
        }
        if (!taggingFormat.equals(TaggingFormat.COLUMN)) {
            throw new IllegalArgumentException("Unsupported format: " + taggingFormat);
        }
        String tempFile = getTempFile();
        try {
            columnToXml(str, tempFile, "\t");
            return getTextFromXML(tempFile);
        } finally {
            // The intermediate file is private to this call; do not leave it behind in the temp
            // directory. Fall back to deletion at JVM exit if it cannot be removed right now.
            File temp = new File(tempFile);
            if (temp.exists() && !temp.delete()) {
                temp.deleteOnExit();
            }
        }
    }

    /**
     * Builds a fresh path of the form {@code text_<random UUID>.txt} below the directory reported
     * by {@code FileHelper.getTempDir()}. The file itself is not created here.
     *
     * @return the path of the temporary file
     */
    private static String getTempFile() {
        String fileName = "text_" + UUID.randomUUID() + ".txt";
        File tempFile = new File(FileHelper.getTempDir(), fileName);
        return tempFile.getPath();
    }

    /**
     * Converts a column-formatted file into inline XML, written as UTF-8.
     *
     * <p>The output file is opened (and thereby truncated) before the input is read, so
     * {@code str} and {@code str2} must not refer to the same file.
     *
     * @param str path of the column-formatted input file
     * @param str2 path of the XML output file
     * @param str3 regular expression separating the columns
     * @throws IllegalStateException if the output cannot be written
     */
    public static void columnToXml(String str, String str2, String str3) {
        BufferedWriter xmlWriter = null;
        try {
            xmlWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(str2), "UTF-8"));
            ColumnToXmlAction action = new ColumnToXmlAction(str3, xmlWriter);
            FileHelper.performActionOnEveryLine(str, action);
            // The action only closes an element on a tag change or an empty line. If the input ends
            // while an element is still open, close it here so the XML stays balanced.
            if (!action.currentTag.equalsIgnoreCase("o")) {
                xmlWriter.write("</");
                xmlWriter.write(action.currentTag);
                xmlWriter.write(">");
            }
        } catch (IOException e) {
            throw new IllegalStateException("Could not write", e);
        } finally {
            FileHelper.close(new Closeable[]{xmlWriter});
        }
    }

    /**
     * Converts a column-formatted file into XML in which every single token is wrapped in its own
     * element, including tokens whose tag is {@code O}.
     *
     * <p>Lines containing {@code =-DOCSTART-} are skipped; empty lines become line breaks; other
     * lines with fewer than two columns are ignored.
     *
     * @param str path of the column-formatted input file
     * @param str2 path of the output file
     * @param str3 regular expression separating the columns
     */
    public static void columnToXmlTokenBased(String str, String str2, final String str3) {
        final StringBuilder xml = new StringBuilder();
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            /** True while nothing has been appended since the last line break. */
            boolean previousLineBreak = true;

            public void performAction(String str4, int i) {
                if (str4.contains("=-DOCSTART-")) {
                    return;
                }
                String[] columns = str4.split(str3);
                if (columns.length < 2) {
                    if (str4.isEmpty()) {
                        xml.append("\n");
                        this.previousLineBreak = true;
                    }
                    return;
                }
                String token = columns[0];
                String tag = columns[1];
                // Separate from the previous token with a space when this token starts with a
                // letter, digit or bracket and is not the first one on its line.
                if (token.length() > 0
                        && (Character.isLetterOrDigit(token.charAt(0)) || StringHelper.isBracket(token.charAt(0)))
                        && i > 1
                        && !this.previousLineBreak) {
                    xml.append(" ");
                }
                xml.append("<").append(tag).append(">");
                xml.append(token);
                xml.append("</").append(tag).append(">");
                this.previousLineBreak = false;
            }
        });
        FileHelper.writeToFile(str2, xml);
    }

    /**
     * Converts a column-formatted file into bracket notation, e.g. {@code [TAG token token ]}.
     * Consecutive tokens with the same tag share one bracket; tokens tagged {@code o}
     * (case-insensitive) are written without brackets. Lines with fewer than two columns are
     * ignored.
     *
     * @param str path of the column-formatted input file
     * @param str2 path of the output file
     * @param str3 regular expression separating the columns
     */
    public static void columnToBracket(String str, String str2, final String str3) {
        final StringBuilder sb = new StringBuilder();
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            /** Tag of the previously processed token. */
            String currentTag = Instance.NO_CATEGORY_DUMMY;

            public void performAction(String str4, int i) {
                String[] split = str4.split(str3);
                if (split.length < 2) {
                    return;
                }
                String token = split[0];
                String tag = split[1];
                boolean openedBracket = false;
                if (!this.currentTag.equalsIgnoreCase(tag)) {
                    if (!this.currentTag.equalsIgnoreCase("o") && i > 1) {
                        sb.append(" ]");
                    }
                    if (!tag.equalsIgnoreCase("o")) {
                        if (i > 1) {
                            sb.append(" ");
                        }
                        sb.append("[").append(tag).append(" ");
                        openedBracket = true;
                    }
                }
                this.currentTag = tag;
                // Guard against an empty first column (e.g. a line starting with the separator):
                // charAt(0) on an empty token would throw StringIndexOutOfBoundsException.
                if (!token.isEmpty() && Character.isLetterOrDigit(token.charAt(0)) && !openedBracket) {
                    sb.append(" ");
                }
                sb.append(token);
            }
        });
        FileHelper.writeToFile(str2, sb.toString());
    }

    /**
     * Rewrites a column-formatted file so that the tag in the last column carries a BIO prefix:
     * {@code B-} for a tag that differs from the previous row's tag, {@code I-} for a repeated
     * tag, and plain {@code O} for rows tagged {@code o} (case-insensitive).
     *
     * <p>All columns before the last one are kept and re-joined using {@code str3} as literal
     * text. Rows with fewer than two columns are dropped.
     *
     * @param str path of the column-formatted input file
     * @param str2 path of the output file
     * @param str3 regular expression separating the columns; also written as the separator
     */
    public static void columnToColumnBio(String str, String str2, final String str3) {
        final StringBuilder output = new StringBuilder();
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            /** Tag of the previously processed row. */
            String lastTag = Instance.NO_CATEGORY_DUMMY;

            public void performAction(String str4, int i) {
                String[] columns = str4.split(str3);
                if (columns.length < 2) {
                    return;
                }
                int tagIndex = columns.length - 1;
                String tag = columns[tagIndex];

                // Everything before the tag column is carried over unchanged.
                StringBuilder leading = new StringBuilder().append(Instance.NO_CATEGORY_DUMMY);
                for (int column = 0; column < tagIndex; column++) {
                    if (column > 0) {
                        leading.append(str3);
                    }
                    leading.append(columns[column]);
                }

                String bioTag;
                if (tag.equalsIgnoreCase("o")) {
                    bioTag = "O";
                } else if (this.lastTag.equalsIgnoreCase(tag)) {
                    bioTag = "I-" + tag;
                } else {
                    bioTag = "B-" + tag;
                }
                this.lastTag = tag;
                output.append(leading.toString()).append(str3).append(bioTag).append("\n");
            }
        });
        FileHelper.writeToFile(str2, output);
    }

    /**
     * Rewrites a BIO-tagged column file without the BIO prefixes: a leading {@code B-} or
     * {@code I-} of the second column is replaced with {@code Instance.NO_CATEGORY_DUMMY}
     * (presumably the empty string; verify against that constant). Only the first two columns of
     * each row are written; rows with fewer than two columns are dropped.
     *
     * @param str path of the BIO-tagged input file
     * @param str2 path of the output file
     * @param str3 regular expression separating the columns; also written as the separator
     */
    public static void columnBioToColumn(String str, String str2, final String str3) {
        final StringBuilder sb = new StringBuilder();
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            public void performAction(String str4, int i) {
                String[] split = str4.split(str3);
                if (split.length < 2) {
                    return;
                }
                // Anchor the match to the start of the tag. An unanchored replaceFirst("B-") would
                // also hit the inside of hyphenated tags, e.g. "I-JOB-TITLE" became "JOTITLE".
                String tag = split[1].replaceFirst("^[BI]-", Instance.NO_CATEGORY_DUMMY);
                sb.append(split[0]).append(str3).append(tag).append("\n");
            }
        });
        FileHelper.writeToFile(str2, sb.toString());
    }

    /**
     * Converts a file with inline XML tagging into column format, written as UTF-8.
     *
     * <p>The output file is opened before the input is read, so input and output paths must
     * differ.
     *
     * @param str path of the XML-tagged input file; must not be empty
     * @param str2 path of the column-formatted output file; must not be empty
     * @param str3 text written between token and tag; must not be empty
     * @throws IllegalStateException if the output cannot be written
     */
    public static void xmlToColumn(String str, String str2, String str3) {
        Validate.notEmpty(str, "inputFilePath must not be empty", new Object[0]);
        Validate.notEmpty(str2, "outputFilePath must not be empty", new Object[0]);
        Validate.notEmpty(str3, "columnSeparator must not be empty", new Object[0]);
        BufferedWriter columnWriter = null;
        try {
            columnWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(str2), "UTF-8"));
            FileHelper.performActionOnEveryLine(str, new XmlToColumnAction(str3, columnWriter));
        } catch (IOException e) {
            throw new IllegalStateException("Could not write", e);
        } finally {
            // Runs on both the normal and the exceptional path.
            FileHelper.close(new Closeable[]{columnWriter});
        }
    }

    /**
     * Converts XML-tagged text (rather than a file) into column format.
     *
     * @param str the XML-tagged text; must not be null
     * @param str2 text written between token and tag; must not be empty
     * @return the column-formatted text, one token per row with an empty row after each input line
     */
    public static String xmlToColumnText(String str, String str2) {
        Validate.notNull(str, "xmlText must not be null", new Object[0]);
        Validate.notEmpty(str2, "columnSeparator must not be empty", new Object[0]);
        StringWriter columnText = new StringWriter();
        XmlToColumnAction converter = new XmlToColumnAction(str2, columnText);
        int lineIndex = 0; // lines are numbered from zero here
        for (String line : str.split("\n")) {
            converter.performAction(line, lineIndex++);
        }
        return columnText.toString();
    }

    /**
     * Converts a slash-tagged file ({@code token/TAG}) into inline XML by going through a
     * tab-separated column file.
     *
     * <p>The intermediate column file is written to a temporary location. Using the output path
     * for it would not work: {@code columnToXml} opens (and truncates) its output file before
     * reading its input, so converting a file onto itself loses the data.
     *
     * @param str path of the slash-tagged input file
     * @param str2 path of the XML output file
     */
    public static void slashToXml(String str, String str2) {
        String columnFile = getTempFile();
        try {
            slashToColumn(str, columnFile, "\t");
            columnToXml(columnFile, str2, "\t");
        } finally {
            // Best-effort cleanup of the intermediate file.
            File temp = new File(columnFile);
            if (temp.exists() && !temp.delete()) {
                temp.deleteOnExit();
            }
        }
    }

    /**
     * Converts a slash-tagged file into column format. Every match of
     * {@code text/TAG<whitespace>} becomes one {@code text<separator>TAG} row, where TAG consists
     * of 1 to 100 upper-case letters, digits or underscores. A tag must be followed by a
     * whitespace character to be recognized.
     *
     * @param str path of the slash-tagged input file
     * @param str2 path of the column-formatted output file
     * @param str3 text written between token and tag
     */
    public static void slashToColumn(String str, String str2, String str3) {
        Pattern slashPattern = Pattern.compile("(.+?)/([A-Z0-9_]{1,100}?)\\s", Pattern.DOTALL);
        String slashText = FileHelper.tryReadFileToString(str);
        StringBuilder columns = new StringBuilder();
        for (Matcher m = slashPattern.matcher(slashText); m.find();) {
            columns.append(m.group(1)).append(str3).append(m.group(2)).append("\n");
        }
        FileHelper.writeToFile(str2, columns);
    }

    /**
     * Converts a column-formatted file into {@code token|tag} notation, using {@code |} as the
     * delimiter between token and tag.
     *
     * @param str path of the column-formatted input file
     * @param str2 path of the output file
     * @param str3 regular expression separating the columns
     */
    public static void columnToSlash(String str, String str2, String str3) {
        final String defaultDelimiter = "|";
        columnToSlash(str, str2, str3, defaultDelimiter);
    }

    /**
     * Converts a column-formatted file into a single line of {@code token<delimiter>tag} pairs,
     * each followed by a space. Rows with fewer than two columns are skipped; only the first two
     * columns are used.
     *
     * @param str path of the column-formatted input file
     * @param str2 path of the output file
     * @param str3 regular expression separating the columns
     * @param str4 delimiter written between token and tag
     */
    public static void columnToSlash(String str, String str2, final String str3, final String str4) {
        final StringBuilder pairs = new StringBuilder();
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            public void performAction(String str5, int i) {
                String[] columns = str5.split(str3);
                if (columns.length >= 2) {
                    pairs.append(columns[0]);
                    pairs.append(str4);
                    pairs.append(columns[1]);
                    pairs.append(" ");
                }
            }
        });
        FileHelper.writeToFile(str2, pairs);
    }

    /**
     * Converts a bracket-tagged file into inline XML.
     *
     * @param str path of the bracket-tagged input file
     * @param str2 path of the XML output file
     */
    public static void bracketToXml(String str, String str2) {
        String bracketText = FileHelper.tryReadFileToString(str);
        String xmlText = bracketToXmlText(bracketText);
        FileHelper.writeToFile(str2, xmlText);
    }

    /**
     * Replaces bracket annotations of the form {@code [TAG content]} with
     * {@code <TAG>content</TAG>}. Tag and content are obtained with
     * {@code StringHelper.getSubstringBetween} from the matched bracket expression and trimmed.
     *
     * <p>Each matched expression is replaced via {@link String#replace}, i.e. in all places where
     * the identical text occurs.
     *
     * @param str the bracket-tagged text
     * @return the text with bracket annotations rewritten as XML elements
     */
    public static String bracketToXmlText(String str) {
        Pattern bracketPattern = Pattern.compile("\\[(\\w+)\\s([^]]+?)(\\s([^]]+?))*?\\s{0,2}\\]",
                Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = bracketPattern.matcher(str);
        String result = str;
        while (matcher.find()) {
            String bracketed = matcher.group(0);
            String tag = StringHelper.getSubstringBetween(bracketed, "[", " ").trim();
            String content = StringHelper.getSubstringBetween(bracketed, " ", "]").trim();
            String element = "<" + tag + ">" + content + "</" + tag + ">";
            result = result.replace(bracketed, element);
        }
        return result;
    }

    /**
     * Converts a bracket-tagged file into column format by going through inline XML.
     *
     * <p>The intermediate XML is written to a temporary file. Using the output path for it would
     * not work: {@code xmlToColumn} opens (and truncates) its output file before reading its
     * input, so converting a file onto itself loses the data.
     *
     * @param str path of the bracket-tagged input file
     * @param str2 path of the column-formatted output file
     * @param str3 text written between token and tag
     */
    public static void bracketToColumn(String str, String str2, String str3) {
        String xmlFile = getTempFile();
        try {
            bracketToXml(str, xmlFile);
            xmlToColumn(xmlFile, str2, str3);
        } finally {
            // Best-effort cleanup of the intermediate file.
            File temp = new File(xmlFile);
            if (temp.exists() && !temp.delete()) {
                temp.deleteOnExit();
            }
        }
    }

    /**
     * Copies a file while replacing every match of the regular expression {@code str3} with
     * {@code str3 + str3}. The doubled value is used as a regex replacement string by
     * {@link String#replaceAll}.
     *
     * @param str path of the input file
     * @param str2 path of the output file
     * @param str3 the column separator (interpreted as a regular expression)
     */
    public static void columnTrainingToTest(String str, String str2, String str3) {
        String trainingText = FileHelper.tryReadFileToString(str);
        String doubledSeparator = str3 + str3;
        FileHelper.writeToFile(str2, trainingText.replaceAll(str3, doubledSeparator));
    }

    /**
     * Copies a file while replacing every space character with {@code str3}. The replacement is
     * applied to the whole file content, not to a particular column.
     *
     * @param str path of the input file
     * @param str2 path of the output file
     * @param str3 text that takes the place of each space
     */
    public static void removeWhiteSpaceInFirstColumn(String str, String str2, String str3) {
        String original = FileHelper.tryReadFileToString(str);
        String withoutSpaces = original.replace(" ", str3);
        FileHelper.writeToFile(str2, withoutSpaces);
    }

    /**
     * Copies a file while replacing every tab character with a single space.
     *
     * @param str path of the tab-separated input file
     * @param str2 path of the space-separated output file
     */
    public static void tsvToSsv(String str, String str2) {
        String tabSeparated = FileHelper.tryReadFileToString(str);
        String spaceSeparated = tabSeparated.replaceAll("\\t", " ");
        FileHelper.writeToFile(str2, spaceSeparated);
    }

    /**
     * Tokenizes a plain-text file and writes one row per token, each with the placeholder tag
     * {@code X} in the second column.
     *
     * @param str path of the plain-text input file
     * @param str2 path of the column-formatted output file
     * @param str3 text written between token and tag
     */
    public static void textToColumn(String str, String str2, String str3) {
        StringBuilder rows = new StringBuilder();
        for (String token : Tokenizer.tokenize(FileHelper.tryReadFileToString(str))) {
            rows.append(token);
            rows.append(str3);
            rows.append("X");
            rows.append("\n");
        }
        FileHelper.writeToFile(str2, rows);
    }

    /**
     * Reads the annotations of a tagged file, dispatching on its format.
     *
     * @param str path of the tagged file
     * @param taggingFormat format of the file: {@code XML}, {@code COLUMN} or {@code BRACKETS}
     * @return the annotations found in the file
     * @throws IllegalArgumentException if the format is none of the supported ones
     */
    public static Annotations<Annotation> getAnnotations(String str, TaggingFormat taggingFormat) {
        final Annotations<Annotation> annotations;
        if (taggingFormat.equals(TaggingFormat.XML)) {
            annotations = getAnnotationsFromXmlFile(str);
        } else if (taggingFormat.equals(TaggingFormat.COLUMN)) {
            annotations = getAnnotationsFromColumn(str);
        } else if (taggingFormat.equals(TaggingFormat.BRACKETS)) {
            annotations = getAnnotationsFromBrackets(str);
        } else {
            throw new IllegalArgumentException("Format " + taggingFormat + " not supported.");
        }
        return annotations;
    }

    /**
     * Reads the annotations of a bracket-tagged file by converting it to XML in a temporary file,
     * which is removed again before this method returns.
     *
     * @param str path of the bracket-tagged file
     * @return the annotations found in the file
     */
    private static Annotations<Annotation> getAnnotationsFromBrackets(String str) {
        String tempFile = getTempFile();
        try {
            bracketToXml(str, tempFile);
            return getAnnotationsFromXmlFile(tempFile);
        } finally {
            // The intermediate file is private to this call; do not leave it in the temp directory.
            File temp = new File(tempFile);
            if (temp.exists() && !temp.delete()) {
                temp.deleteOnExit();
            }
        }
    }

    /**
     * Reads the annotations of a tab-separated column file by converting it to XML in a temporary
     * file, which is removed again before this method returns.
     *
     * @param str path of the column-formatted file
     * @return the annotations found in the file
     */
    public static Annotations<Annotation> getAnnotationsFromColumn(String str) {
        String tempFile = getTempFile();
        try {
            columnToXml(str, tempFile, "\t");
            return getAnnotationsFromXmlFile(tempFile);
        } finally {
            // The intermediate file is private to this call; do not leave it in the temp directory.
            File temp = new File(tempFile);
            if (temp.exists() && !temp.delete()) {
                temp.deleteOnExit();
            }
        }
    }

    /**
     * Reads one annotation per token from a tab-separated column file. The file is converted to
     * token-based XML in a temporary file, which is removed again before this method returns.
     *
     * @param str path of the column-formatted file
     * @return the annotations found in the file
     */
    public static Annotations<Annotation> getAnnotationsFromColumnTokenBased(String str) {
        String tempFile = getTempFile();
        try {
            columnToXmlTokenBased(str, tempFile, "\t");
            return getAnnotationsFromXmlFile(tempFile);
        } finally {
            // The intermediate file is private to this call; do not leave it in the temp directory.
            File temp = new File(tempFile);
            if (temp.exists() && !temp.delete()) {
                temp.deleteOnExit();
            }
        }
    }

    /**
     * Extracts annotations from text with inline XML tagging. Elements are matched
     * case-insensitively as {@code <TAG>content</TAG>}, where TAG consists of letters only and the
     * content is 1 to 1000 characters long (line breaks included).
     *
     * <p>Each annotation's start position is the match position minus the number of tag characters
     * counted for the previously matched elements, i.e. it is relative to the text without those
     * tags. The value is the content after {@code HtmlHelper.stripHtmlTags}, with line breaks
     * replaced by {@code Instance.NO_CATEGORY_DUMMY}.
     *
     * @param str the XML-tagged text
     * @return the annotations in order of appearance
     */
    public static Annotations<Annotation> getAnnotationsFromXmlText(String str) {
        Annotations<Annotation> result = new Annotations<>();
        Pattern elementPattern = Pattern.compile("\\<([A-Z]+)\\>(.{1,1000}?)\\</\\1\\>",
                Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = elementPattern.matcher(str);
        // Number of markup characters that precede the current match and must be subtracted.
        int removedTagChars = 0;
        while (matcher.find()) {
            String tagName = matcher.group(1);
            String content = matcher.group(2);
            int nestedTagLength = HtmlHelper.countTagLength(content);
            String value = HtmlHelper.stripHtmlTags(content).replaceAll("\n", Instance.NO_CATEGORY_DUMMY);
            int openTagLength = tagName.length() + 2;  // "<" + name + ">"
            int closeTagLength = tagName.length() + 3; // "</" + name + ">"
            result.add(new ImmutableAnnotation(matcher.start() - removedTagChars, value, tagName));
            removedTagChars += openTagLength + nestedTagLength + closeTagLength;
        }
        return result;
    }

    /**
     * Reads a file with inline XML tagging and extracts its annotations.
     *
     * @param str path of the XML-tagged file
     * @return the annotations found in the file
     */
    public static Annotations<Annotation> getAnnotationsFromXmlFile(String str) {
        String xmlText = FileHelper.tryReadFileToString(str);
        return getAnnotationsFromXmlText(xmlText);
    }

    /**
     * Selects seed annotations from a tab-separated column file: annotations are taken in file
     * order, skipping values that were already selected, until {@code i} annotations have been
     * collected for a tag.
     *
     * @param str path of the column-formatted file
     * @param i maximum number of seeds per tag, or {@code -1} for no limit
     * @return the selected annotations, each with a distinct value
     */
    public static Annotations<Annotation> getSeedAnnotations(String str, int i) {
        Annotations<Annotation> seeds = new Annotations<>();
        // Typed collections and a for-each loop replace the raw Bag/HashSet and the
        // Iterator<T> declaration, which referenced an undeclared type variable.
        Bag<String> seedsPerTag = new Bag<>();
        Set<String> seenValues = new HashSet<>();
        for (Annotation annotation : getAnnotationsFromColumn(str)) {
            String tag = annotation.getTag();
            boolean quotaLeft = i == -1 || seedsPerTag.count(tag) < i;
            // Set.add returns false for a value that was selected before.
            if (quotaLeft && seenValues.add(annotation.getValue())) {
                seeds.add(annotation);
                seedsPerTag.add(tag);
            }
        }
        return seeds;
    }

    /**
     * Ad-hoc demo entry point: prints the annotations extracted from a sample string and then
     * exits. The conversion calls after {@code System.exit(0)} are never executed; they remain as
     * usage examples.
     *
     * @param strArr command line arguments (unused)
     */
    public static void main(String[] strArr) {
        String sample = "asdfasdf <CITY role=\"main\">Dresden</CITY> asdfasdf asdf asdf <C>Berlin</C> asdfk <CITY>Berlin</CITY>";
        Annotations<Annotation> sampleAnnotations = getAnnotationsFromXmlText(sample);
        CollectionHelper.print(sampleAnnotations);
        System.exit(0);

        // Not reached: column <-> XML round trips.
        columnToXml("data/temp/columnFormat.tsv", "data/temp/xmlFormat.xml", "\\t");
        xmlToColumn("data/temp/xmlFormat.xml", "data/temp/columnFormat2.tsv", "\\t");
        xmlToColumn("data/temp/allTagged.xml", "data/temp/allTaggedColumn.tsv", "\\t");
        xmlToColumn("data/datasets/ner/mobilephone/text/all.xml", "data/datasets/ner/mobilephone/text/allColumn.tsv", "\t");

        // Not reached: conversions starting from the column file.
        columnTrainingToTest("data/temp/allColumn.tsv", "data/temp/allColumnTest.tsv", "\t");
        columnToColumnBio("data/temp/allColumn.tsv", "data/temp/allColumnBIO.tsv", "\t");
        columnToBracket("data/temp/allColumn.tsv", "data/temp/allBracket.tsv", "\t");
        bracketToXml("data/temp/allBracket.tsv", "data/temp/allXMLFromBracket.tsv");
        bracketToColumn("data/temp/allBracket.tsv", "data/temp/allColumnFromBracket.tsv", "\t");
        columnToXml("data/temp/allColumn.tsv", "data/temp/allXML.xml", "\t");
        xmlToColumn("data/temp/allXML.xml", "data/temp/allColumnFromXML.tsv", "\t");

        // Not reached: slash notation.
        slashToXml("data/temp/slashedText.txt", "data/temp/xmlFromSlashed.xml");
        slashToColumn("data/temp/slashedText.txt", "data/temp/columnFromSlashed.tsv", "\t");
        CollectionHelper.print(getAnnotationsFromXmlFile("data/temp/xmlFromSlashed.xml"));
    }
}
