package pl.edu.icm.cermine.metadata.affiliation.tools;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.xalan.templates.Constants;
import org.jdom.JDOMException;
import org.xml.sax.InputSource;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.metadata.model.AffiliationLabel;
import pl.edu.icm.cermine.metadata.model.DocumentAffiliation;
import pl.edu.icm.cermine.parsing.model.Token;
import pl.edu.icm.cermine.parsing.tools.GrmmUtils;
import pl.edu.icm.cermine.tools.TextUtils;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.8.jar:pl/edu/icm/cermine/metadata/affiliation/tools/AffiliationTrainingDataExporter.class */
/**
 * Command-line tool that reads affiliations (from an XML file or from a text
 * file with one affiliation per line), calculates their token features and
 * writes them in GRMM input format.
 *
 * <p>Alongside the feature file it either writes the list of common words
 * derived from the input, or (with {@code -load_words}) reads that list from
 * an existing file instead.</p>
 */
public class AffiliationTrainingDataExporter {
    private static final AffiliationTokenizer TOKENIZER = new AffiliationTokenizer();

    private static final String DEFAULT_INPUT = "affiliations/javatests/affs-real-like.xml";
    private static final String DEFAULT_OUTPUT = "affiliations/javatests/features-actual-xml.txt";
    private static final String DEFAULT_WORDS = "affiliations/javatests/words-actual-xml.txt";
    private static final int DEFAULT_NEIGHBOR_THRESHOLD = 1;
    private static final int DEFAULT_RARE_THRESHOLD = 25;
    private static final String DEFAULT_INPUT_TYPE = "xml";

    private static final String INPUT_TYPE_XML = "xml";
    private static final String INPUT_TYPE_TXT = "txt";

    /** Encoding used for every text file this tool reads or writes. */
    private static final String ENCODING = "UTF-8";

    private static final String OPTION_INPUT = "input";
    // NOTE(review): this Xalan constant looks like a decompiler substitution for a
    // plain string literal (presumably "output") - confirm and replace with the literal.
    private static final String OPTION_OUTPUT = Constants.ELEMNAME_OUTPUT_STRING;
    private static final String OPTION_COMMON_WORDS = "common_words";
    private static final String OPTION_NEIGHBOR = "neighbor";
    private static final String OPTION_RARE = "rare";
    private static final String OPTION_INPUT_TYPE = "input_type";
    private static final String OPTION_ADD_MOCK_TEXT = "add_mock_text";
    private static final String OPTION_LOAD_WORDS = "load_words";

    /**
     * Writes one affiliation in GRMM input format, followed by an empty line.
     *
     * @param affiliation affiliation whose tokens are exported
     * @param writer destination of the GRMM text
     * @param neighborThreshold value passed through to {@link GrmmUtils#toGrmmInput}
     */
    private static void writeAffiliation(DocumentAffiliation affiliation, PrintWriter writer,
            int neighborThreshold) {
        writer.write(GrmmUtils.toGrmmInput(affiliation.getTokens(), neighborThreshold));
        writer.write("\n");
    }

    /**
     * Writes the given words in natural (sorted) order, one per line.
     *
     * @param words words to write; the list itself is not modified
     * @param writer destination of the word list
     */
    private static void writeCommonWords(List<String> words, PrintWriter writer) {
        // Sort a copy so the caller's list is left untouched.
        List<String> sorted = new ArrayList<String>(words);
        Collections.sort(sorted);
        for (String word : sorted) {
            writer.write(word + '\n');
        }
    }

    /**
     * Appends a fixed mock record to the GRMM output.
     *
     * @param writer destination of the GRMM text
     */
    private static void addMockAffiliation(PrintWriter writer) {
        writer.write("TEXT ---- text\n\n");
    }

    /**
     * Collects the words that occur more often than a threshold across all
     * tokens of the given affiliations.
     *
     * @param affiliations tokenized affiliations to scan
     * @param rareThreshold a word is returned only if its number of occurrences
     *        is strictly greater than this value
     * @return the qualifying words, in no particular order
     */
    private static List<String> getCommonWordsFromAffs(List<DocumentAffiliation> affiliations,
            int rareThreshold) {
        Map<String, Integer> occurrences = new HashMap<String, Integer>();
        for (DocumentAffiliation affiliation : affiliations) {
            for (Token<AffiliationLabel> token : affiliation.getTokens()) {
                String text = token.getText();
                if (TextUtils.isWord(text)) {
                    Integer seen = occurrences.get(text);
                    occurrences.put(text, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
                }
            }
        }

        List<String> commonWords = new ArrayList<String>();
        for (Map.Entry<String, Integer> entry : occurrences.entrySet()) {
            if (entry.getValue().intValue() > rareThreshold) {
                commonWords.add(entry.getKey());
            }
        }
        return commonWords;
    }

    /**
     * Reads a word list, one word per line, until the end of the stream.
     *
     * @param reader source of the word list
     * @return all lines read, in file order
     * @throws IOException if reading fails
     */
    private static List<String> loadCommonWords(BufferedReader reader) throws IOException {
        List<String> words = new ArrayList<String>();
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            words.add(line);
        }
        return words;
    }

    /**
     * Reads affiliations from plain text, one affiliation per line, and
     * tokenizes each of them.
     *
     * @param reader source of the raw affiliation strings
     * @return the tokenized affiliations, in file order
     * @throws IOException if reading fails
     */
    private static List<DocumentAffiliation> loadAffiliationsFromTxt(BufferedReader reader)
            throws IOException {
        List<DocumentAffiliation> affiliations = new ArrayList<DocumentAffiliation>();
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            DocumentAffiliation affiliation = new DocumentAffiliation(line);
            affiliation.setTokens(TOKENIZER.tokenize(affiliation.getRawText()));
            affiliations.add(affiliation);
        }
        return affiliations;
    }

    /** Declares the command-line options understood by {@link #main}. */
    private static Options buildOptions() {
        Options options = new Options();
        options.addOption(OPTION_INPUT, true, "input file (raw strings)");
        options.addOption(OPTION_OUTPUT, true, "output file (GRMM format)");
        options.addOption(OPTION_COMMON_WORDS, true, "file with common (not-rare) words to generate");
        options.addOption(OPTION_NEIGHBOR, true, "neighbor influence threshold");
        options.addOption(OPTION_RARE, true, "rare threshold");
        options.addOption(OPTION_INPUT_TYPE, true, "xml or txt");
        options.addOption(OPTION_ADD_MOCK_TEXT, false, "should add TEXT");
        options.addOption(OPTION_LOAD_WORDS, false, "read common words from file instead of writing them");
        return options;
    }

    /** Returns {@code value}, or {@code fallback} when the option was not given. */
    private static String valueOrDefault(String value, String fallback) {
        return value != null ? value : fallback;
    }

    /** Parses {@code value} as an int, or returns {@code fallback} when the option was not given. */
    private static int intOrDefault(String value, int fallback) {
        return value != null ? Integer.parseInt(value) : fallback;
    }

    /** Opens a buffered reader that decodes the file as UTF-8, matching what this tool writes. */
    private static BufferedReader openReader(File file) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(file), ENCODING));
    }

    /**
     * Closes every non-null resource, attempting all of them even if one fails.
     *
     * @throws RuntimeException wrapping the first {@link IOException} raised while closing
     */
    private static void closeAll(Closeable... resources) {
        IOException firstFailure = null;
        for (Closeable resource : resources) {
            if (resource == null) {
                continue;
            }
            try {
                resource.close();
            } catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        if (firstFailure != null) {
            throw new RuntimeException("Can't close resources!", firstFailure);
        }
    }

    /**
     * Entry point. Recognized options: {@code -input}, the output-file option,
     * {@code -common_words}, {@code -neighbor}, {@code -rare}, {@code -input_type}
     * ({@code xml} or {@code txt}), {@code -add_mock_text} and {@code -load_words}.
     * Missing options fall back to the defaults declared in this class.
     *
     * <p>I/O failures (for example a missing input file) are reported on standard
     * error and do not propagate, so the exception contract of this method is
     * unchanged. Output files are opened only after the input has been read
     * successfully, so a failed run does not truncate existing output.</p>
     *
     * @param strArr command-line arguments
     * @throws AnalysisException propagated from affiliation extraction or feature calculation
     * @throws ParseException if the arguments cannot be parsed or the input type is unknown
     * @throws JDOMException propagated from reading the XML input
     * @throws NumberFormatException if a threshold option is not an integer
     * @throws RuntimeException if a resource cannot be closed
     */
    public static void main(String[] strArr) throws AnalysisException, ParseException, JDOMException {
        CommandLine commandLine = new GnuParser().parse(buildOptions(), strArr);

        String inputFileName = valueOrDefault(commandLine.getOptionValue(OPTION_INPUT), DEFAULT_INPUT);
        String outputFileName = valueOrDefault(commandLine.getOptionValue(OPTION_OUTPUT), DEFAULT_OUTPUT);
        String wordsFileName = valueOrDefault(commandLine.getOptionValue(OPTION_COMMON_WORDS), DEFAULT_WORDS);
        int neighborThreshold = intOrDefault(commandLine.getOptionValue(OPTION_NEIGHBOR), DEFAULT_NEIGHBOR_THRESHOLD);
        int rareThreshold = intOrDefault(commandLine.getOptionValue(OPTION_RARE), DEFAULT_RARE_THRESHOLD);
        String inputType = valueOrDefault(commandLine.getOptionValue(OPTION_INPUT_TYPE), DEFAULT_INPUT_TYPE);
        boolean addMockText = commandLine.hasOption(OPTION_ADD_MOCK_TEXT);
        boolean loadWords = commandLine.hasOption(OPTION_LOAD_WORDS);

        // Reject an unsupported input type before touching any file.
        boolean txtInput = INPUT_TYPE_TXT.equals(inputType);
        if (!txtInput && !INPUT_TYPE_XML.equals(inputType)) {
            throw new ParseException("Unknown input type: " + inputType);
        }

        File inputFile = new File(inputFileName);
        NLMAffiliationExtractor xmlExtractor = new NLMAffiliationExtractor();

        BufferedReader txtInputReader = null;
        FileInputStream xmlInputStream = null;
        BufferedReader wordsReader = null;
        PrintWriter featuresWriter = null;
        PrintWriter wordsWriter = null;
        try {
            List<DocumentAffiliation> affiliations;
            if (txtInput) {
                txtInputReader = openReader(inputFile);
                affiliations = loadAffiliationsFromTxt(txtInputReader);
            } else {
                xmlInputStream = new FileInputStream(inputFile);
                affiliations = xmlExtractor.extractStrings(new InputSource(xmlInputStream));
            }

            List<String> commonWords;
            if (loadWords) {
                wordsReader = openReader(new File(wordsFileName));
                commonWords = loadCommonWords(wordsReader);
            } else {
                commonWords = getCommonWordsFromAffs(affiliations, rareThreshold);
            }

            // Outputs are created last so that a failure above leaves existing files intact.
            featuresWriter = new PrintWriter(outputFileName, ENCODING);
            if (!loadWords) {
                wordsWriter = new PrintWriter(wordsFileName, ENCODING);
            }

            AffiliationFeatureExtractor featureExtractor = new AffiliationFeatureExtractor(commonWords);
            for (DocumentAffiliation affiliation : affiliations) {
                featureExtractor.calculateFeatures(affiliation);
                writeAffiliation(affiliation, featuresWriter, neighborThreshold);
            }
            if (addMockText) {
                addMockAffiliation(featuresWriter);
            }
            if (wordsWriter != null) {
                writeCommonWords(commonWords, wordsWriter);
            }

            // PrintWriter never throws on write errors; surface them explicitly.
            if (featuresWriter.checkError()) {
                System.err.println("Errors occurred while writing " + outputFileName);
            }
            if (wordsWriter != null && wordsWriter.checkError()) {
                System.err.println("Errors occurred while writing " + wordsFileName);
            }
        } catch (IOException e) {
            // Covers FileNotFoundException too. Reported instead of rethrown to keep
            // the non-throwing behaviour callers of main() have relied on.
            System.err.println("Cannot export affiliation training data: " + e);
        } finally {
            closeAll(txtInputReader, xmlInputStream, wordsReader, featuresWriter, wordsWriter);
        }
    }
}
