package pl.edu.icm.cermine.bibref.parsing.tools;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.jdom.JDOMException;
import org.xml.sax.InputSource;
import pl.edu.icm.cermine.bibref.parsing.model.Citation;
import pl.edu.icm.cermine.bibref.parsing.model.CitationToken;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.2.jar:pl/edu/icm/cermine/bibref/parsing/tools/MalletTrainingFileGenerator.class */
/**
 * Command-line tool that reads citations from the XML files of a fixed input directory and
 * writes two text files:
 * <ul>
 *   <li>{@link #WORDS_OUT_FILE} - the alphabetic tokens (lower-cased) that occur more than
 *       {@link #MIN_COUNT} times across the collected citations, one per line;</li>
 *   <li>{@link #OUT_FILE} - for every collected citation, the lines returned by
 *       {@code CitationUtils.citationToMalletInputFormat(citation, frequentWords)}, followed by
 *       an empty line.</li>
 * </ul>
 *
 * <p>All paths and thresholds are compile-time constants; the program takes no arguments.
 */
public final class MalletTrainingFileGenerator {

    /** Directory scanned (non-recursively) for {@code *.xml} input files. */
    private static final String NLM_DIR = "/home/domin/cermine-tests/out/";

    /** Destination of the per-citation training lines. */
    private static final String OUT_FILE = "/tmp/crf-train.txt";

    /** Destination of the frequent-word list. */
    private static final String WORDS_OUT_FILE = "/tmp/crf-train-words.txt";

    /** A word is kept only when its occurrence count is strictly greater than this value. */
    private static final int MIN_COUNT = 5;

    /** Upper bound on the number of citations taken from a single input file. */
    private static final int MAX_CITATIONS_PER_FILE = 500;

    /** Matches tokens made exclusively of ASCII letters; compiled once instead of per token. */
    private static final Pattern WORD_PATTERN = Pattern.compile("^[a-zA-Z]+$");

    /**
     * Runs the generator.
     *
     * @param strArr ignored
     * @throws JDOMException if an input file cannot be parsed by {@code NlmCitationExtractor}
     * @throws IOException   if the input directory cannot be listed, or reading an input file or
     *                       writing an output file fails
     */
    public static void main(String[] strArr) throws JDOMException, IOException {
        // listFiles() returns null (not an empty array) for a missing/unreadable directory.
        File[] inputFiles = new File(NLM_DIR).listFiles();
        if (inputFiles == null) {
            throw new IOException("Cannot list input directory: " + NLM_DIR);
        }

        List<Citation> citations = new ArrayList<>();
        Map<String, Integer> wordCounts = new HashMap<>();

        for (File inputFile : inputFiles) {
            if (inputFile.isDirectory() || !inputFile.getName().endsWith(".xml")) {
                continue;
            }
            List<Citation> extracted;
            try (FileInputStream input = new FileInputStream(inputFile)) {
                extracted = NlmCitationExtractor.extractCitations(new InputSource(input));
            }
            // Clamp the bound: subList(0, 500) throws when a file holds fewer citations.
            int limit = Math.min(extracted.size(), MAX_CITATIONS_PER_FILE);
            for (Citation citation : extracted.subList(0, limit)) {
                citations.add(citation);
                countWords(citation, wordCounts);
            }
        }

        Set<String> frequentWords = selectFrequentWords(wordCounts);
        writeWords(frequentWords);
        System.out.println(citations.size());
        writeTrainingFile(citations, frequentWords);
    }

    /** Adds one occurrence to {@code wordCounts} for every all-letter token of {@code citation}. */
    private static void countWords(Citation citation, Map<String, Integer> wordCounts) {
        for (CitationToken token : citation.getTokens()) {
            String text = token.getText();
            if (WORD_PATTERN.matcher(text).matches()) {
                // Locale.ROOT keeps the mapping stable regardless of the JVM default locale
                // (e.g. Turkish would turn "I" into a dotless i).
                String word = text.toLowerCase(Locale.ROOT);
                Integer previous = wordCounts.get(word);
                wordCounts.put(word, previous == null ? 1 : previous + 1);
            }
        }
    }

    /** Returns the words whose count is strictly greater than {@link #MIN_COUNT}. */
    private static Set<String> selectFrequentWords(Map<String, Integer> wordCounts) {
        List<Map.Entry<String, Integer>> entries = new ArrayList<>(wordCounts.entrySet());
        // Entries are visited by descending count, ties broken by ascending word.
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
                int byCount = right.getValue().compareTo(left.getValue());
                return byCount != 0 ? byCount : left.getKey().compareTo(right.getKey());
            }
        });

        Set<String> frequentWords = new HashSet<>();
        for (Map.Entry<String, Integer> entry : entries) {
            if (entry.getValue() > MIN_COUNT) {
                frequentWords.add(entry.getKey());
            }
        }
        return frequentWords;
    }

    /** Writes every word of {@code words} on its own line to {@link #WORDS_OUT_FILE}. */
    private static void writeWords(Set<String> words) throws IOException {
        // try-with-resources closes the writer even when a write fails.
        try (FileWriter writer = new FileWriter(WORDS_OUT_FILE)) {
            for (String word : words) {
                writer.write(word);
                writer.write("\n");
            }
        }
    }

    /**
     * Writes to {@link #OUT_FILE}, for each citation, the lines produced by
     * {@code CitationUtils.citationToMalletInputFormat} followed by one empty line.
     */
    private static void writeTrainingFile(List<Citation> citations, Set<String> frequentWords)
            throws IOException {
        try (FileWriter writer = new FileWriter(OUT_FILE)) {
            for (Citation citation : citations) {
                for (String line : CitationUtils.citationToMalletInputFormat(citation, frequentWords)) {
                    writer.write(line);
                    writer.write("\n");
                }
                writer.write("\n");
            }
        }
    }

    /** Not instantiable: this class only hosts {@link #main(String[])}. */
    private MalletTrainingFileGenerator() {
    }
}
