package pl.edu.icm.cermine.bibref.parsing.tools;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.jdom.JDOMException;
import org.xml.sax.InputSource;
import pl.edu.icm.cermine.bibref.parsing.model.Citation;
import pl.edu.icm.cermine.bibref.parsing.model.CitationToken;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.2-SNAPSHOT.jar:pl/edu/icm/cermine/bibref/parsing/tools/MalletTrainingFileGenerator.class */
/**
 * Command-line tool that reads citations from the {@code .nxml} files found directly in
 * {@link #nlmDir} and writes each of them to {@link #outFile}, using the lines produced by
 * {@code CitationUtils.citationToMalletInputFormat}. Citations are separated by an empty line.
 */
public final class MalletTrainingFileGenerator {
    /** Directory whose direct children are scanned for {@code .nxml} files. */
    private static String nlmDir = "/tmp/train/";
    /** Path of the file the training data is written to. */
    private static String outFile = "/tmp/crf-train.txt";
    /** A word has to occur strictly more often than this to be treated as frequent. */
    private static int minCount = 10;

    /** Accepts tokens consisting solely of ASCII letters; compiled once rather than per token. */
    private static final Pattern WORD_PATTERN = Pattern.compile("^[a-zA-Z]+$");

    /**
     * Generates the training file.
     *
     * @param strArr command-line arguments; not read
     * @throws JDOMException declared for failures while extracting citations from an input file
     * @throws IOException if the input directory cannot be listed, an input file cannot be read,
     *         or the output file cannot be written
     */
    public static void main(String[] strArr) throws JDOMException, IOException {
        FileWriter fileWriter = null;
        try {
            // Opened before any parsing so that an unwritable output path fails early.
            fileWriter = new FileWriter(outFile);

            Set<Citation> citations = new HashSet<Citation>();
            Map<String, Integer> wordCounts = new HashMap<String, Integer>();
            readCitations(new File(nlmDir), citations, wordCounts);

            // NOTE(review): the frequent-word set is computed but nothing in this class consumes
            // it - confirm whether it should be handed to CitationUtils or can be dropped.
            Set<String> frequentWords = selectFrequentWords(wordCounts);

            // The writer must stay open for the whole loop; it is closed once, in the finally.
            for (Citation citation : citations) {
                for (String line : CitationUtils.citationToMalletInputFormat(citation)) {
                    fileWriter.write(line);
                    fileWriter.write("\n");
                }
                fileWriter.write("\n");
            }
            fileWriter.flush();
        } finally {
            if (fileWriter != null) {
                fileWriter.close();
            }
        }
    }

    /**
     * Extracts the citations of every {@code .nxml} file that is a direct child of {@code dir},
     * adding them to {@code citations} and updating {@code wordCounts} with their tokens.
     *
     * @throws IOException if {@code dir} cannot be listed or an input file cannot be read
     */
    private static void readCitations(File dir, Set<Citation> citations, Map<String, Integer> wordCounts)
            throws JDOMException, IOException {
        File[] entries = dir.listFiles();
        if (entries == null) {
            // listFiles() yields null (not an empty array) for a missing or unreadable directory.
            throw new IOException("Cannot list input directory: " + dir);
        }
        for (File entry : entries) {
            if (entry.isDirectory() || !entry.getName().endsWith(".nxml")) {
                continue;
            }
            List<Citation> extracted;
            FileInputStream input = new FileInputStream(entry);
            try {
                extracted = NlmCitationExtractor.extractCitations(new InputSource(input));
            } finally {
                input.close();
            }
            for (Citation citation : extracted) {
                citations.add(citation);
                countWords(citation, wordCounts);
            }
        }
    }

    /** Increments, per lower-cased word, the count of each letters-only token of {@code citation}. */
    private static void countWords(Citation citation, Map<String, Integer> wordCounts) {
        for (CitationToken token : citation.getTokens()) {
            String text = token.getText();
            if (!WORD_PATTERN.matcher(text).matches()) {
                continue;
            }
            // Fixed locale: the default one may fold ASCII letters differently (e.g. Turkish 'I').
            String word = text.toLowerCase(Locale.ENGLISH);
            Integer seen = wordCounts.get(word);
            int updated = (seen == null) ? 1 : seen.intValue() + 1;
            wordCounts.put(word, Integer.valueOf(updated));
        }
    }

    /** Returns the words whose count is strictly greater than {@link #minCount}. */
    private static Set<String> selectFrequentWords(Map<String, Integer> wordCounts) {
        Set<String> frequent = new HashSet<String>();
        for (Map.Entry<String, Integer> entry : wordCounts.entrySet()) {
            if (entry.getValue().intValue() > minCount) {
                frequent.add(entry.getKey());
            }
        }
        return frequent;
    }

    /** Not instantiable: this class only hosts {@link #main(String[])}. */
    private MalletTrainingFileGenerator() {
    }
}
