package uk.ac.shef.dcs.jate.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.lucene.analysis.jate.ComplexShingleFilter;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/* loaded from: input_file:uk/ac/shef/dcs/jate/util/GENIACorpusParser.class */
/**
 * Static helpers that read a corpus XML file made of {@code article},
 * {@code sentence}, {@code cons} and {@code bibliomisc} elements and either
 * dump the articles to plain-text files, extract the set of annotated terms,
 * or count words.
 *
 * <p>Every method parses the whole XML document into memory as a DOM tree
 * before doing any work.
 */
public class GENIACorpusParser {
    /**
     * Substrings that cause a {@code cons} element to be skipped by
     * {@link #extractGoldstandardTerms(String, String)} when they occur in the
     * element's {@code lex} attribute. The set is mutable and populated by the
     * static initializer below.
     */
    public static Set<String> GENIA_GS_IGNORE = new HashSet<>();

    static {
        GENIA_GS_IGNORE.add("*");
        GENIA_GS_IGNORE.add("(OR");
        GENIA_GS_IGNORE.add("(NOT");
        GENIA_GS_IGNORE.add("(TO");
        GENIA_GS_IGNORE.add("(THAN");
        GENIA_GS_IGNORE.add("(VERSUS");
        GENIA_GS_IGNORE.add("(AND");
        GENIA_GS_IGNORE.add("(BUT");
        GENIA_GS_IGNORE.add("(AS");
        GENIA_GS_IGNORE.add("(AND/OR");
    }

    /**
     * Writes one text file per {@code article} element of the input XML.
     *
     * <p>Each file is named {@code <index><filler><id>.txt}, where {@code index}
     * is the zero-based position of the article, {@code filler} is
     * {@link ComplexShingleFilter#DEFAULT_FILLER_TOKEN} and {@code id} is the
     * text of the article's first {@code bibliomisc} element with every
     * non-alphanumeric character replaced by the filler (empty if the article
     * has no such text). The first {@code sentence} is written on its own line;
     * the remaining sentences follow on one line, each followed by
     * {@link ComplexShingleFilter#DEFAULT_TOKEN_SEPARATOR}.
     *
     * @param str  path of the corpus XML file to read
     * @param str2 directory in which the text files are created
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws IOException  if the input cannot be read or an output file cannot be written
     * @throws SAXException if the input is not well-formed XML
     */
    public static void parse(String str, String str2) throws ParserConfigurationException, IOException, SAXException {
        NodeList articles = loadRoot(str).getElementsByTagName("article");
        for (int i = 0; i < articles.getLength(); i++) {
            Element article = (Element) articles.item(i);
            String target = str2 + File.separator
                    + (i + ComplexShingleFilter.DEFAULT_FILLER_TOKEN + articleId(article)) + ".txt";
            // NOTE: PrintWriter(String) encodes with the platform default charset.
            try (PrintWriter out = new PrintWriter(target)) {
                NodeList sentences = article.getElementsByTagName("sentence");
                for (int j = 0; j < sentences.getLength(); j++) {
                    String text = sentences.item(j).getTextContent();
                    if (j == 0) {
                        out.println(text);
                    } else {
                        out.print(text + ComplexShingleFilter.DEFAULT_TOKEN_SEPARATOR);
                    }
                }
                out.println();
                // PrintWriter never throws on write; surface any swallowed failure.
                if (out.checkError()) {
                    throw new IOException("Failed writing article text to " + target);
                }
            }
        }
    }

    /**
     * Tells whether a {@code lex} value contains any entry of {@link #GENIA_GS_IGNORE}.
     *
     * @param str the value to test; must not be {@code null}
     * @return {@code true} if at least one ignore marker is a substring of {@code str}
     */
    public static boolean ignore(String str) {
        for (String marker : GENIA_GS_IGNORE) {
            if (str.contains(marker)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Collects the trimmed text of every {@code cons} element whose {@code lex}
     * attribute is not rejected by {@link #ignore(String)}, keeps the distinct
     * values longer than two characters, and writes them in natural
     * {@link String} order, one per line.
     *
     * @param str  path of the corpus XML file to read
     * @param str2 path of the output file (created or overwritten)
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws IOException  if the input cannot be read or the output cannot be written
     * @throws SAXException if the input is not well-formed XML
     */
    public static void extractGoldstandardTerms(String str, String str2) throws ParserConfigurationException, IOException, SAXException {
        NodeList consNodes = loadRoot(str).getElementsByTagName("cons");
        Set<String> distinctTerms = new HashSet<>();
        for (int i = 0; i < consNodes.getLength(); i++) {
            Element cons = (Element) consNodes.item(i);
            if (ignore(cons.getAttribute("lex"))) {
                continue;
            }
            String term = cons.getTextContent().trim();
            if (term.length() > 2) {
                distinctTerms.add(term);
            }
        }

        ArrayList<String> sortedTerms = new ArrayList<>(distinctTerms);
        Collections.sort(sortedTerms);

        // NOTE: PrintWriter(String) encodes with the platform default charset.
        try (PrintWriter out = new PrintWriter(str2)) {
            for (String term : sortedTerms) {
                out.println(term);
            }
            // PrintWriter never throws on write; surface any swallowed failure.
            if (out.checkError()) {
                throw new IOException("Failed writing terms to " + str2);
            }
        }
    }

    /**
     * Counts the alphanumeric tokens inside all {@code cons} elements.
     * Elements that contain no alphanumeric character contribute zero.
     *
     * @param str path of the corpus XML file to read
     * @return total number of tokens over all {@code cons} elements
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws IOException  if the input cannot be read
     * @throws SAXException if the input is not well-formed XML
     */
    public static int countWordsInTerms(String str) throws ParserConfigurationException, IOException, SAXException {
        return countWordsInElements(str, "cons");
    }

    /**
     * Counts the alphanumeric tokens inside all {@code sentence} elements.
     * Elements that contain no alphanumeric character contribute zero.
     *
     * @param str path of the corpus XML file to read
     * @return total number of tokens over all {@code sentence} elements
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws IOException  if the input cannot be read
     * @throws SAXException if the input is not well-formed XML
     */
    public static int countWords(String str) throws ParserConfigurationException, IOException, SAXException {
        return countWordsInElements(str, "sentence");
    }

    /**
     * Command-line entry point: runs
     * {@link #extractGoldstandardTerms(String, String)} on the two arguments.
     *
     * @param strArr {@code [0]} corpus XML path, {@code [1]} output file path
     * @throws IllegalArgumentException if fewer than two arguments are given
     */
    public static void main(String[] strArr) throws IOException, SAXException, ParserConfigurationException {
        if (strArr.length < 2) {
            throw new IllegalArgumentException("Usage: GENIACorpusParser <corpus.xml> <output-file>");
        }
        extractGoldstandardTerms(strArr[0], strArr[1]);
    }

    /** Parses the XML file at {@code path} and returns its document element; the stream is always closed. */
    private static Element loadRoot(String path) throws ParserConfigurationException, IOException, SAXException {
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        try (FileInputStream in = new FileInputStream(new File(path))) {
            Document document = builder.parse(in);
            return document.getDocumentElement();
        }
    }

    /** Returns the sanitized text of the article's first {@code bibliomisc} element, or "" if it has none. */
    private static String articleId(Element article) {
        Node biblio = article.getElementsByTagName("bibliomisc").item(0);
        Node firstChild = biblio == null ? null : biblio.getFirstChild();
        String raw = firstChild == null ? null : firstChild.getNodeValue();
        if (raw == null) {
            return "";
        }
        return raw.replaceAll("[^0-9a-zA-Z]", ComplexShingleFilter.DEFAULT_FILLER_TOKEN);
    }

    /** Sums {@link #countTokens(String)} over the text of every element named {@code tagName} in the file. */
    private static int countWordsInElements(String path, String tagName) throws ParserConfigurationException, IOException, SAXException {
        NodeList nodes = loadRoot(path).getElementsByTagName(tagName);
        int total = 0;
        for (int i = 0; i < nodes.getLength(); i++) {
            total += countTokens(nodes.item(i).getTextContent());
        }
        return total;
    }

    /** Counts whitespace-separated tokens after replacing every non-alphanumeric character with the token separator. */
    private static int countTokens(String text) {
        String normalized = text.replaceAll("[^a-zA-Z0-9]", ComplexShingleFilter.DEFAULT_TOKEN_SEPARATOR).trim();
        // "".split(...) yields a one-element array, which would count empty text as one word.
        if (normalized.isEmpty()) {
            return 0;
        }
        return normalized.split("\\s+").length;
    }
}
