package uk.ac.shef.dcs.jate.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.lucene.analysis.jate.ComplexShingleFilter;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/* loaded from: input_file:uk/ac/shef/dcs/jate/util/GENIACorpusParser.class */
/**
 * Helpers for working with a GENIA-style corpus XML file: splitting it into one
 * plain-text file per {@code article} element and counting words inside
 * {@code cons} and {@code sentence} elements.
 */
public class GENIACorpusParser {
    /**
     * Writes one text file per {@code article} element found in the corpus.
     *
     * <p>Each output file is named
     * {@code <index><filler><id>.txt}, where {@code index} is the zero-based position of the
     * article, {@code filler} is {@link ComplexShingleFilter#DEFAULT_FILLER_TOKEN}, and
     * {@code id} is the value of the first child node of the article's first
     * {@code bibliomisc} element with every non-alphanumeric character replaced by the filler.
     * If the article has no usable {@code bibliomisc} value, the id part is empty.
     * The file contains the text content of every {@code sentence} element of the article,
     * one per line.
     *
     * @param str  path of the corpus XML file
     * @param str2 directory the text files are written to; it is not created here
     * @throws ParserConfigurationException if no XML parser can be configured
     * @throws IOException                  if the corpus cannot be read or an output file
     *                                      cannot be created or written
     * @throws SAXException                 if the corpus is not well-formed XML
     */
    public static void parse(String str, String str2) throws ParserConfigurationException, IOException, SAXException {
        NodeList articles = loadRoot(str).getElementsByTagName("article");
        for (int articleIndex = 0; articleIndex < articles.getLength(); articleIndex++) {
            Element article = (Element) articles.item(articleIndex);
            String fileName = articleIndex + ComplexShingleFilter.DEFAULT_FILLER_TOKEN + sanitizedArticleId(article) + ".txt";
            String outputPath = str2 + File.separator + fileName;
            NodeList sentences = article.getElementsByTagName("sentence");
            // try-with-resources closes the writer even when writing a sentence fails.
            try (PrintWriter writer = new PrintWriter(outputPath)) {
                for (int sentenceIndex = 0; sentenceIndex < sentences.getLength(); sentenceIndex++) {
                    writer.println(sentences.item(sentenceIndex).getTextContent());
                }
                // PrintWriter never throws on write; checkError() flushes and reports failures.
                if (writer.checkError()) {
                    throw new IOException("Failed to write article text to " + outputPath);
                }
            }
        }
    }

    /**
     * Counts the words inside all {@code cons} elements of the corpus.
     *
     * <p>Every {@code cons} element in the document is visited, including nested ones, and
     * the text content of an element includes the text of its descendants, so words inside a
     * nested {@code cons} contribute once per enclosing {@code cons}.
     *
     * @param str path of the corpus XML file
     * @return the summed word count over all {@code cons} elements
     * @throws ParserConfigurationException if no XML parser can be configured
     * @throws IOException                  if the corpus cannot be read
     * @throws SAXException                 if the corpus is not well-formed XML
     */
    public static int countWordsInTerms(String str) throws ParserConfigurationException, IOException, SAXException {
        return countWordsInElements(str, "cons");
    }

    /**
     * Counts the words inside all {@code sentence} elements of the corpus.
     *
     * @param str path of the corpus XML file
     * @return the summed word count over all {@code sentence} elements
     * @throws ParserConfigurationException if no XML parser can be configured
     * @throws IOException                  if the corpus cannot be read
     * @throws SAXException                 if the corpus is not well-formed XML
     */
    public static int countWords(String str) throws ParserConfigurationException, IOException, SAXException {
        return countWordsInElements(str, "sentence");
    }

    /**
     * Command-line entry point: prints both word counts for the corpus and, when an output
     * directory is given, splits the corpus into per-article text files.
     *
     * @param strArr {@code strArr[0]} is the corpus XML path (required);
     *               {@code strArr[1]} is the output directory (optional)
     * @throws IllegalArgumentException if no corpus path is supplied
     */
    public static void main(String[] strArr) throws IOException, SAXException, ParserConfigurationException {
        if (strArr == null || strArr.length < 1) {
            throw new IllegalArgumentException("Usage: GENIACorpusParser <corpus.xml> [<output-dir>]");
        }
        System.out.println("words in GS terms:" + countWordsInTerms(strArr[0]));
        System.out.println("words in total:" + countWords(strArr[0]));
        if (strArr.length > 1) {
            parse(strArr[0], strArr[1]);
        }
    }

    /** Parses the XML file at {@code path} and returns its document element, closing the file stream afterwards. */
    private static Element loadRoot(String path) throws ParserConfigurationException, IOException, SAXException {
        try (FileInputStream in = new FileInputStream(new File(path))) {
            return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(in).getDocumentElement();
        }
    }

    /** Sums {@link #tokenCount(String)} over the text content of every element named {@code tagName} in the file. */
    private static int countWordsInElements(String path, String tagName) throws ParserConfigurationException, IOException, SAXException {
        NodeList nodes = loadRoot(path).getElementsByTagName(tagName);
        int total = 0;
        for (int nodeIndex = 0; nodeIndex < nodes.getLength(); nodeIndex++) {
            total += tokenCount(nodes.item(nodeIndex).getTextContent());
        }
        return total;
    }

    /**
     * Counts whitespace-separated tokens after replacing every non-alphanumeric character
     * with {@link ComplexShingleFilter#DEFAULT_TOKEN_SEPARATOR}
     * (NOTE(review): presumably a whitespace string, since the result is split on whitespace — confirm).
     */
    private static int tokenCount(String text) {
        String normalized = text.replaceAll("[^a-zA-Z0-9]", ComplexShingleFilter.DEFAULT_TOKEN_SEPARATOR).trim();
        // "".split("\\s+") yields a single empty string, which must not be counted as a word.
        if (normalized.isEmpty()) {
            return 0;
        }
        return normalized.split("\\s+").length;
    }

    /**
     * Returns the article's {@code bibliomisc} value with non-alphanumeric characters replaced
     * by the filler token, or an empty string when the element or its value is missing.
     */
    private static String sanitizedArticleId(Element article) {
        NodeList idNodes = article.getElementsByTagName("bibliomisc");
        String rawId = null;
        if (idNodes.getLength() > 0 && idNodes.item(0).getFirstChild() != null) {
            rawId = idNodes.item(0).getFirstChild().getNodeValue();
        }
        if (rawId == null) {
            return "";
        }
        return rawId.replaceAll("[^0-9a-zA-Z]", ComplexShingleFilter.DEFAULT_FILLER_TOKEN);
    }
}
