package uk.ac.shef.dcs.jate.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.FileVisitOption;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.lucene.analysis.jate.ComplexShingleFilter;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.model.JATEDocument;

/* loaded from: input_file:uk/ac/shef/dcs/jate/util/JATEUtil.class */
public class JATEUtil {
    private static final Logger LOG = LoggerFactory.getLogger(JATEUtil.class);

    /**
     * Checks whether {@code str} has the shape of a base-10 integer literal.
     *
     * <p>Delegates to {@link #isInteger(String, int)} with a radix of 10.
     *
     * @param str the text to inspect; may be {@code null}
     * @return {@code true} if {@code str} is an optional leading {@code '-'} followed by one or
     *     more decimal digits; {@code false} otherwise, including for {@code null} or empty input
     */
    public static boolean isInteger(String str) {
        return isInteger(str, 10);
    }

    /**
     * Checks whether {@code str} has the shape of an integer literal in the given radix.
     *
     * <p>This is a purely syntactic check: an optional leading {@code '-'} followed by at least
     * one character that {@link Character#digit(char, int)} accepts for the radix. It does not
     * verify that the value fits into an {@code int}, and a leading {@code '+'} is rejected.
     *
     * @param str the text to inspect; may be {@code null}
     * @param i the radix passed to {@link Character#digit(char, int)}
     * @return {@code true} if every character after the optional sign is a digit in radix
     *     {@code i}; {@code false} otherwise, including for {@code null}, empty or {@code "-"}
     */
    public static boolean isInteger(String str, int i) {
        if (str == null || str.isEmpty()) {
            return false;
        }
        // Only the very first character may be a minus sign, and it must be followed by digits.
        int firstDigit = str.charAt(0) == '-' ? 1 : 0;
        if (firstDigit == str.length()) {
            return false;
        }
        for (int pos = firstDigit; pos < str.length(); pos++) {
            if (Character.digit(str.charAt(pos), i) < 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Counts the directories and non-directory entries in the file tree rooted at {@code path}
     * and prints a one-line summary to standard output.
     *
     * <p>The returned map uses the key {@code 1} for entries that are directories (symbolic links
     * are not followed when testing) and the key {@code 2} for everything else. A key is absent
     * when no entry of that kind was encountered.
     *
     * @param path root of the tree to walk
     * @return entry counts keyed by {@code 1} (directories) and {@code 2} (other entries)
     * @throws IOException if the tree cannot be opened for walking
     */
    public static Map<Integer, Long> fileStatitics(Path path) throws IOException {
        Map<Integer, Long> counts;
        // The stream returned by Files.walk holds open directories until it is closed.
        try (Stream<Path> entries = Files.walk(path)) {
            counts = entries.parallel().collect(Collectors.groupingBy(
                    entry -> Files.isDirectory(entry, LinkOption.NOFOLLOW_LINKS) ? 1 : 2,
                    Collectors.counting()));
        }
        System.out.format("Files: %d, dirs: %d. ", counts.get(2), counts.get(1));
        return counts;
    }

    /**
     * Builds a {@link JATEDocument} from an XML input stream.
     *
     * <p>This is a public entry point that hands the stream straight to the private XML loader
     * of this class; the stream is not closed here.
     *
     * @param inputStream XML content to parse (presumably an ACL RD-TEC corpus file, judging by
     *     the method name; confirm against callers)
     * @return the document produced by the XML loader
     * @throws JATEException if the XML loader fails
     */
    public static JATEDocument loadACLRDTECDocument(InputStream inputStream) throws JATEException {
        final JATEDocument document = loadJATEDocFromXML(inputStream);
        return document;
    }

    /**
     * Loads the file at {@code path} into a {@link JATEDocument}.
     *
     * <p>The document is created from the file name, its path is set to the absolute path
     * string, and its content is whatever {@code parseToPlainText} extracts from the file.
     *
     * @param path location of the file to load
     * @return the populated document, or {@code null} when {@code path} has no file-name element
     * @throws JATEException if the file cannot be opened for reading
     */
    public static JATEDocument loadJATEDocument(Path path) throws JATEException {
        Path fileName = path.getFileName();
        if (fileName == null) {
            return null;
        }
        JATEDocument document = new JATEDocument(fileName.toString());
        document.setPath(path.toAbsolutePath().toString());
        // try-with-resources closes the stream on every exit path, replacing the duplicated
        // hand-written close blocks.
        try (FileInputStream input = new FileInputStream(path.toFile())) {
            document.setContent(parseToPlainText(input));
        } catch (FileNotFoundException e) {
            throw new JATEException(String.format("File is not found from [%s]", path.toString()));
        } catch (IOException e) {
            // Reached only when closing the stream fails; the content has already been set,
            // so the failure is reported and the document is still returned.
            e.printStackTrace();
        }
        return document;
    }

    /**
     * Builds a {@link JATEDocument} from a raw text file.
     *
     * <p>The document is constructed from the file's URI and its id is set to the file name. The
     * content is every line of the file appended back to back, with no separator inserted between
     * lines. If reading fails, the stack trace is printed and the content is left empty.
     *
     * @param file the text file to read
     * @return the document; never {@code null}
     */
    public static JATEDocument loadACLRDTECDocumentFromRaw(File file) {
        final JATEDocument document = new JATEDocument(file.toURI());
        document.setId(file.getName());

        final StringBuilder text = new StringBuilder();
        try {
            final List<String> lines = FileUtils.readLines(file);
            for (String line : lines) {
                text.append(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        document.setContent(text.toString());
        return document;
    }

    /**
     * Parses an XML stream with SAX and assembles a {@link JATEDocument} from it.
     *
     * <p>Element names are matched case-insensitively against the qualified name:
     * <ul>
     *   <li>{@code Paper}: its {@code id} attribute becomes the document id (a missing attribute
     *       is appended as the text {@code "null"});</li>
     *   <li>{@code title}: the first character chunk reported after the element opens is kept as
     *       a title line, unless a {@code Reference} element has opened since the previous title
     *       chunk was handled;</li>
     *   <li>{@code Paragraph}: all character data reported while the element is open is kept.</li>
     * </ul>
     *
     * <p>The content is the title lines, a newline and the paragraph text. It is NFD-normalised,
     * XML-unescaped, passed through {@link #cleanText(String)}, HTML-unescaped twice and trimmed.
     *
     * @param inputStream XML content to parse; not closed by this method
     * @return the assembled document
     * @throws JATEException if the SAX parser cannot be created, or reading or parsing the
     *     stream fails
     */
    private static JATEDocument loadJATEDocFromXML(InputStream inputStream) throws JATEException {
        final StringBuilder paragraphs = new StringBuilder();
        final StringBuilder paperId = new StringBuilder();
        final StringBuilder titles = new StringBuilder();

        SAXParser parser;
        try {
            parser = SAXParserFactory.newInstance().newSAXParser();
        } catch (ParserConfigurationException | SAXException e) {
            throw new JATEException("Failed to initialise SAXParser!" + e.toString());
        }

        try {
            parser.parse(inputStream, new DefaultHandler() {
                // Set when a title element opens; consumed by the next characters() call.
                private boolean titlePending = false;
                // True between the start and end of a Paragraph element.
                private boolean inParagraph = false;
                // Set when a Reference element opens; cleared once the next title chunk is handled.
                private boolean referenceSeen = false;

                @Override
                public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
                    if (qName.equalsIgnoreCase("Paper")) {
                        paperId.append(attributes.getValue("id"));
                    } else if (qName.equalsIgnoreCase("title")) {
                        titlePending = true;
                    } else if (qName.equalsIgnoreCase("Paragraph")) {
                        inParagraph = true;
                    } else if (qName.equalsIgnoreCase("Reference")) {
                        referenceSeen = true;
                    }
                }

                @Override
                public void endElement(String uri, String localName, String qName) throws SAXException {
                    if (qName.equalsIgnoreCase("Paragraph")) {
                        inParagraph = false;
                    }
                }

                @Override
                public void characters(char[] ch, int start, int length) throws SAXException {
                    if (titlePending) {
                        titlePending = false;
                        // Titles that follow a Reference element are skipped.
                        if (!referenceSeen) {
                            titles.append(ch, start, length).append("\n");
                        }
                        referenceSeen = false;
                    }
                    if (inParagraph) {
                        paragraphs.append(ch, start, length);
                    }
                }
            });
        } catch (IOException e) {
            throw new JATEException("I/O Exception when parsing input file!" + e.toString());
        } catch (SAXException e) {
            // A SAXException here comes from parsing the document, not from parser set-up.
            throw new JATEException("Failed to parse input XML!" + e.toString());
        }

        String raw = titles + "\n" + paragraphs;
        String normalised = Normalizer.normalize(raw, Normalizer.Form.NFD);
        String cleaned = cleanText(StringEscapeUtils.unescapeXml(normalised));
        // HTML entities are unescaped twice, exactly as the original pipeline did.
        String content = StringEscapeUtils.unescapeHtml(StringEscapeUtils.unescapeHtml(cleaned));

        JATEDocument document = new JATEDocument(paperId.toString());
        document.setContent(content.trim());
        return document;
    }

    /**
     * Repairs words whose letters have been separated by whitespace.
     *
     * <p>Every fragment reported by {@link #extractBrokenWords(String)} for the original text is
     * passed, in order, to {@link #fixBrokenWords(String, String)}, each call operating on the
     * result of the previous one.
     *
     * @param str the text to clean
     * @return the text after all detected fragments have been repaired
     */
    public static String cleanText(String str) {
        String cleaned = str;
        for (String brokenWord : extractBrokenWords(str)) {
            cleaned = fixBrokenWords(cleaned, brokenWord);
        }
        return cleaned;
    }

    /** A capital letter followed by 3 to 10 lower-case letters, each letter trailed by whitespace. */
    private static final Pattern SPACED_CAPITALISED_WORD = Pattern.compile("([A-Z]\\s([a-z]\\s){3,10})");

    /** A capital letter followed by 3 to 10 further capitals, each letter trailed by whitespace. */
    private static final Pattern SPACED_UPPERCASE_WORD = Pattern.compile("([A-Z]\\s([A-Z]\\s){3,10})");

    /**
     * Finds runs of single letters separated by whitespace, e.g. {@code "I n t r o "}.
     *
     * <p>Two shapes are recognised: a capital followed by lower-case letters, and a run of
     * capitals. All matches of the first shape are listed before all matches of the second, each
     * group in order of appearance. A match includes the whitespace after its last letter.
     *
     * @param str the text to scan
     * @return the matched fragments; empty when nothing matches
     */
    public static List<String> extractBrokenWords(String str) {
        List<String> fragments = new ArrayList<>();
        Matcher capitalised = SPACED_CAPITALISED_WORD.matcher(str);
        while (capitalised.find()) {
            fragments.add(capitalised.group());
        }
        Matcher uppercase = SPACED_UPPERCASE_WORD.matcher(str);
        while (uppercase.find()) {
            fragments.add(uppercase.group());
        }
        return fragments;
    }

    /**
     * Rejoins one broken word everywhere it occurs in {@code str}.
     *
     * <p>The replacement is {@code str2} with every match of
     * {@code ComplexShingleFilter.DEFAULT_TOKEN_SEPARATOR} removed, followed by one such
     * separator. Both arguments to {@link String#replaceAll(String, String)} are treated as regular
     * expressions, so {@code str2} is interpreted as a pattern rather than as literal text.
     *
     * @param str the text to repair
     * @param str2 the broken fragment to look for
     * @return {@code str} with every match of {@code str2} replaced by its rejoined form
     */
    public static String fixBrokenWords(String str, String str2) {
        final String separator = ComplexShingleFilter.DEFAULT_TOKEN_SEPARATOR;
        final String rejoined = str2.replaceAll(separator, "") + separator;
        return str.replaceAll(str2, rejoined);
    }

    /**
     * Lists every non-directory entry in the file tree rooted at {@code path}.
     *
     * <p>Symbolic links are not followed when deciding whether an entry is a directory. Entries
     * are returned in the order the walk encounters them.
     *
     * @param path root of the tree to walk
     * @return a mutable list of the non-directory paths found; empty if there are none
     * @throws JATEException if the tree cannot be opened for walking
     */
    public static List<Path> loadFiles(Path path) throws JATEException {
        // The stream from Files.walk keeps directories open until closed. Collecting through a
        // collector also avoids adding to a plain ArrayList from several threads at once.
        try (Stream<Path> entries = Files.walk(path)) {
            return entries
                    .filter(entry -> !Files.isDirectory(entry, LinkOption.NOFOLLOW_LINKS))
                    .collect(Collectors.toCollection(ArrayList::new));
        } catch (IOException e) {
            throw new JATEException(String.format("Failed to access corpus path [%s]", path.toUri()));
        }
    }

    /**
     * Extracts the body text of a document using Tika's auto-detecting parser.
     *
     * <p>Parsing failures are logged at debug level and reported to the caller as an empty
     * string; no exception escapes for I/O, SAX or Tika errors.
     *
     * @param inputStream the document to parse; not closed by this method
     * @return the extracted text, or {@code ""} if parsing failed
     */
    public static String parseToPlainText(InputStream inputStream) {
        // -1 disables BodyContentHandler's default 100,000-character write limit. With the
        // default, a longer document raised a SAXException and came back as "".
        BodyContentHandler handler = new BodyContentHandler(-1);
        try {
            new AutoDetectParser().parse(inputStream, handler, new Metadata());
            return handler.toString();
        } catch (IOException | SAXException | TikaException e) {
            LOG.debug("Parsing Exception while extracting content from current file. {}", e.toString());
            return "";
        }
    }

    /**
     * Adds one document to an embedded Solr server, optionally committing straight away.
     *
     * <p>The document carries the fields {@code id}, {@code title_s} and {@code text}, plus two
     * further fields whose names come from {@code jATEProperties} and which both receive the
     * same value as {@code text}.
     *
     * @param embeddedSolrServer the server to add the document to
     * @param str value of the {@code id} field
     * @param str2 value of the {@code title_s} field
     * @param str3 value of the {@code text} field and of the two property-named fields
     * @param jATEProperties supplies the names of the n-gram-info and candidate-term fields
     * @param z whether to commit after adding
     * @throws IOException declared by the Solr client calls
     * @throws SolrServerException declared by the Solr client calls
     * @throws JATEException declared on this method; presumably raised by the property lookups
     *     (TODO confirm)
     */
    public static void addNewDoc(EmbeddedSolrServer embeddedSolrServer, String str, String str2, String str3, JATEProperties jATEProperties, boolean z) throws IOException, SolrServerException, JATEException {
        final SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", str);
        doc.addField("title_s", str2);
        doc.addField("text", str3);

        final String ngramInfoField = jATEProperties.getSolrFieldNameJATENGramInfo();
        doc.addField(ngramInfoField, str3);
        final String candidateTermsField = jATEProperties.getSolrFieldNameJATECTerms();
        doc.addField(candidateTermsField, str3);

        embeddedSolrServer.add(doc);
        if (!z) {
            return;
        }
        embeddedSolrServer.commit();
    }
}
