package uk.ac.shef.dcs.jate.io;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.SAXException;
import uk.ac.shef.dcs.jate.JATEException;

/* loaded from: input_file:uk/ac/shef/dcs/jate/io/ContentExtractor.class */
/**
 * Extracts the textual content of a file using Apache Tika.
 *
 * <p>Files whose probed content type is {@code text/plain} are parsed directly with a
 * {@link TXTParser}; every other file is handed to the {@link Tika} facade.
 */
public class ContentExtractor {
    /** Fallback sample file used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_SAMPLE_PATH =
            "C:\\oak-project\\TermRecogniser\\evaluate\\lotus_notes\\ Workshop-QG9JVW.txt";

    private final Parser txtParser;
    private final Tika tika;
    /** Write limit passed to the {@link WriteOutContentHandler} used for plain-text files. */
    private final int maxStringLength = 100000;

    /** Creates an extractor backed by a fresh {@link TXTParser} and {@link Tika} instance. */
    public ContentExtractor() {
        this.txtParser = new TXTParser();
        this.tika = new Tika();
    }

    /**
     * Extracts the content of the file denoted by {@code uri}.
     *
     * @param uri a URI that {@link File#File(URI)} accepts
     * @return the extracted text
     * @throws JATEException if {@code uri} is {@code null}, cannot be converted to a
     *         {@link File}, the file does not exist, or extraction fails
     */
    public String extractContent(URI uri) throws JATEException {
        if (uri == null) {
            throw new JATEException("File is not found!");
        }
        final File file;
        try {
            file = new File(uri);
        } catch (IllegalArgumentException e) {
            // File(URI) rejects URIs that do not satisfy its preconditions; report that
            // through the declared exception type instead of an unchecked one.
            throw withCause(new JATEException("URI does not denote a local file: " + uri), e);
        }
        return extractContent(file);
    }

    /**
     * Extracts the content of {@code file}.
     *
     * @param file the file to read
     * @return the extracted text
     * @throws JATEException if {@code file} is {@code null} or does not exist, or if an
     *         I/O or Tika error occurs; the underlying exception is attached as the cause
     */
    public String extractContent(File file) throws JATEException {
        if (file == null || !file.exists()) {
            throw new JATEException("File is not found!");
        }
        try {
            String plainTextType = MediaType.TEXT_PLAIN.getBaseType().toString();
            // The probed type may be null; calling equals() on the constant side keeps
            // the comparison null-safe.
            String probedType = Files.probeContentType(file.toPath());
            if (plainTextType.equals(probedType)) {
                return parseTXTToString(file);
            }
            return this.tika.parseToString(file);
        } catch (IOException e) {
            throw withCause(new JATEException("I/O exception when detecting file type."), e);
        } catch (TikaException e) {
            throw withCause(
                    new JATEException("Tika Content extraction exception: " + e.toString()), e);
        }
    }

    /** Opens {@code file} as a Tika stream and parses it as plain text. */
    private String parseTXTToString(File file) throws IOException, TikaException {
        Metadata metadata = new Metadata();
        return parseTXTToString(TikaInputStream.get(file, metadata), metadata);
    }

    /**
     * Parses {@code inputStream} with the plain-text parser and returns the collected text.
     * The stream is always closed before this method returns or throws.
     *
     * @throws TikaException if parsing fails for a reason other than the handler's write
     *         limit being reached
     */
    private String parseTXTToString(InputStream inputStream, Metadata metadata)
            throws IOException, TikaException {
        // try-with-resources closes the stream exactly once; a failure in close() on an
        // error path is recorded as suppressed instead of masking the original exception.
        try (InputStream in = inputStream) {
            WriteOutContentHandler handler = new WriteOutContentHandler(this.maxStringLength);
            try {
                ParseContext context = new ParseContext();
                context.set(Parser.class, this.txtParser);
                this.txtParser.parse(in, new BodyContentHandler(handler), metadata, context);
            } catch (SAXException e) {
                // Reaching the write limit is expected for long documents: keep what was
                // collected so far. Anything else is a real failure.
                if (!handler.isWriteLimitReached(e)) {
                    throw new TikaException("Unexpected SAX processing failure", e);
                }
            }
            return handler.toString();
        }
    }

    /**
     * Attaches {@code cause} to {@code exception} so the original stack trace is kept.
     * Only the message constructor of {@link JATEException} is used in this class, so the
     * cause is set via {@link Throwable#initCause(Throwable)}.
     */
    private static JATEException withCause(JATEException exception, Throwable cause) {
        try {
            exception.initCause(cause);
        } catch (IllegalStateException ignored) {
            // The exception already has a cause set; leave it untouched.
        }
        return exception;
    }

    /**
     * Prints the extracted content of a file to standard output.
     *
     * @param args optional; {@code args[0]} is the path of the file to extract. When
     *        absent, a built-in sample path is used.
     */
    public static void main(String[] args) throws JATEException, URISyntaxException {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_SAMPLE_PATH;
        System.out.println(new ContentExtractor().extractContent(new File(path).toURI()));
    }
}
