package uk.ac.cam.ch.wwmm.acpgeo;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import nu.xom.Builder;
import nu.xom.Document;
import nu.xom.Nodes;
import nu.xom.ParsingException;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import uk.ac.cam.ch.wwmm.chemicaltagger.Utils;

/* loaded from: input_file:uk/ac/cam/ch/wwmm/acpgeo/AbstractReader.class */
/**
 * Reads an article XML document and exposes its abstract, title and
 * bibliographic nodes.
 *
 * <p>When built from a stream, the abstract text is additionally
 * post-processed: acronym phrases and author/year citations are collected into
 * separate strings, no-break spaces are replaced by plain spaces, sentence-ending
 * full stops are detached from the preceding word, and acronym phrases and
 * citations are replaced in the text by the placeholders {@code _phraseACR_}
 * and {@code _citatION_}.
 *
 * <p>The text helpers ({@link #removeNBS}, {@link #getCitations(CharSequence)},
 * {@link #getAcronymPhrases(CharSequence)}, {@link #setAcronymPhrases},
 * {@link #forceStop}) do not touch instance state and can be used on an
 * instance created with the no-argument constructor.
 */
public class AbstractReader {
    private static final Logger LOG = Logger.getLogger(AbstractReader.class.getName());

    /** Value returned by the extraction methods when nothing matched. */
    private static final String NO_MATCH = "EMPTY";

    /**
     * A run of capitalised words (optionally joined by "and", "in", "for")
     * followed by a bracketed acronym, e.g. "Global Positioning System (GPS)".
     */
    private static final String ACRONYM_REGEX =
            "((\\p{Lu}\\p{M}*(\\p{Ll}\\p{M}*)+[-/]?(\\p{Lu}\\p{M}*)?(\\p{Ll}\\p{M}*)*)\\s+"
            + "((\\p{Lu}\\p{M}*(\\p{Ll}\\p{M}*)+[-/]?(\\p{Lu}\\p{M}*)?(\\p{Ll}\\p{M}*)*)\\s+|((and|in|for)\\s+)){1,8}"
            + "([0-9]{1,4}\\s+)?[(][ ]?[A-Z]+[-A-Za-z/]+[-A-Za-z0-9/]*[ ]?[)])";

    /** Left-hand context that must precede a citation (capturing groups 1-4). */
    private static final String CITATION_CONTEXT_REGEX =
            "(([.][a-z][.]\\s+)|([^.]\\s+)|([^ A-Za-z]))";

    /**
     * Author name(s), optional "et al."/"and" part, then a year list either in
     * brackets or after a comma, e.g. "Smith et al. (2005)".
     */
    private static final String CITATION_BODY_REGEX =
            "((\\p{Lu}\\p{M}*(\\p{Ll}\\p{M}*)\\s+)?"
            + "(\\p{Lu}\\p{M}*(\\p{Ll}\\p{M}*)+[-]?(\\p{Lu}\\p{M}*)?(\\p{Ll}\\p{M}*)*\\s+)"
            + "(((et\\s+al[.])|(and))\\s*((\\p{Lu}\\p{M}*(\\p{Ll}\\p{M}*)\\s+)?\\p{Lu}\\p{M}*(\\p{Ll}\\p{M}*)+[-]?(\\p{Lu}\\p{M}*)?(\\p{Ll}\\p{M}*)*\\s*){0,1})?"
            + "(([(]\\d{4,4}[a-z]?(([,;]|(\\s*and))\\s*(\\d{4,4})?[a-z]?)*[)])"
            + "|(,\\s*\\d{4,4}[a-z]?(([,;]|\\s*(and))\\s*(\\d{4,4})[a-z]?)*)))";

    /** Capturing group of {@link #CITATION_PATTERN} that holds the citation without its left context. */
    private static final int CITATION_BODY_GROUP = 5;

    private static final Pattern ACRONYM_PATTERN =
            Pattern.compile(ACRONYM_REGEX, Pattern.CANON_EQ);

    private static final Pattern CITATION_PATTERN =
            Pattern.compile(CITATION_CONTEXT_REGEX + CITATION_BODY_REGEX, Pattern.CANON_EQ);

    /** Acronym phrase, or a citation together with its left context. */
    private static final Pattern ACRONYM_OR_CONTEXT_CITATION_PATTERN =
            Pattern.compile(ACRONYM_REGEX + "|" + CITATION_CONTEXT_REGEX + CITATION_BODY_REGEX, Pattern.CANON_EQ);

    /** Acronym phrase, or a bare citation (no left context, so the context is never replaced). */
    private static final Pattern ACRONYM_OR_CITATION_PATTERN =
            Pattern.compile(ACRONYM_REGEX + "|" + CITATION_BODY_REGEX, Pattern.CANON_EQ);

    /** Whitespace, a word, then a full stop followed by whitespace and an upper-case letter. */
    private static final Pattern SENTENCE_END_PATTERN =
            Pattern.compile("(((\\s+)(\\p{L}+))([.]\\s+\\p{Lu}))", Pattern.CANON_EQ);

    /** Abbreviations whose trailing full stop must not be treated as a sentence end. */
    private static final Set<String> ABBREVIATIONS = new HashSet<String>(Arrays.asList(
            "Prof.", "e.g.", "Fig.", "fig.", "i.e.", "vol.", "aq.", "e.g.:", "eq.", "St.", "Mt.",
            "equiv.", "conc.", "anh.", "sat.", "lit.", "dil.", "sol.", "liq.", "Cal.", "cal."));

    private Document xmlDoc;
    private String abstractString;
    private Nodes references;
    private String titleString;
    private String acronymString;
    private String citationString;
    private Nodes authors;
    private Nodes affiliations;
    private Nodes titleNode;
    private Nodes year;
    private Nodes articleURL;

    /** @return the parsed document, or {@code null} if this reader was not built from a stream */
    public Document getXmlDoc() {
        return this.xmlDoc;
    }

    /** @return the post-processed abstract text (placeholders substituted), or {@code null} if nothing was loaded */
    public String getAbstractString() {
        return this.abstractString;
    }

    /** @return the acronym phrases found in the abstract, or {@code "EMPTY"} if there were none */
    public String getAcronymPhrases() {
        return this.acronymString;
    }

    /** @return the citations found in the abstract, or {@code "EMPTY"} if there were none */
    public String getCitations() {
        return this.citationString;
    }

    /** @return the nodes matched by {@code //reference} */
    public Nodes getReferences() {
        return this.references;
    }

    /** @return the cleaned text of the first {@code //article_title} node */
    public String getTitleString() {
        return this.titleString;
    }

    /** @return the nodes matched by {@code //authors} */
    public Nodes getAuthors() {
        return this.authors;
    }

    /** @return the nodes matched by {@code //article_url} */
    public Nodes getArticleURL() {
        return this.articleURL;
    }

    /** @return the nodes matched by {@code //publication_year} */
    public Nodes getYear() {
        return this.year;
    }

    /** @return the nodes matched by {@code //article_title} */
    public Nodes getTitleNode() {
        return this.titleNode;
    }

    /** @return the nodes matched by {@code //affiliations} */
    public Nodes getAffiliations() {
        return this.affiliations;
    }

    /**
     * Parses the XML document from the given stream and extracts all fields.
     *
     * @param inputStream stream containing the article XML
     * @throws RuntimeException if the parser cannot be created or configured,
     *         or if the document cannot be read or parsed; the original
     *         exception is attached as the cause
     */
    public AbstractReader(InputStream inputStream) {
        try {
            XMLReader xmlReader = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
            // Do not fetch the external DTD referenced by the document.
            // NOTE(review): external entities are still resolved; tighten the parser
            // features if this ever reads untrusted XML.
            xmlReader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            this.xmlDoc = new Builder(xmlReader).build(inputStream);
        } catch (SAXException e) {
            throw new RuntimeException(e);
        } catch (ParsingException e) {
            // Fail here with the real cause instead of a later NullPointerException in loadDocument().
            throw new RuntimeException("Could not parse the abstract XML document", e);
        } catch (IOException e) {
            throw new RuntimeException("Could not read the abstract XML document", e);
        }
        loadDocument();
    }

    /** Creates a reader with no document; all getters return {@code null}. */
    public AbstractReader() {
    }

    /**
     * Populates every field from {@link #xmlDoc}.
     *
     * <p>Acronym phrases and citations are collected before the abstract text
     * is rewritten, because the rewrite replaces them with placeholders. The
     * first {@code //abstract} and {@code //article_title} nodes are required;
     * {@code get(0)} fails if either query returns no nodes.
     */
    private void loadDocument() {
        // Utils.cleanHTMLText is a project helper; presumably it strips markup — confirm there.
        String cleanedAbstract = Utils.cleanHTMLText(this.xmlDoc.query("//abstract").get(0).toXML());
        this.acronymString = getAcronymPhrases(cleanedAbstract);
        this.citationString = getCitations(cleanedAbstract);
        this.abstractString = setAcronymPhrases(forceStop(removeNBS(cleanedAbstract)));

        this.titleNode = this.xmlDoc.query("//article_title");
        this.titleString = Utils.cleanHTMLText(this.titleNode.get(0).getValue());
        this.references = this.xmlDoc.query("//reference");
        this.authors = this.xmlDoc.query("//authors");
        this.affiliations = this.xmlDoc.query("//affiliations");
        this.articleURL = this.xmlDoc.query("//article_url");
        this.year = this.xmlDoc.query("//publication_year");
    }

    /**
     * Replaces every no-break space (U+00A0) with an ordinary space.
     *
     * @param charSequence text to normalise
     * @return the text with no-break spaces replaced
     */
    public String removeNBS(CharSequence charSequence) {
        // The previous regex "U+00A0" matched the letters "U00A0", never the NBSP character itself.
        return charSequence.toString().replace('\u00A0', ' ');
    }

    /**
     * Collects all author/year citations found in the text.
     *
     * <p>No-break spaces are normalised first, since {@code \s} in the citation
     * pattern does not match U+00A0.
     *
     * @param charSequence text to search
     * @return each citation followed by {@code " . "}, the whole string prefixed
     *         by one space; {@code "EMPTY"} if no citation was found
     */
    public String getCitations(CharSequence charSequence) {
        Matcher matcher = CITATION_PATTERN.matcher(removeNBS(charSequence));
        if (!matcher.find()) {
            return NO_MATCH;
        }
        StringBuilder citations = new StringBuilder(" ");
        do {
            String citation = matcher.group(CITATION_BODY_GROUP);
            LOG.log(Level.FINE, "Citation match \"{0}\", citation \"{1}\"",
                    new Object[] {matcher.group(0), citation});
            citations.append(citation).append(" . ");
        } while (matcher.find());
        return citations.toString();
    }

    /**
     * Collects all acronym phrases (capitalised words followed by a bracketed
     * acronym) found in the text.
     *
     * @param charSequence text to search
     * @return each phrase followed by {@code " . "}, the whole string prefixed
     *         by one space; {@code "EMPTY"} if no phrase was found
     */
    public String getAcronymPhrases(CharSequence charSequence) {
        Matcher matcher = ACRONYM_PATTERN.matcher(charSequence);
        if (!matcher.find()) {
            return NO_MATCH;
        }
        StringBuilder phrases = new StringBuilder(" ");
        do {
            phrases.append(matcher.group(0)).append(" . ");
        } while (matcher.find());
        return phrases.toString();
    }

    /**
     * Replaces acronym phrases and citations in the text with the placeholders
     * returned by {@link #replaceText(Boolean)}.
     *
     * <p>Replacement only happens if the text contains at least one acronym
     * phrase or one citation preceded by its left context; otherwise the text
     * is returned unchanged.
     * NOTE(review): this gate is kept from the original nested loop; it means a
     * lone citation at the very start of the text is left untouched — confirm
     * that this is intended.
     *
     * @param charSequence text to rewrite
     * @return the text with placeholders substituted
     */
    public String setAcronymPhrases(CharSequence charSequence) {
        Matcher gate = ACRONYM_OR_CONTEXT_CITATION_PATTERN.matcher(charSequence);
        Matcher matcher = ACRONYM_OR_CITATION_PATTERN.matcher(charSequence);
        StringBuffer rewritten = new StringBuffer(charSequence.length());
        if (gate.find()) {
            while (matcher.find()) {
                String matched = matcher.group();
                // A match that is entirely an acronym phrase gets the acronym placeholder;
                // anything else matched here is a citation.
                boolean isAcronym = ACRONYM_PATTERN.matcher(matched).matches();
                matcher.appendReplacement(rewritten,
                        Matcher.quoteReplacement(replaceText(Boolean.valueOf(isAcronym))));
                LOG.log(Level.FINE, "Replaced \"{0}\"", matched);
            }
        }
        matcher.appendTail(rewritten);
        return rewritten.toString();
    }

    /**
     * Chooses the placeholder for a replaced span.
     *
     * @param bool {@code true} for an acronym phrase, {@code false} for a citation
     * @return {@code "_phraseACR_"} or {@code "_citatION_"}
     */
    public String replaceText(Boolean bool) {
        return bool.booleanValue() ? "_phraseACR_" : "_citatION_";
    }

    /**
     * Inserts a space before a sentence-ending full stop, so that
     * {@code "word. Next"} becomes {@code "word . Next"}.
     *
     * <p>A full stop is left attached when the word plus full stop is one of
     * the known abbreviations (e.g. {@code "Fig."}).
     *
     * @param charSequence text to rewrite
     * @return the text with sentence-ending full stops detached
     */
    public String forceStop(CharSequence charSequence) {
        Matcher matcher = SENTENCE_END_PATTERN.matcher(charSequence);
        StringBuffer rewritten = new StringBuffer(charSequence.length());
        while (matcher.find()) {
            if (ABBREVIATIONS.contains(matcher.group(4) + '.')) {
                // Skipped matches are copied through unchanged by the next append call.
                continue;
            }
            matcher.appendReplacement(rewritten,
                    Matcher.quoteReplacement(matcher.group(2) + ' ' + matcher.group(5)));
        }
        matcher.appendTail(rewritten);
        return rewritten.toString();
    }
}
