package pl.edu.icm.yadda.analysis.bibref.parsing.tools;

import groovy.ui.text.StructuredSyntaxHandler;
import info.aduna.xml.XMLReaderFactory;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Text;
import org.jdom.filter.Filter;
import org.jdom.input.SAXBuilder;
import org.xml.sax.InputSource;
import pl.edu.icm.yadda.analysis.bibref.BibEntry;
import pl.edu.icm.yadda.analysis.bibref.parsing.model.Citation;
import pl.edu.icm.yadda.analysis.bibref.parsing.model.CitationToken;
import pl.edu.icm.yadda.analysis.bibref.parsing.model.CitationTokenLabel;
import pl.edu.icm.yadda.imports.transformers.NlmToYConstants;
import pl.edu.icm.yadda.tools.mdi.MetadataIndexConstants;
import pl.edu.icm.yadda.ui.search.SimpleRequestCodec;

/* loaded from: input_file:WEB-INF/lib/yadda-analysis-impl-1.12.0.jar:pl/edu/icm/yadda/analysis/bibref/parsing/tools/NlmCitationExtractor.class */
public class NlmCitationExtractor {
    public static String TAG_CITATION = "mixed-citation";
    public static String KEY_TEXT = "text";
    public static List<String> EXP_TAGS = Arrays.asList("string-name", StructuredSyntaxHandler.ITALIC, StructuredSyntaxHandler.BOLD, "sup", "sub", "styled-content", "monospace", "sans-serif", StructuredSyntaxHandler.UNDERLINE, "xref", "inline-formula");
    private static final Map<String, CitationTokenLabel> TAGS_LABEL_MAP = new HashMap();

    public static Set<Citation> extractCitations(InputSource inputSource) throws JDOMException, IOException {
        Iterator descendants = new SAXBuilder(XMLReaderFactory.XERCES_SAXPARSER).build(inputSource).getDescendants(new Filter() { // from class: pl.edu.icm.yadda.analysis.bibref.parsing.tools.NlmCitationExtractor.1
            @Override // org.jdom.filter.Filter
            public boolean matches(Object obj) {
                return (obj instanceof Element) && ((Element) obj).getName().equals(NlmCitationExtractor.TAG_CITATION);
            }
        });
        HashSet hashSet = new HashSet();
        while (descendants.hasNext()) {
            Citation citation = new Citation();
            readElement((Element) descendants.next(), citation);
            hashSet.add(citation);
        }
        return hashSet;
    }

    private static void readElement(Element element, Citation citation) {
        for (Object obj : element.getContent()) {
            if (obj instanceof Text) {
                String text = ((Text) obj).getText();
                if (text.matches("^[\\s]*$")) {
                    citation.appendText(" ");
                } else {
                    for (CitationToken citationToken : CitationUtils.stringToCitation(text).getTokens()) {
                        citationToken.setStartIndex(citationToken.getStartIndex() + citation.getText().length());
                        citationToken.setEndIndex(citationToken.getEndIndex() + citation.getText().length());
                        citationToken.setLabel(TAGS_LABEL_MAP.get(KEY_TEXT));
                        citation.addToken(citationToken);
                    }
                    citation.appendText(text);
                }
            } else if (obj instanceof Element) {
                Element element2 = (Element) obj;
                String name = element2.getName();
                if (TAGS_LABEL_MAP.containsKey(name)) {
                    for (CitationToken citationToken2 : CitationUtils.stringToCitation(element2.getValue()).getTokens()) {
                        citationToken2.setStartIndex(citationToken2.getStartIndex() + citation.getText().length());
                        citationToken2.setEndIndex(citationToken2.getEndIndex() + citation.getText().length());
                        citationToken2.setLabel(TAGS_LABEL_MAP.get(name));
                        citation.addToken(citationToken2);
                    }
                    citation.appendText(element2.getValue());
                } else if (EXP_TAGS.contains(name)) {
                    readElement(element2, citation);
                }
            }
        }
    }

    static {
        TAGS_LABEL_MAP.put("article-title", CitationTokenLabel.ARTICLE_TITLE);
        TAGS_LABEL_MAP.put(NlmToYConstants.AT_CONF_NAME, CitationTokenLabel.CONF);
        TAGS_LABEL_MAP.put("named-content", CitationTokenLabel.CONTENT);
        TAGS_LABEL_MAP.put(BibEntry.FIELD_EDITION, CitationTokenLabel.EDITION);
        TAGS_LABEL_MAP.put("given-names", CitationTokenLabel.GIVENNAME);
        TAGS_LABEL_MAP.put(MetadataIndexConstants.F_ISSUE, CitationTokenLabel.ISSUE);
        TAGS_LABEL_MAP.put("fpage", CitationTokenLabel.PAGEF);
        TAGS_LABEL_MAP.put("lpage", CitationTokenLabel.PAGEL);
        TAGS_LABEL_MAP.put("publisher-loc", CitationTokenLabel.PUBLISHER_LOC);
        TAGS_LABEL_MAP.put("publisher-name", CitationTokenLabel.PUBLISHER_NAME);
        TAGS_LABEL_MAP.put(SimpleRequestCodec.FIELD_SCHEME, CitationTokenLabel.SC);
        TAGS_LABEL_MAP.put("series", CitationTokenLabel.SERIES);
        TAGS_LABEL_MAP.put("source", CitationTokenLabel.SOURCE);
        TAGS_LABEL_MAP.put("surname", CitationTokenLabel.SURNAME);
        TAGS_LABEL_MAP.put("text", CitationTokenLabel.TEXT);
        TAGS_LABEL_MAP.put("uri", CitationTokenLabel.URI);
        TAGS_LABEL_MAP.put("volume", CitationTokenLabel.VOLUME);
        TAGS_LABEL_MAP.put("volume-series", CitationTokenLabel.VOLUME_SERIES);
        TAGS_LABEL_MAP.put("year", CitationTokenLabel.YEAR);
    }
}
