package pl.edu.icm.cermine.metadata.affiliation;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.jdom.Element;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import pl.edu.icm.cermine.bibref.model.BibEntry;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.metadata.affiliation.tools.AffiliationCRFTokenClassifier;
import pl.edu.icm.cermine.metadata.affiliation.tools.AffiliationFeatureExtractor;
import pl.edu.icm.cermine.metadata.affiliation.tools.AffiliationTokenizer;
import pl.edu.icm.cermine.metadata.model.AffiliationLabel;
import pl.edu.icm.cermine.metadata.model.DocumentAffiliation;
import pl.edu.icm.cermine.metadata.transformers.DocumentMetadataToNLMElementConverter;
import pl.edu.icm.cermine.parsing.model.Token;
import pl.edu.icm.cermine.parsing.tools.ParsableStringParser;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.5-SNAPSHOT.jar:pl/edu/icm/cermine/metadata/affiliation/CRFAffiliationParser.class */
/**
 * Parses affiliation strings by tokenizing them, computing token features and
 * labelling the tokens with a CRF classifier.
 *
 * <p>The parser is configured with two classpath resources: a list of common
 * words (one per line) handed to the feature extractor, and a serialized model
 * handed to the token classifier.
 */
public class CRFAffiliationParser implements ParsableStringParser<DocumentAffiliation> {
    private static final String DEFAULT_MODEL_FILE = "/pl/edu/icm/cermine/metadata/affiliation/acrf-affiliations-pubmed.ser.gz";
    private static final String DEFAULT_COMMON_WORDS_FILE = "/pl/edu/icm/cermine/metadata/affiliation/common-words-affiliations-pubmed.txt";

    private final AffiliationTokenizer tokenizer;
    private final AffiliationFeatureExtractor featureExtractor;
    private final AffiliationCRFTokenClassifier classifier;

    /**
     * Reads a classpath resource line by line.
     *
     * @param wordsResource classpath location of the word list
     * @return the lines of the resource, in file order
     * @throws AnalysisException if the resource does not exist, cannot be read,
     *         or cannot be closed
     */
    private List<String> loadWords(String wordsResource) throws AnalysisException {
        InputStream stream = getClass().getResourceAsStream(wordsResource);
        if (stream == null) {
            throw new AnalysisException("Resource not found: " + wordsResource);
        }

        List<String> words = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            // The word list ships inside the jar, so its encoding must not depend on the
            // platform default charset. NOTE(review): UTF-8 is assumed here - confirm
            // against the bundled resource.
            reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                words.add(line);
            }
            return words;
        } catch (IOException readException) {
            throw new AnalysisException("An exception occured when the common word list " + wordsResource
                    + " was being read: " + readException);
        } finally {
            // As before, a failure to close is reported to the caller (and takes
            // precedence over a pending read failure).
            try {
                if (reader != null) {
                    reader.close();
                } else {
                    stream.close();
                }
            } catch (IOException closeException) {
                throw new AnalysisException("An exception occured when the stream was being closed: "
                        + closeException);
            }
        }
    }

    /**
     * Creates a parser from the given classpath resources.
     *
     * @param str classpath location of the common words list (one word per line)
     * @param str2 classpath location of the classifier model
     * @throws AnalysisException if either resource is missing, the word list
     *         cannot be read, or the classifier rejects the model
     */
    public CRFAffiliationParser(String str, String str2) throws AnalysisException {
        List<String> commonWords = loadWords(str);
        this.tokenizer = new AffiliationTokenizer();
        this.featureExtractor = new AffiliationFeatureExtractor(commonWords);

        // Fail with the same clear message as for the word list instead of handing
        // a null stream to the classifier.
        InputStream modelStream = getClass().getResourceAsStream(str2);
        if (modelStream == null) {
            throw new AnalysisException("Resource not found: " + str2);
        }
        this.classifier = new AffiliationCRFTokenClassifier(modelStream);
    }

    /**
     * Creates a parser using the default common words list and model.
     *
     * @throws AnalysisException if the default resources cannot be loaded
     */
    public CRFAffiliationParser() throws AnalysisException {
        this(DEFAULT_COMMON_WORDS_FILE, DEFAULT_MODEL_FILE);
    }

    /**
     * Tokenizes the affiliation's raw text, classifies the tokens and merges
     * them; the affiliation object is modified in place.
     *
     * @param documentAffiliation the affiliation to parse
     * @throws AnalysisException if parsing fails
     */
    @Override // pl.edu.icm.cermine.parsing.tools.ParsableStringParser
    public void parse(DocumentAffiliation documentAffiliation) throws AnalysisException {
        documentAffiliation.setTokens(this.tokenizer.tokenize(documentAffiliation.getRawText()));
        // Every token starts out labelled TEXT before features are computed and
        // the classifier runs.
        for (Token<AffiliationLabel> token : documentAffiliation.getTokens()) {
            token.setLabel(AffiliationLabel.TEXT);
        }
        this.featureExtractor.calculateFeatures(documentAffiliation);
        this.classifier.classify(documentAffiliation.getTokens());
        documentAffiliation.mergeTokens();
    }

    /**
     * Parses an affiliation string and converts the result to an XML element
     * using {@link DocumentMetadataToNLMElementConverter}.
     *
     * @param str the affiliation text
     * @return the element produced by the converter for the parsed affiliation
     * @throws AnalysisException if parsing fails
     * @throws TransformationException if the conversion to XML fails
     */
    @Override // pl.edu.icm.cermine.parsing.tools.ParsableStringParser
    public Element parse(String str) throws AnalysisException, TransformationException {
        DocumentAffiliation affiliation = new DocumentAffiliation(str);
        parse(affiliation);
        DocumentMetadataToNLMElementConverter converter = new DocumentMetadataToNLMElementConverter();
        return converter.convertAffiliation(affiliation);
    }

    /**
     * Command-line entry point: parses the affiliation given by the
     * {@code -affiliation} option and prints the resulting XML to standard
     * output. Prints usage and exits with status 1 when the option is missing.
     *
     * @param strArr command-line arguments
     * @throws ParseException if the command line cannot be parsed
     * @throws AnalysisException if the affiliation cannot be parsed
     * @throws TransformationException if the result cannot be converted to XML
     */
    public static void main(String[] strArr) throws ParseException, AnalysisException, TransformationException {
        Options options = new Options();
        options.addOption(BibEntry.FIELD_AFFILIATION, true, "affiliation text");
        String affiliationText = new GnuParser().parse(options, strArr).getOptionValue(BibEntry.FIELD_AFFILIATION);
        if (affiliationText == null) {
            System.err.println("Usage: CRFAffiliationParser -affiliation <affiliation text>\n\nTool for extracting metadata from affiliation strings.\n\nArguments:\n  -affiliation            the text of the affiliation\n");
            System.exit(1);
        }
        Element parsed = new CRFAffiliationParser().parse(affiliationText);
        XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
        System.out.println(outputter.outputString(parsed));
    }
}
