package pl.edu.icm.cermine.metadata.affiliation;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.jdom.Element;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import pl.edu.icm.cermine.bibref.model.BibEntry;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.metadata.affiliation.tools.AffiliationCRFTokenClassifier;
import pl.edu.icm.cermine.metadata.affiliation.tools.AffiliationFeatureExtractor;
import pl.edu.icm.cermine.metadata.affiliation.tools.AffiliationTokenizer;
import pl.edu.icm.cermine.metadata.model.AffiliationLabel;
import pl.edu.icm.cermine.metadata.model.DocumentAffiliation;
import pl.edu.icm.cermine.metadata.transformers.MetadataToNLMConverter;
import pl.edu.icm.cermine.parsing.model.Token;
import pl.edu.icm.cermine.parsing.tools.ParsableStringParser;
import pl.edu.icm.cermine.tools.ResourcesReader;

/* loaded from: input_file:pl/edu/icm/cermine/metadata/affiliation/CRFAffiliationParser.class */
/**
 * Affiliation parser backed by an {@link AffiliationCRFTokenClassifier}.
 *
 * <p>Parsing tokenizes the raw affiliation text, computes token features, lets the
 * classifier label the tokens and finally merges the tokens. When no country was
 * found, the raw text is checked against lists of state names and state codes and,
 * on a match, {@code ", USA"} is appended to the affiliation.
 */
public class CRFAffiliationParser implements ParsableStringParser<DocumentAffiliation> {
    private static final Logger LOGGER = Logger.getLogger(CRFAffiliationParser.class.getName());

    /** Affiliations with raw text longer than this are not classified; their tokens stay labelled as text. */
    private static final int MAX_LENGTH = 3000;

    /** State codes are only searched for in this many trailing characters of the raw text. */
    private static final int STATE_CODE_TAIL_LENGTH = 20;

    /** Lookbehind/lookahead ensuring a matched term is not embedded in a longer alphanumeric run. */
    private static final String NOT_PRECEDED_BY_ALNUM = "(?<![0-9a-zA-Z])";
    private static final String NOT_FOLLOWED_BY_ALNUM = "(?![0-9a-zA-Z])";

    private static final String DEFAULT_MODEL_FILE = "/pl/edu/icm/cermine/metadata/affiliation/acrf-affiliations-pubmed.ser.gz";
    private static final String DEFAULT_COMMON_WORDS_FILE = "/pl/edu/icm/cermine/metadata/affiliation/common-words-affiliations-pubmed.txt";
    private static final String STATES_FILE = "/pl/edu/icm/cermine/metadata/affiliation/features/states.txt";
    private static final String STATE_CODES_FILE = "/pl/edu/icm/cermine/metadata/affiliation/features/state_codes.txt";

    private final AffiliationTokenizer tokenizer;
    private final AffiliationFeatureExtractor featureExtractor;
    private final AffiliationCRFTokenClassifier classifier;

    /**
     * Reads a classpath resource line by line.
     *
     * @param resourcePath classpath location of a UTF-8 text resource
     * @return the lines of the resource, in file order
     * @throws AnalysisException if the resource does not exist or cannot be read
     */
    private static List<String> loadWords(String resourcePath) throws AnalysisException {
        InputStream stream = CRFAffiliationParser.class.getResourceAsStream(resourcePath);
        if (stream == null) {
            throw new AnalysisException("Resource not found: " + resourcePath);
        }
        // try-with-resources closes the stream and the reader on every path, including read failures.
        try (InputStream in = stream;
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
            List<String> words = new ArrayList<String>();
            String line = reader.readLine();
            while (line != null) {
                words.add(line);
                line = reader.readLine();
            }
            return words;
        } catch (IOException e) {
            throw new AnalysisException("An exception occured when the common word list " + resourcePath + " was being read: " + e);
        }
    }

    /**
     * Creates a parser using the given classpath resources.
     *
     * @param str  classpath location of the common words list handed to the feature extractor
     * @param str2 classpath location of the model handed to the token classifier
     * @throws AnalysisException if the common words list cannot be loaded, or if thrown
     *                           while creating the classifier
     */
    public CRFAffiliationParser(String str, String str2) throws AnalysisException {
        List<String> commonWords = loadWords(str);
        this.tokenizer = new AffiliationTokenizer();
        this.featureExtractor = new AffiliationFeatureExtractor(commonWords);
        // NOTE(review): the model stream is not closed here; presumably the classifier
        // consumes (and possibly closes) it - confirm against AffiliationCRFTokenClassifier.
        this.classifier = new AffiliationCRFTokenClassifier(CRFAffiliationParser.class.getResourceAsStream(str2));
    }

    /**
     * Creates a parser using the default common words list and the default model.
     *
     * @throws AnalysisException if the default resources cannot be loaded
     */
    public CRFAffiliationParser() throws AnalysisException {
        this(DEFAULT_COMMON_WORDS_FILE, DEFAULT_MODEL_FILE);
    }

    /**
     * Tokenizes and labels the given affiliation in place.
     *
     * <p>All tokens are first labelled {@link AffiliationLabel#TEXT}. Affiliations whose raw
     * text exceeds {@link #MAX_LENGTH} characters are only merged, not classified. Otherwise
     * features are calculated, tokens are classified and merged, and if the affiliation has
     * no country but mentions a listed state, {@code ", USA"} is appended.
     *
     * @param documentAffiliation the affiliation to parse; modified in place
     * @throws AnalysisException if thrown by the classification steps
     */
    @Override
    public void parse(DocumentAffiliation documentAffiliation) throws AnalysisException {
        documentAffiliation.setTokens(this.tokenizer.tokenize(documentAffiliation.getRawText()));
        for (Token<AffiliationLabel> token : documentAffiliation.getTokens()) {
            token.setLabel(AffiliationLabel.TEXT);
        }
        if (documentAffiliation.getRawText().length() > MAX_LENGTH) {
            documentAffiliation.mergeTokens();
            return;
        }
        this.featureExtractor.calculateFeatures(documentAffiliation);
        this.classifier.classify(documentAffiliation.getTokens());
        documentAffiliation.mergeTokens();
        if (documentAffiliation.getCountry() == null && mentionsListedState(documentAffiliation.getRawText())) {
            appendUsa(documentAffiliation);
        }
    }

    /**
     * Checks whether the text contains a listed state name anywhere, or a listed state
     * code within its last {@link #STATE_CODE_TAIL_LENGTH} characters.
     *
     * @return {@code true} on a match; {@code false} when nothing matches or the lists
     *         cannot be read (the failure is logged)
     */
    private static boolean mentionsListedState(String rawText) {
        try {
            // Both lists are read up front, so a failure on either one disables the whole check.
            List<String> states = ResourcesReader.readLinesAsList(STATES_FILE, ResourcesReader.TRIM_TRANSFORMER);
            List<String> stateCodes = ResourcesReader.readLinesAsList(STATE_CODES_FILE, ResourcesReader.TRIM_TRANSFORMER);
            if (containsStandaloneTerm(states, rawText)) {
                return true;
            }
            String tail = rawText.substring(Math.max(0, rawText.length() - STATE_CODE_TAIL_LENGTH));
            return containsStandaloneTerm(stateCodes, tail);
        } catch (TransformationException e) {
            // Best-effort heuristic: report the problem instead of silently dropping it.
            LOGGER.log(Level.WARNING, "Could not read state lists; skipping the USA country fallback", e);
            return false;
        }
    }

    /**
     * Returns whether any of the terms occurs in the text without an ASCII letter or digit
     * directly before or after it.
     */
    private static boolean containsStandaloneTerm(List<String> terms, String text) {
        for (String term : terms) {
            if (term == null || term.isEmpty()) {
                // An empty term would yield a pattern that matches almost any text.
                continue;
            }
            // NOTE(review): terms are used as regex fragments, unescaped, as before.
            if (Pattern.compile(NOT_PRECEDED_BY_ALNUM + term + NOT_FOLLOWED_BY_ALNUM).matcher(text).find()) {
                return true;
            }
        }
        return false;
    }

    /** Appends {@code ", USA"} to the raw text together with a comma token and a country token. */
    private static void appendUsa(DocumentAffiliation affiliation) {
        int length = affiliation.getRawText().length();
        affiliation.setRawText(affiliation.getRawText() + ", USA");
        affiliation.addToken(new Token<AffiliationLabel>(",", length, length + 1, AffiliationLabel.TEXT));
        // NOTE(review): "USA" occupies [length + 2, length + 5) in the new raw text, yet the
        // offsets below are one position further right. Kept as-is; confirm how Token
        // offsets are consumed before changing.
        affiliation.addToken(new Token<AffiliationLabel>("USA", length + 3, length + 6, AffiliationLabel.COUN));
    }

    /**
     * Parses an affiliation string and converts the result with {@link MetadataToNLMConverter}.
     *
     * @param str the affiliation text
     * @return the element produced by {@code MetadataToNLMConverter.convertAffiliation}
     * @throws AnalysisException       if parsing fails
     * @throws TransformationException if the conversion fails
     */
    @Override
    public Element parse(String str) throws AnalysisException, TransformationException {
        DocumentAffiliation affiliation = new DocumentAffiliation(str);
        parse(affiliation);
        MetadataToNLMConverter converter = new MetadataToNLMConverter();
        return converter.convertAffiliation(affiliation);
    }

    /**
     * Command-line entry point: parses the affiliation given by the option named
     * {@code BibEntry.FIELD_AFFILIATION} and prints the result as pretty-formatted XML.
     * Prints usage to standard error and exits with status 1 when the option is missing.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) throws ParseException, AnalysisException, TransformationException {
        Options options = new Options();
        options.addOption(BibEntry.FIELD_AFFILIATION, true, "reference text");
        String affiliationText = new DefaultParser().parse(options, strArr).getOptionValue(BibEntry.FIELD_AFFILIATION);
        if (affiliationText == null) {
            System.err.println("Usage: CRFAffiliationParser -affiliation <affiliation text>\n\nTool for extracting metadata from affiliation strings.\n\nArguments:\n  -affiliation            the text of the affiliation\n");
            System.exit(1);
        }
        Element parsed = new CRFAffiliationParser().parse(affiliationText);
        XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
        System.out.println(outputter.outputString(parsed));
    }
}
