package pl.edu.icm.cermine.bibref;

import com.google.common.collect.Lists;
import com.itextpdf.text.xml.xmp.PdfSchema;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.io.FileUtils;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.springframework.beans.propertyeditors.StringArrayPropertyEditor;
import org.xml.sax.InputSource;
import pl.edu.icm.cermine.ContentExtractor;
import pl.edu.icm.cermine.bibref.model.BibEntry;
import pl.edu.icm.cermine.bibref.model.BibEntryFieldType;
import pl.edu.icm.cermine.bibref.parsing.model.Citation;
import pl.edu.icm.cermine.bibref.parsing.model.CitationToken;
import pl.edu.icm.cermine.bibref.parsing.model.CitationTokenLabel;
import pl.edu.icm.cermine.bibref.parsing.tools.CitationUtils;
import pl.edu.icm.cermine.bibref.parsing.tools.NlmCitationExtractor;
import pl.edu.icm.cermine.bibref.transformers.BibEntryToNLMConverter;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;

/* loaded from: input_file:pl/edu/icm/cermine/bibref/CitationDatasetGenerator.class */
/**
 * Command-line tool that builds a small set of labelled citations.
 *
 * <p>For every PDF found under the input directory, the references extracted from the PDF by
 * {@link ContentExtractor} are paired with the citations read from the sibling {@code .nxml}
 * file. A randomly chosen citation from the NXML file is matched to a PDF reference when the
 * whitespace-normalized reference text contains the citation's whitespace-normalized title. The
 * token labels of the NXML citation are then copied onto the tokens of the PDF reference, and
 * the result is kept when at most {@link #MAX_TEXT_RATIO} of its characters remain labelled
 * {@link CitationTokenLabel#TEXT}.
 *
 * <p>The collected entries are appended to three files in the output directory:
 * {@value #OUT_NLM}, {@value #OUT_BT} and {@value #OUT_TXT}.
 */
public final class CitationDatasetGenerator {
    private static final String OUT_NLM = "citations.nxml";
    private static final String OUT_BT = "citations.bibtex";
    private static final String OUT_TXT = "citations.txt";

    /** Upper bound on the number of entries collected over all input files. */
    private static final int MAX_SET_SIZE = 100;

    /** Upper bound on the number of random citation picks made per input file. */
    private static final int TRIES_PER_FILE = 10;

    /** Fixed seed, so that repeated runs over the same input make the same random picks. */
    private static final long RANDOM_SEED = 5268L;

    /** Largest accepted share of characters that are still labelled {@code TEXT}. */
    private static final double MAX_TEXT_RATIO = 0.25d;

    // NOTE(review): the three aliases below point at constants of unrelated libraries. They look
    // like decompiler substitutions for plain string literals (presumably "pdf", " " and ",");
    // confirm the values and replace them with literals.
    private static final String PDF_EXTENSION = PdfSchema.DEFAULT_XPATH_ID;
    private static final String SPACE = HelpFormatter.DEFAULT_LONG_OPT_SEPARATOR;
    private static final String COMMA = StringArrayPropertyEditor.DEFAULT_SEPARATOR;

    /**
     * Runs the generator.
     *
     * @param strArr exactly two arguments: the input directory (searched recursively for PDF
     *               files, each with a sibling {@code .nxml} file) and the output directory
     * @throws JDOMException declared for the NXML citation extraction
     * @throws IOException if an input file cannot be read or an output file cannot be written
     * @throws AnalysisException declared for the PDF content extraction
     * @throws TransformationException declared for the conversion of entries to NLM
     */
    public static void main(String[] strArr) throws JDOMException, IOException, AnalysisException, TransformationException {
        if (strArr.length != 2) {
            System.out.println("Usage: CitationDatasetGenerator <INPUT_DIR> <OUTPUT_DIR>");
            System.exit(1);
        }
        HashSet<BibEntry> entries = new HashSet<BibEntry>();
        Collection<File> pdfFiles = FileUtils.listFiles(new File(strArr[0]), new String[]{PDF_EXTENSION}, true);
        Random random = new Random(RANDOM_SEED);
        for (File pdfFile : pdfFiles) {
            if (entries.size() >= MAX_SET_SIZE) {
                break;
            }
            System.out.println("Processing: " + pdfFile.getPath());

            // References as extracted from the PDF itself.
            List<BibEntry> references;
            FileInputStream pdfStream = new FileInputStream(pdfFile);
            try {
                ContentExtractor extractor = new ContentExtractor();
                extractor.setPDF(pdfStream);
                references = extractor.getReferences();
            } finally {
                pdfStream.close();
            }

            // Labelled citations from the NXML file stored next to the PDF.
            List<Citation> groundTruth;
            FileInputStream nxmlStream = new FileInputStream(companionNxml(pdfFile));
            try {
                groundTruth = NlmCitationExtractor.extractCitations(new InputSource(nxmlStream));
            } finally {
                nxmlStream.close();
            }

            HashSet<Integer> triedIndices = new HashSet<Integer>();
            int added = 0;
            int tries = Math.min(groundTruth.size(), TRIES_PER_FILE);
            for (int attempt = 0; attempt < tries && entries.size() < MAX_SET_SIZE; attempt++) {
                int index = random.nextInt(groundTruth.size());
                // A repeated pick still consumes one attempt; it is simply skipped.
                if (!triedIndices.add(Integer.valueOf(index))) {
                    continue;
                }
                Citation truth = groundTruth.get(index);
                String title = CitationUtils.citationToBibref(truth).getFirstFieldValue(BibEntryFieldType.TITLE);
                if (title == null) {
                    continue;
                }
                String normalizedTitle = title.replaceAll("\\s+", SPACE);
                for (BibEntry reference : references) {
                    // Several references may match one title, so the cap is re-checked per
                    // reference; otherwise the set could grow past MAX_SET_SIZE.
                    if (entries.size() >= MAX_SET_SIZE) {
                        break;
                    }
                    String referenceText = reference.getText().replaceAll("\\s+", SPACE);
                    if (!referenceText.contains(normalizedTitle)) {
                        continue;
                    }
                    Citation parsed = CitationUtils.stringToCitation(referenceText);
                    List<CitationToken> tokens = parsed.getTokens();
                    projectLabels(truth, tokens);
                    smoothSingleCharacterTokens(tokens);
                    markInitialsAsGivenNames(tokens);
                    BibEntry entry = CitationUtils.citationToBibref(parsed);
                    // Count only entries that were not already in the set.
                    if (isMostlyLabelled(tokens) && entries.add(entry)) {
                        added++;
                    }
                }
            }
            System.out.println("Citations added: " + added);
        }

        // Resolve the output files inside the output directory, so that the directory argument
        // works with or without a trailing path separator.
        File outputDir = new File(strArr[1]);
        File nlmFile = new File(outputDir, OUT_NLM);
        File bibtexFile = new File(outputDir, OUT_BT);
        File textFile = new File(outputDir, OUT_TXT);
        BibEntryToNLMConverter converter = new BibEntryToNLMConverter();
        XMLOutputter outputter = new XMLOutputter(Format.getRawFormat());
        int nextId = 1;
        for (BibEntry entry : entries) {
            Element element = converter.convert(entry, new Object[0]);
            element.setAttribute("id", String.valueOf(nextId));
            nextId++;
            appendLine(nlmFile, outputter.outputString(element));
            appendLine(bibtexFile, entry.toBibTeX());
            appendLine(textFile, entry.getText());
        }
    }

    /**
     * Returns the NXML file expected next to the given PDF: same path, with the trailing
     * {@code .pdf} replaced by {@code .nxml}. Only the extension is rewritten, so a
     * {@code .pdf} fragment inside a directory name is left alone.
     */
    private static File companionNxml(File pdfFile) {
        String pdfPath = pdfFile.getPath();
        String pdfSuffix = ".pdf";
        if (pdfPath.endsWith(pdfSuffix)) {
            return new File(pdfPath.substring(0, pdfPath.length() - pdfSuffix.length()) + ".nxml");
        }
        // Not expected for files selected by extension; fall back to plain substitution.
        return new File(pdfPath.replace(pdfSuffix, ".nxml"));
    }

    /**
     * Resets every token to {@code TEXT} and then, for each non-punctuation token, copies the
     * label of the first not-yet-used token of {@code truth} that has the same text. Each token
     * of {@code truth} is used at most once.
     */
    private static void projectLabels(Citation truth, List<CitationToken> tokens) {
        List<CitationToken> unused = Lists.newArrayList(truth.getTokens());
        for (CitationToken token : tokens) {
            token.setLabel(CitationTokenLabel.TEXT);
            if (isPunctuation(token.getText())) {
                continue;
            }
            CitationToken match = null;
            for (CitationToken candidate : unused) {
                if (candidate.getText().equals(token.getText())) {
                    match = candidate;
                    break;
                }
            }
            if (match != null) {
                token.setLabel(match.getLabel());
                unused.remove(match);
            }
        }
    }

    /** Tells whether the text is one of the punctuation tokens that never receive a label. */
    private static boolean isPunctuation(String text) {
        return text.equals(COMMA) || text.equals(".") || text.equals(":") || text.equals(";");
    }

    /**
     * Gives an inner one-character token the label of its neighbours when both neighbours share
     * the same label and that label is not {@code TEXT}.
     */
    private static void smoothSingleCharacterTokens(List<CitationToken> tokens) {
        for (int i = 1; i < tokens.size() - 1; i++) {
            CitationToken previous = tokens.get(i - 1);
            CitationToken current = tokens.get(i);
            CitationToken next = tokens.get(i + 1);
            if (current.getText().length() == 1
                    && previous.getLabel().equals(next.getLabel())
                    && !CitationTokenLabel.TEXT.equals(next.getLabel())) {
                current.setLabel(previous.getLabel());
            }
        }
    }

    /**
     * Labels single capital letters, and a dot directly following one, as {@code GIVENNAME}.
     * Scanning starts at the second token and stops at the first {@code ARTICLE_TITLE} token.
     */
    private static void markInitialsAsGivenNames(List<CitationToken> tokens) {
        for (int i = 1; i < tokens.size() - 1; i++) {
            CitationToken previous = tokens.get(i - 1);
            CitationToken current = tokens.get(i);
            if (current.getLabel().equals(CitationTokenLabel.ARTICLE_TITLE)) {
                break;
            }
            boolean isInitial = current.getText().matches("[A-Z]");
            boolean isDotAfterInitial = previous.getText().matches("[A-Z]") && current.getText().equals(".");
            if (isInitial || isDotAfterInitial) {
                current.setLabel(CitationTokenLabel.GIVENNAME);
            }
        }
    }

    /**
     * Tells whether the characters of tokens still labelled {@code TEXT} make up at most
     * {@link #MAX_TEXT_RATIO} of all token characters.
     */
    private static boolean isMostlyLabelled(List<CitationToken> tokens) {
        int textLength = 0;
        int totalLength = 0;
        for (CitationToken token : tokens) {
            int length = token.getText().length();
            totalLength += length;
            if (token.getLabel().equals(CitationTokenLabel.TEXT)) {
                textLength += length;
            }
        }
        return textLength <= MAX_TEXT_RATIO * totalLength;
    }

    /** Appends the content followed by a newline to the file, encoded as UTF-8. */
    private static void appendLine(File file, String content) throws IOException {
        FileUtils.writeStringToFile(file, content, "UTF-8", true);
        FileUtils.writeStringToFile(file, "\n", "UTF-8", true);
    }

    private CitationDatasetGenerator() {
    }
}
