package pl.edu.icm.sedno.mein;

import com.google.common.base.Joiner;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MarkerFactory;
import pl.edu.icm.sedno.converter.ConverterUtils;
import pl.edu.icm.sedno.converter.YModelToolbox;
import pl.edu.icm.sedno.exception.SednoSystemException;
import pl.edu.icm.sedno.mein.model.Author;
import pl.edu.icm.sedno.mein.model.Book;
import pl.edu.icm.sedno.mein.model.Institution;
import pl.edu.icm.sedno.mein.model.Publication;
import pl.edu.icm.sedno.mein.model.Work;
import pl.edu.icm.sedno.model.dict.StandardSourceSystem;
import pl.edu.icm.yadda.bwmeta.model.YAffiliation;
import pl.edu.icm.yadda.bwmeta.model.YAncestor;
import pl.edu.icm.yadda.bwmeta.model.YAttribute;
import pl.edu.icm.yadda.bwmeta.model.YContributor;
import pl.edu.icm.yadda.bwmeta.model.YDate;
import pl.edu.icm.yadda.bwmeta.model.YElement;
import pl.edu.icm.yadda.bwmeta.model.YExportable;
import pl.edu.icm.yadda.bwmeta.model.YLanguage;
import pl.edu.icm.yadda.bwmeta.transformers.YToBwmeta2_1Transformer;
import pl.edu.icm.yadda.metadata.transformers.TransformationException;

/* loaded from: input_file:pl/edu/icm/sedno/mein/BWmetaConverter.class */
public class BWmetaConverter {
    /** Prefix prepended to the generated work id to form the element's external identifier. */
    private static final String EXT_KEJN_ID_PREFIX = "http://sedno.ceon.pl/-/element/kejn-import/";
    /** Classpath location of the word list loaded into {@link #FIRST_NAMES} by the static initializer. */
    private static final String COMMON_NAMES_RESOURCE = "/common-names.txt";
    /** Identifier of the converted subset; becomes part of every generated work id. */
    private String subsetId;
    /** Institutions by identifier, used to describe affiliations recovered from the lookup map. */
    private Map<Integer, Institution> institutions;
    /** Author key (see genKey) mapped to the ids of the units that author was confidently assigned to. */
    private Map<String, Set<Integer>> affiliations = new HashMap<String, Set<Integer>>();
    /** Number of lookups that found more than one unit id for an author (such hits are not used). */
    private int multiHits;
    /** Number of lookups that found exactly one unit id for an author. */
    private int singleHits;
    /** Number of converted authors whose guess was {@code Author.Guess.FromClues}. */
    private int cluesUsed;
    /** Number of converted works for which at least one author affiliation was established. */
    private int worksWithAffiliations;
    /** Separator pattern used to cut a single author string into name tokens. */
    private static final String NAME_SPLIT_RE = "(?<!\\.)[,;\\s]+|(?<=\\.)[,;\\s]*+(?!-?\\p{Lu}\\.)";
    /** Matches the name particles du, van, von, de, da, der and den when they stand as separate words. */
    private static final String NAME_PREPOSITIONS_RE = "(?<!\\p{L})(du|van|von|de|da|der|den)(?!\\p{L})";
    /** Matches a token made of initials, i.e. letters each followed by a period. */
    private static final String INITIALS_RE = "(\\p{L}\\.[ -]?)+\\p{L}?";
    /** Matches a letter followed by further letters in square brackets. */
    private static final String INITIAL_BRACKET_RE = "\\p{L}\\[\\p{L}+\\]";
    private static final Logger logger = LoggerFactory.getLogger(BWmetaConverter.class);
    /** Shared factory for the Y-model objects built by this converter. */
    private static final YModelToolbox y = new YModelToolbox();
    /** Lines of {@link #COMMON_NAMES_RESOURCE}; looked up with lower-cased name tokens. */
    private static final Set<String> FIRST_NAMES = new HashSet<String>();

    /* JADX INFO: Access modifiers changed from: package-private */
    /* renamed from: pl.edu.icm.sedno.mein.BWmetaConverter$1Check, reason: invalid class name */
    /* loaded from: input_file:pl/edu/icm/sedno/mein/BWmetaConverter$1Check.class */
    public final class C1Check {
        List<String> parts = Collections.emptyList();
        private int wrong = Integer.MAX_VALUE;

        C1Check() {
        }

        public void tokens(List<String> list) {
            if (list.isEmpty()) {
                return;
            }
            int i = 0;
            Iterator<String> it = list.iterator();
            while (it.hasNext()) {
                int size = BWmetaConverter.tokenize(BWmetaConverter.NAME_SPLIT_RE, it.next().replaceAll(BWmetaConverter.NAME_PREPOSITIONS_RE, "")).size();
                if (size < 2 || size > 3) {
                    i += size;
                }
            }
            if (i < this.wrong) {
                this.parts = list;
                this.wrong = i;
            }
        }
    }

    /**
     * @return the subset identifier that is embedded in generated work ids
     */
    public String getSubsetId() {
        return subsetId;
    }

    /**
     * Sets the subset identifier that is embedded in generated work ids.
     *
     * @param str the subset identifier
     */
    public void setSubsetId(String str) {
        subsetId = str;
    }

    /**
     * Supplies the institutions, keyed by identifier, that are consulted when an author's
     * affiliation is taken from the lookup map.
     *
     * @param map institutions by identifier
     */
    public void setInstitutions(Map<Integer, Institution> map) {
        institutions = map;
    }

    /**
     * Converts the given publications and serialises them with {@code transform}.
     *
     * @param list publications to convert
     * @return buffer holding the serialised output
     */
    public ByteArrayOutputStream format(List<Publication> list) {
        // Typed list instead of a raw ArrayList, so the call to transform() is checked by the compiler.
        List<YExportable> converted = new ArrayList<YExportable>(list.size());
        for (Publication publication : list) {
            converted.add(convert(publication));
        }
        return transform(converted);
    }

    /**
     * Serialises the given elements with {@link YToBwmeta2_1Transformer} into an in-memory buffer.
     *
     * <p>Failures are reported through the class logger rather than to {@code System.err};
     * as before, the method does not propagate them and returns whatever was written
     * (an empty buffer when the transformation itself failed).
     *
     * @param list elements to serialise
     * @return buffer with the serialised output, possibly empty on failure
     */
    private ByteArrayOutputStream transform(List<YExportable> list) {
        YToBwmeta2_1Transformer transformer = new YToBwmeta2_1Transformer();
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        try {
            // NOTE(review): getBytes() uses the platform default charset. If consumers expect a
            // fixed encoding (e.g. UTF-8) this should be made explicit - confirm against callers.
            output.write(transformer.write(list, new Object[0]).getBytes());
        } catch (IOException e) {
            logger.error("Failed to buffer the BWmeta output", e);
        } catch (TransformationException e) {
            logger.error("Failed to transform " + list.size() + " element(s) to BWmeta", e);
        }
        return output;
    }

    /**
     * Converts the given books and serialises them with {@code transform}.
     * Books with a blank title are skipped with a warning.
     *
     * @param list books to convert
     * @return buffer holding the serialised output
     */
    public ByteArrayOutputStream formatBooks(List<Book> list) {
        // Typed list instead of a raw ArrayList, so the call to transform() is checked by the compiler.
        List<YExportable> converted = new ArrayList<YExportable>(list.size());
        for (Book book : list) {
            if (StringUtils.isBlank(book.getTytul())) {
                logger.warn("Skipping book with empty title, id={}", Integer.valueOf(book.getId()));
                continue;
            }
            converted.add(convert(book));
        }
        return transform(converted);
    }

    /**
     * Logs, at INFO level, the four affiliation counters gathered while converting works.
     */
    public void logAffiliationResultsStats() {
        logger.info("Got {} hits with a single identifier.", singleHits);
        logger.info("Got {} hits with multiple identifiers (unused).", multiHits);
        logger.info("Got {} affiliations from non-count clues.", cluesUsed);
        logger.info("Got {} works with at least one affiliation.", worksWithAffiliations);
    }

    /**
     * Builds a work identifier of the form {@code <str>/<i>}.
     *
     * @param i   numeric id of the work
     * @param str prefix placed before the slash
     * @return the combined identifier
     */
    private static String generateWorkId(int i, String str) {
        StringBuilder id = new StringBuilder();
        id.append(str).append("/").append(i);
        return id.toString();
    }

    /**
     * Builds the element shared by all kinds of works: identifiers, name, publication date,
     * author contributors and affiliations.
     *
     * <p>For every parsed author a contributor is created. An author with a positive guess is
     * affiliated with the work's own unit; otherwise the lookup map filled by
     * {@code parseAuthors} is consulted and used only when it holds exactly one unit id.
     * When the work has parsed authors, the work's own unit is always added to the element's
     * affiliations.
     *
     * @param work the work being converted
     * @param str  prefix of the generated work id
     * @param str2 title used as the element's canonical name
     * @return the populated element
     */
    private YElement convertWork(Work work, String str, String str2) {
        String workId = generateWorkId(work.getId(), str);
        YElement element = new YElement(EXT_KEJN_ID_PREFIX + workId);
        element.addId(y.id("http://sedno.ceon.pl/-/id-scheme/kejn-import", workId));
        element.addName(y.canonicalName(YLanguage.Undetermined, ConverterUtils.ignoreInvalidChars(str2)));
        element.addDate(convertDate(work.getRok()));

        List<Author> parsedAuthors = work.getParsedAuthors();
        // Null guard: a work that never went through parseAuthors is treated as having no authors.
        if (parsedAuthors == null || parsedAuthors.isEmpty()) {
            return element;
        }

        Set<YAffiliation> workAffiliations = new HashSet<YAffiliation>();
        for (Author author : parsedAuthors) {
            String fullName = author.getFullName();
            // Particles are dropped and adjacent initials glued together before counting words.
            if (fullName.replaceAll(NAME_PREPOSITIONS_RE, "").trim().replaceAll("(?<=\\P{L}\\p{L}\\.)\\s+(?=\\p{L}\\.)", "").split("\\s+").length > 3) {
                logger.warn(MarkerFactory.getMarker("PARSING"), "\"{}\", overlong author name in: {}", fullName, str2);
            }
            YContributor contributor = new YContributor("author", false);
            if (author.getGuess() == Author.Guess.FromClues) {
                this.cluesUsed++;
            }
            if (author.getGuess().isPositive()) {
                String ref = Integer.toString(work.getIdJednostki());
                contributor.addAffiliationRef(ref);
                workAffiliations.add(affiliation(work, ref));
            } else {
                addKnownAffiliation(contributor, author, workAffiliations);
            }
            saveNames(contributor, author);
            element.addContributor(contributor);
        }
        // Counted before the work's own unit is added below, so only author-derived affiliations count.
        if (!workAffiliations.isEmpty()) {
            this.worksWithAffiliations++;
        }
        workAffiliations.add(affiliation(work, Integer.toString(work.getIdJednostki())));
        element.setAffiliations(workAffiliations);
        return element;
    }

    /**
     * Affiliates the contributor using the lookup map, provided the map holds exactly one
     * unit id for the author. Updates the single/multiple hit counters. A unit id without a
     * matching institution is logged and skipped instead of failing with a NullPointerException.
     */
    private void addKnownAffiliation(YContributor contributor, Author author, Set<YAffiliation> workAffiliations) {
        Set<Integer> knownUnits = this.affiliations.get(genKey(author));
        if (knownUnits == null) {
            return;
        }
        if (knownUnits.size() != 1) {
            this.multiHits++;
            return;
        }
        this.singleHits++;
        Integer unitId = knownUnits.iterator().next();
        Institution institution = this.institutions == null ? null : this.institutions.get(unitId);
        if (institution == null) {
            logger.warn("No institution data for id {}; affiliation of \"{}\" skipped", unitId, author.getFullName());
            return;
        }
        String ref = Integer.toString(unitId.intValue());
        contributor.addAffiliationRef(ref);
        workAffiliations.add(institutionAffiliation(ref, institution));
    }

    /** Builds the affiliation describing the given institution, carrying both of its identifiers. */
    private static YAffiliation institutionAffiliation(String ref, Institution institution) {
        YAffiliation result = new YAffiliation(ref, institution.getName());
        YAttribute surveyId = (YAttribute) y.attribute("http://yadda.icm.edu.pl/-/bwmeta/terms#id", Integer.valueOf(institution.getId())).addAttribute(y.attribute("http://sedno.ceon.pl/-/sedno/terms#SourceSystem", StandardSourceSystem.KEJN_SURVEY.getItem()));
        result = (YAffiliation) result.addAttribute(surveyId);
        YAttribute naukaPolskaId = (YAttribute) y.attribute("http://yadda.icm.edu.pl/-/bwmeta/terms#id", Integer.valueOf(institution.getNPId())).addAttribute(y.attribute("http://sedno.ceon.pl/-/sedno/terms#SourceSystem", StandardSourceSystem.NAUKA_POLSKA.getItem()));
        return (YAffiliation) result.addAttribute(naukaPolskaId);
    }

    /**
     * Builds the affiliation describing the work's own unit.
     *
     * <p>The affiliation carries two id attributes, each tagged with its source system:
     * the unit id (KEJN_SURVEY) and the institution's NP id (NAUKA_POLSKA).
     *
     * @param work work whose unit is described
     * @param str  identifier given to the affiliation
     * @return the affiliation
     */
    private static YAffiliation affiliation(Work work, String str) {
        final String idTerm = "http://yadda.icm.edu.pl/-/bwmeta/terms#id";
        final String sourceTerm = "http://sedno.ceon.pl/-/sedno/terms#SourceSystem";

        YAffiliation result = new YAffiliation(str, work.getNazwaJednostki());

        YAttribute surveyId = (YAttribute) y.attribute(idTerm, Integer.valueOf(work.getIdJednostki()))
                .addAttribute(y.attribute(sourceTerm, StandardSourceSystem.KEJN_SURVEY.getItem()));
        result = (YAffiliation) result.addAttribute(surveyId);

        YAttribute naukaPolskaId = (YAttribute) y.attribute(idTerm, Integer.valueOf(work.getInstitutionNPId()))
                .addAttribute(y.attribute(sourceTerm, StandardSourceSystem.NAUKA_POLSKA.getItem()));
        return (YAffiliation) result.addAttribute(naukaPolskaId);
    }

    /**
     * Parses the author string of every work, stores the result on the work and records
     * confidently assigned authors in the affiliation lookup map.
     *
     * <p>A work with exactly one parsed author whose guess is still {@code NoneSoFar} has that
     * guess upgraded to {@code SingleConribution}. Authors whose guess is {@code FromCounts}
     * or {@code SingleConribution} are registered under their key with the work's unit id.
     * Map statistics are logged at the end.
     *
     * @param list works to process
     */
    public void parseAuthors(List<? extends Work> list) {
        for (Work work : list) {
            List<Author> authors = splitAuthors(work.getAutorzy(), work.getLiczbaAutorow(), work.getLiczbaAutorowJednostki());
            if (authors.size() == 1) {
                Author only = authors.get(0);
                if (only.getGuess() == Author.Guess.NoneSoFar) {
                    only.setGuess(Author.Guess.SingleConribution);
                    logger.debug("A single contribution with no other clues: {}", only);
                }
            }
            work.setParsedAuthors(authors);
            for (Author author : authors) {
                Author.Guess guess = author.getGuess();
                if (guess != Author.Guess.FromCounts && guess != Author.Guess.SingleConribution) {
                    continue;
                }
                String key = genKey(author);
                Set<Integer> unitIds = this.affiliations.get(key);
                if (unitIds == null) {
                    // Register the set once; later hits only add to it.
                    unitIds = new HashSet<Integer>();
                    this.affiliations.put(key, unitIds);
                }
                unitIds.add(Integer.valueOf(work.getIdJednostki()));
            }
        }
        logAffiliationMapStats();
    }

    /**
     * Logs how many keys of the affiliation lookup map point to exactly one unit id and how
     * many point to any other number of ids.
     */
    private void logAffiliationMapStats() {
        int single = 0;
        for (Set<Integer> unitIds : this.affiliations.values()) {
            if (unitIds.size() == 1) {
                single++;
            }
        }
        int multiple = this.affiliations.size() - single;
        logger.info("Got {} keys for single affiliations.", single);
        logger.info("Got {} keys for multiple affiliations.", multiple);
    }

    /**
     * Derives the lookup key of an author: given and family name concatenated, lower-cased,
     * with everything that is not a letter removed.
     *
     * <p>A missing name part contributes nothing to the key (previously a {@code null} part
     * was concatenated as the literal text "null"). Lower-casing uses {@link Locale#ROOT} so
     * the key does not depend on the JVM's default locale.
     *
     * @param author author to derive the key for
     * @return the key; empty when the names contain no letters
     */
    private static String genKey(Author author) {
        String combined = StringUtils.defaultString(author.getGiven()) + StringUtils.defaultString(author.getFamily());
        return combined.toLowerCase(Locale.ROOT).replaceAll("\\P{L}", "");
    }

    /**
     * Advances the iterator by up to {@code i} elements and returns that same iterator.
     *
     * <p>Implemented with a plain loop instead of Guava's {@code Iterators.skip}, which newer
     * Guava versions deprecate in favour of {@code Iterators.advance}. The contract is
     * unchanged: stops early when the iterator is exhausted, rejects a negative count.
     *
     * @param i  number of elements to skip; must not be negative
     * @param it iterator to advance; must not be null
     * @return {@code it}, positioned after the skipped elements
     * @throws IllegalArgumentException if {@code i} is negative
     * @throws NullPointerException     if {@code it} is null
     */
    private static Iterator<String> skipped(int i, Iterator<String> it) {
        if (it == null) {
            throw new NullPointerException("iterator");
        }
        if (i < 0) {
            throw new IllegalArgumentException("number to skip cannot be negative");
        }
        for (int done = 0; done < i && it.hasNext(); done++) {
            it.next();
        }
        return it;
    }

    /**
     * Turns one author string into an {@link Author}, deciding heuristically which tokens form
     * the first and which the second constructor argument.
     *
     * <p>The branches below are tried strictly in order; the first one that matches wins, so
     * their order is part of the behaviour.
     *
     * <p>NOTE(review): judging by {@code getGiven()}/{@code getFamily()} used elsewhere in this
     * class, the first constructor argument ({@code str3}) is presumably the given name(s) and
     * the second ({@code str2}) the family name - confirm against {@code Author}.
     *
     * @param str   raw text of a single author
     * @param guess affiliation guess to start from; {@code NoneSoFar} is upgraded to
     *              {@code FromClues} when the text ends with "*" or has no lower-case letter
     * @return a singleton set with the parsed author, or an empty set when the text yields no tokens
     */
    private static Set<Author> author(String str, Author.Guess guess) {
        String str2;
        String str3 = null;
        Joiner on = Joiner.on(' ');
        // Drop trailing whitespace and separators.
        String replaceAll = str.replaceAll("[\\s,;]+$", "");
        // A trailing asterisk or an all-upper-case name is taken as a clue.
        if (guess == Author.Guess.NoneSoFar && (replaceAll.endsWith("*") || !replaceAll.matches(".*\\p{Ll}.*"))) {
            guess = Author.Guess.FromClues;
        }
        List<String> list = tokenize(NAME_SPLIT_RE, replaceAll);
        // Strip leading non-letters and trailing characters other than letters and periods from each token.
        ListIterator<String> listIterator = list.listIterator();
        while (listIterator.hasNext()) {
            listIterator.set(listIterator.next().replaceAll("^[^\\p{L}]+", "").replaceAll("[^\\p{L}.]+$", ""));
        }
        if (list.isEmpty()) {
            return Collections.emptySet();
        }
        // str4 = first token, join = all tokens after the first,
        // str5 = last token, join2 = all tokens before the last.
        String str4 = list.get(0);
        String join = on.join(skipped(1, list.iterator()));
        int size = list.size() - 1;
        String join2 = on.join(Iterators.limit(list.iterator(), list.size() - 1));
        String str5 = list.get(size);
        if (list.size() == 1) {
            // Single token: used as str2 only; str3 stays null.
            str2 = str4;
        } else if (str4.matches(INITIAL_BRACKET_RE)) {
            // First token is a bracketed initial: brackets removed, rest becomes str2.
            str3 = str4.replaceAll("[\\]\\[]", "");
            str2 = join;
        } else if (str5.matches(INITIAL_BRACKET_RE)) {
            // Last token is a bracketed initial.
            str3 = str5.replaceAll("[\\]\\[]", "");
            str2 = join2;
        } else if (str4.matches("(?i)(?<!\\p{L})(du|van|von|de|da|der|den)(?!\\p{L})")) {
            // Name starts with a particle (case-insensitive): last token is str3, the rest str2.
            str3 = str5;
            str2 = join2;
        } else if (list.get(size).matches("(?i)(?<!\\p{L})(du|van|von|de|da|der|den)(?!\\p{L})")) {
            // Name ends with a particle: the token before it is str3; the particle is moved
            // in front of the remaining leading tokens to form str2.
            str3 = list.get(size - 1);
            str2 = str5 + " " + on.join(Iterators.limit(list.iterator(), list.size() - 2));
        } else if (replaceAll.matches(".*(?<!\\p{L})(du|van|von|de|da|der|den)(?!\\p{L}).*")) {
            // A lower-case particle somewhere in the middle: first token is str3, the rest str2.
            str3 = str4;
            str2 = join;
        } else if (str5.matches(INITIALS_RE)) {
            // Last token consists of initials: first token is str2, everything after it str3.
            str3 = join;
            str2 = str4;
        } else if (str4.matches(INITIALS_RE)) {
            // First token consists of initials: last token is str2, everything before it str3.
            str3 = join2;
            str2 = str5;
        } else if (FIRST_NAMES.contains(str5.toLowerCase())) {
            // Last token is a known first name.
            str3 = join;
            str2 = str4;
        } else if (FIRST_NAMES.contains(str4.toLowerCase())) {
            // First token is a known first name.
            str3 = join2;
            str2 = str5;
        } else if (replaceAll.contains(",")) {
            // Split at the first comma: text before it is str2, text after it str3 (not trimmed).
            String[] split = replaceAll.split(",", 2);
            str3 = split[1];
            str2 = split[0];
        } else {
            // Fallback: first token is str2, the rest str3.
            str3 = join;
            str2 = str4;
        }
        return Collections.singleton(new Author(str3, str2, guess));
    }

    /**
     * Splits a raw author string into individual authors.
     *
     * <p>The text is normalised first. If it contains an ellipsis element between commas it is
     * handled as subgroups. Otherwise, when the declared total is greater than 1 and not
     * smaller than the rounded-up unit count, a split yielding exactly the declared total is
     * tried; if none is found the best-scoring generic split is used.
     *
     * @param str raw author text; {@code null} yields an empty list
     * @param i   declared total number of authors
     * @param f   declared number of authors from the unit; rounded up before use
     * @return the parsed authors
     */
    private static List<Author> splitAuthors(String str, int i, float f) {
        if (str == null) {
            return Collections.emptyList();
        }
        int unitAuthors = (int) Math.ceil(f);
        String cleaned = normalize(str);

        String[] subgroups = cleaned.split(",\\s*…\\s*,");
        if (subgroups.length > 1) {
            return splitSubgroups(subgroups, unitAuthors);
        }

        boolean totalUsable = i > 1 && i >= unitAuthors;
        if (totalUsable) {
            List<String> byCount = splitExpected(cleaned, i);
            if (!byCount.isEmpty()) {
                return mapAuthors(byCount, unitAuthors);
            }
        }
        return mapAuthors(splitUnknown(cleaned), unitAuthors);
    }

    /**
     * Parses every author string of an already split list.
     *
     * <p>When the number of strings equals {@code i}, every author starts with the guess
     * {@code FromCounts}; otherwise with {@code NoneSoFar}. The result is then checked by
     * {@code validateGuess}.
     *
     * @param list author strings, one per author
     * @param i    expected number of authors with a positive guess
     * @return the parsed authors
     */
    private static List<Author> mapAuthors(List<String> list, int i) {
        Author.Guess guess = i == list.size() ? Author.Guess.FromCounts : Author.Guess.NoneSoFar;
        // Typed list instead of a raw ArrayList, so the call to validateGuess() is type-checked.
        List<Author> authors = new ArrayList<Author>(list.size());
        for (String name : list) {
            authors.addAll(author(name, guess));
        }
        validateGuess(authors, i);
        return authors;
    }

    /**
     * Withdraws clue-based guesses when too many authors carry a positive guess.
     *
     * <p>If more than {@code i} authors have a positive guess, every author whose guess is
     * {@code FromClues} is reset to {@code None}; other guesses are left untouched.
     *
     * @param list authors to check; modified in place
     * @param i    maximum number of positive guesses considered plausible
     */
    private static void validateGuess(List<Author> list, int i) {
        int positives = 0;
        for (Author candidate : list) {
            if (candidate.getGuess().isPositive()) {
                positives++;
            }
        }
        if (positives <= i) {
            return;
        }
        for (Author candidate : list) {
            if (candidate.getGuess() == Author.Guess.FromClues) {
                candidate.setGuess(Author.Guess.None);
            }
        }
    }

    /**
     * Parses author groups that were separated by an ellipsis element.
     *
     * <p>Each group is split on commas. Authors get the guess {@code None}, except those of the
     * last group when that group has exactly {@code i} members, which get {@code FromClues}.
     *
     * @param strArr the groups, in their original order
     * @param i      expected number of authors from the unit
     * @return the parsed authors of all groups
     */
    private static List<Author> splitSubgroups(String[] strArr, int i) {
        // Typed list instead of a raw ArrayList, so the result matches the declared return type.
        List<Author> authors = new ArrayList<Author>();
        int lastGroup = strArr.length - 1;
        for (int group = 0; group <= lastGroup; group++) {
            List<String> names = tokenize(",", strArr[group]);
            Author.Guess guess = Author.Guess.None;
            if (group == lastGroup && names.size() == i) {
                guess = Author.Guess.FromClues;
            }
            for (String name : names) {
                authors.addAll(author(name, guess));
            }
        }
        return authors;
    }

    /**
     * Splits an author string when the number of authors is not known.
     *
     * <p>Several candidate splits are produced - first by single separators, then by pairing
     * up adjacent tokens - and each is scored by {@code C1Check}; the best-scoring candidate
     * is returned.
     *
     * @param str normalised author text
     * @return the best candidate, or an empty list when every candidate was empty
     */
    private static List<String> splitUnknown(String str) {
        C1Check best = new C1Check();
        for (String separator : Arrays.asList("\t", ";", ",", "[,;]")) {
            best.tokens(tokenize(separator, str));
        }
        for (String separator : Arrays.asList("\\s+", "(?<=[\\s;,]++)")) {
            best.tokens(pair(tokenize(separator, str)));
        }
        return best.parts;
    }

    /**
     * Splits a text on a regular expression and trims every resulting piece.
     *
     * <p>Follows {@link String#split(String)}: trailing empty pieces are dropped, other empty
     * pieces are kept.
     *
     * @param str  regular expression to split on
     * @param str2 text to split
     * @return a new, modifiable list of the trimmed pieces
     */
    public static List<String> tokenize(String str, String str2) {
        List<String> tokens = new ArrayList<String>();
        for (String piece : str2.split(str)) {
            tokens.add(piece.trim());
        }
        return tokens;
    }

    /**
     * Looks for a split of the author text that yields exactly {@code i} parts.
     *
     * <p>Tried in order: pairing adjacent tokens (two tokenisations), splitting on tabs (only
     * when the text has neither ";" nor ","), then splitting on ";", on "," and on either.
     *
     * @param str normalised author text
     * @param i   required number of parts
     * @return the first split with {@code i} parts, or an empty list when none qualifies
     */
    private static List<String> splitExpected(String str, int i) {
        for (String separator : Arrays.asList("\\s+", "(?<=[\\s;,]++)")) {
            List<String> paired = pair(tokenize(separator, str));
            if (paired.size() == i) {
                return paired;
            }
        }

        boolean hasListSeparator = str.contains(";") || str.contains(",");
        if (!hasListSeparator) {
            List<String> byTab = tokenize("\t+", str);
            if (byTab.size() == i) {
                return byTab;
            }
        }

        for (String separator : Arrays.asList(";", ",", "[;,]")) {
            List<String> parts = tokenize(separator, str);
            if (parts.size() == i) {
                return parts;
            }
        }
        return Collections.emptyList();
    }

    /**
     * Joins adjacent elements two by two with a single space: {@code [a, b, c, d]} becomes
     * {@code ["a b", "c d"]}.
     *
     * @param list elements to pair up
     * @return the pairs, or an empty list when the number of elements is odd
     */
    private static List<String> pair(List<String> list) {
        if (list.size() % 2 != 0) {
            return Collections.emptyList();
        }
        // Typed and pre-sized instead of a raw ArrayList.
        List<String> pairs = new ArrayList<String>(list.size() / 2);
        Iterator<String> it = list.iterator();
        while (it.hasNext()) {
            String first = it.next();
            String second = it.next();
            pairs.add(first + " " + second);
        }
        return pairs;
    }

    /**
     * Cleans a raw author string before it is split into individual authors.
     *
     * <p>The replacements run in the order written and later ones rely on the output of
     * earlier ones. In order, the chain:
     * <ol>
     *   <li>replaces line breaks with spaces;</li>
     *   <li>rewrites a parenthesised "m.in." / "m. inn." remark as
     *       "text before, ..., text inside the parentheses";</li>
     *   <li>replaces square-bracketed text that does not directly follow a letter with a space;</li>
     *   <li>replaces parenthesised text with a space;</li>
     *   <li>removes titles and degrees (adiunkt, prof, dr, hab, farm, mgr, inż, arch, lek med),
     *       case-insensitively, together with the separators that follow them;</li>
     *   <li>removes "et al", "i wsp", "i in", "i inni", "i zespół" style suffixes with the
     *       non-letters that follow;</li>
     *   <li>collapses a dash with surrounding whitespace into a plain hyphen;</li>
     *   <li>removes whitespace inside what looks like a letter-spaced word (an upper-case
     *       letter followed by single lower-case letters) - NOTE(review): intent inferred
     *       from the pattern, confirm with sample data;</li>
     *   <li>turns ".~" into "." before an initial and into ". " elsewhere;</li>
     *   <li>puts ", " in front of an ellipsis character that follows a letter;</li>
     *   <li>rewrites a "..." or ellipsis element next to a "," or ";" into a separate
     *       list element delimited by that same separator;</li>
     *   <li>cleans up separator runs and letterless fragments - NOTE(review): the exact
     *       intent of this pattern is not evident from the code, treat with care;</li>
     *   <li>drops a trailing "," or ";" together with any non-letters after it.</li>
     * </ol>
     * Finally the conjunctions "i", "AND", "and" and "&amp;", when surrounded by whitespace or
     * separators, are replaced with "; " if the text contains a semicolon and with ", "
     * otherwise, and the result is trimmed.
     *
     * @param str raw author text; must not be null
     * @return the normalised text
     */
    private static String normalize(String str) {
        String replaceAll = str.replaceAll("\\n", " ").replaceAll("(.*?)\\s*\\(m\\. ?inn?\\.(.*)\\).*", "$1, ..., $2").replaceAll("(?<!\\p{L})\\[[^]]*\\]", " ").replaceAll("\\([^)]*\\)", " ").replaceAll("(?i)(?<!\\p{L})(adiunkt|adi(\\.? ii st)?|prof(\\.? (nad)?zw(yczajny)?)?|dr|hab|farm|mgr|inż|arch|lek[., ]+med)([;,. ]+|$)", "").replaceAll("(?<!\\p{L})(et[. ]*all?|i[. ]+wsp|i[. ]+(in(ni?)?|zespół))\\P{L}*", "").replaceAll("\\s*\\p{Pd}\\s*", "-").replaceAll("(?<=[\\s-]\\p{Lu}( \\p{Ll}){0,20})\\s(?=\\p{Ll}(?!\\p{L}))", "").replaceAll("\\.~(\\p{Lu}\\.)", ".$1").replaceAll("\\.~", ". ").replaceAll("(?<=\\p{L})\\s*…+", ", …").replaceAll("([,;])[….,; ]*(\\.\\.\\.|…)[….,; ]*", "$1 …$1 ").replaceAll("(?:^|[\\s&&[^\t]]*([,;\\s]++))([^\\p{L}…&]*([,;\\s]|$))?", "$1").replaceAll("[;,]\\P{L}*$", "");
        return replaceAll.replaceAll("[\\s;,]+(i|AND|and|&)[\\s;,]+", replaceAll.contains(";") ? "; " : ", ").trim();
    }

    /**
     * Converts a journal publication: the common work data plus a journal hierarchy
     * (article level, with the journal and the volume as ancestors).
     *
     * @param publication publication to convert
     * @return the resulting element
     */
    private YElement convert(Publication publication) {
        logger.debug("Converting publication: {}", publication);
        YElement article = convertWork(publication, "articles:" + this.subsetId, publication.getTytul());
        article.addStructure(
                y.structure(
                        "bwmeta1.hierarchy-class.hierarchy_Journal",
                        "bwmeta1.level.hierarchy_Journal_Article",
                        publication.getPages(),
                        // The journal ancestor carries the journal's name and its ministry id.
                        y.ancestor(
                                "bwmeta1.level.hierarchy_Journal_Journal",
                                y.canonicalName(YLanguage.Undetermined, publication.getCzasopismo()),
                                y.id("http://sedno.ceon.pl/-/id-scheme/journal.ministry-id", publication.getCzasopismoMein()),
                                new YContributor[0]),
                        y.ancestor(
                                "bwmeta1.level.hierarchy_Journal_Volume",
                                y.canonicalName(YLanguage.Undetermined, publication.getVolume()),
                                new YContributor[0])));
        return article;
    }

    /**
     * Converts a book record: the common work data plus a book hierarchy.
     *
     * <p>The record is treated as a chapter when it has a chapter title, else as a part when
     * it has a volume, else as the book itself; the element's title and hierarchy level follow
     * that choice. Chapters and parts get the book as an ancestor (which carries the
     * publisher), and a chapter with a volume additionally gets the part as an ancestor.
     * For a plain book the publisher is attached to the element directly.
     *
     * @param book book record to convert
     * @return the resulting element
     */
    private YElement convert(Book book) {
        logger.debug("Converting book: {}", book);
        boolean hasChapter = StringUtils.isNotBlank(book.getTytulRozdzialu());
        boolean hasVolume = StringUtils.isNotBlank(book.getVolume());

        String title;
        String level;
        if (hasChapter) {
            title = book.getTytulRozdzialu();
            level = "bwmeta1.level.hierarchy_Book_Chapter";
        } else if (hasVolume) {
            title = book.getVolume();
            level = "bwmeta1.level.hierarchy_Book_Part";
        } else {
            title = book.getTytul();
            level = "bwmeta1.level.hierarchy_Book_Book";
        }

        YElement element = convertWork(book, "books:" + this.subsetId, title);
        YContributor publisher = y.contributor("publisher", true, y.canonicalName(YLanguage.Undetermined, book.getWydawca()), new YAttribute[0]);
        String pages = book.getPages();

        boolean belowBookLevel = hasChapter || hasVolume;
        YAncestor[] ancestors = new YAncestor[] {
            (YAncestor) ConverterUtils.when(belowBookLevel, y.ancestor("bwmeta1.level.hierarchy_Book_Book", y.canonicalName(YLanguage.Undetermined, book.getTytul()), publisher)),
            (YAncestor) ConverterUtils.when(hasChapter && hasVolume, y.ancestor("bwmeta1.level.hierarchy_Book_Part", y.canonicalName(YLanguage.Undetermined, book.getVolume()), new YContributor[0]))
        };
        element.addStructure(y.structure("bwmeta1.hierarchy-class.hierarchy_Book", level, pages, ancestors));

        if (!belowBookLevel) {
            element.addContributor(publisher);
        }
        return element;
    }

    /**
     * Adds three names to the contributor: the canonical name ("given family"), the
     * forenames and the surname. Missing parts are treated as empty strings.
     *
     * @param yContributor contributor receiving the names
     * @param author       source of the given and family name
     */
    private static void saveNames(YContributor yContributor, Author author) {
        String given = StringUtils.trimToEmpty(author.getGiven());
        String family = StringUtils.trimToEmpty(author.getFamily());
        // Trimmed again so that a missing part leaves no stray space.
        String canonical = StringUtils.trimToEmpty(given + " " + family);

        yContributor.addName(y.name(YLanguage.NoLinguisticContent, canonical, "canonical"));
        yContributor.addName(y.name(YLanguage.NoLinguisticContent, given, "forenames"));
        yContributor.addName(y.name(YLanguage.NoLinguisticContent, family, "surname"));
    }

    /**
     * Wraps a year in a "published" date.
     *
     * @param i the year of publication
     * @return the date object
     */
    private static YDate convertDate(int i) {
        // NOTE(review): the two zero arguments are presumably month and day left unspecified,
        // and the empty string a free-text form - confirm against YDate.
        YDate published = new YDate("published", i, 0, 0, "");
        return published;
    }

    // Loads the lines of COMMON_NAMES_RESOURCE (read as UTF-8) into FIRST_NAMES.
    // The stream is now closed after reading, and a missing resource is reported with its
    // name instead of surfacing as a bare NullPointerException during class initialisation.
    static {
        InputStream names = BWmetaConverter.class.getResourceAsStream(COMMON_NAMES_RESOURCE);
        try {
            if (names == null) {
                throw new FileNotFoundException("Classpath resource not found: " + COMMON_NAMES_RESOURCE);
            }
            FIRST_NAMES.addAll(IOUtils.readLines(names, "UTF-8"));
        } catch (IOException e) {
            throw new SednoSystemException(e);
        } finally {
            // Null-safe; close failures are deliberately ignored.
            IOUtils.closeQuietly(names);
        }
    }
}
