package pl.edu.icm.cermine.metadata.extraction.enhancers;

import com.google.common.collect.Sets;
import com.itextpdf.text.xml.xmp.PdfProperties;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import pl.edu.icm.cermine.metadata.model.DocumentAuthor;
import pl.edu.icm.cermine.metadata.model.DocumentMetadata;
import pl.edu.icm.cermine.structure.model.BxChunk;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.model.BxLine;
import pl.edu.icm.cermine.structure.model.BxPage;
import pl.edu.icm.cermine.structure.model.BxWord;
import pl.edu.icm.cermine.structure.model.BxZone;
import pl.edu.icm.cermine.structure.model.BxZoneLabel;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.13.jar:pl/edu/icm/cermine/metadata/extraction/enhancers/AffiliationGeometricEnhancer.class */
/**
 * Enhancer that extracts affiliation strings from the zones selected by
 * {@code filterZones} (the constructor registers {@link BxZoneLabel#MET_AFFILIATION}
 * as the searched label) and stores them in the {@link DocumentMetadata}.
 *
 * <p>Within each line, chunks are classified either as affiliation text or as an
 * affiliation index (a footnote-like marker such as {@code *}, {@code †} or a
 * number). A chunk counts as an index when it is one of the known index symbols,
 * when its vertical position/height deviates from the line average (e.g. a
 * superscript), or when it equals a reference already attached to an author and
 * is a short first word of its line. Every index starts a new affiliation.
 *
 * <p>The collected texts are then cleaned of trailing noise (e-mails, URLs,
 * dates, ...), sanity-checked, and assigned to authors either by index or, when
 * no index is known, to all authors.
 */
public class AffiliationGeometricEnhancer extends AbstractSimpleEnhancer {

    /** Lines containing any of these phrases are not affiliation text and are ignored. */
    private static final Pattern SKIPPED_LINE_PATTERN = Pattern.compile(
            ".*(Email|Correspondence|Contributed equally|Dated|This work|Electronic address|The authors|Index|Received|Draft):?.*",
            Pattern.CASE_INSENSITIVE);

    /** A line consisting of a single e-mail address. */
    private static final Pattern EMAIL_SIMPLE_LINE_PATTERN =
            Pattern.compile("\\S+@[^,\\s]+", Pattern.CASE_INSENSITIVE);

    /** A line consisting of a grouped e-mail address, e.g. {@code {a, b}@host}. */
    private static final Pattern EMAIL_LINE_PATTERN =
            Pattern.compile("[\\{\\[]([^,\\s]+, ?)+[^,\\s]+ ?[\\}\\]]?@\\S+", Pattern.CASE_INSENSITIVE);

    /** Every string accepted as a complete affiliation index. */
    private static final Pattern FULL_INDEX_PATTERN = Pattern.compile("\\d{1,2}|\\*|∗|⁎|†|‡|§|\\(..?\\)|\\{|¶|\\[..?\\]|\\+|\\||⊥|\\^|#|α|β|λ|ξ|ψ|[a-f]|¹|²|³");

    /** Single symbols that are treated as an index regardless of their geometry. */
    private static final Pattern SIMPLE_INDEX_PATTERN = Pattern.compile("\\*|∗|⁎|†|‡|§|\\{|¶|\\+|\\||⊥|\\^|#|α|β|λ|ξ|ψ|¹|²|³");

    /** Characters removed from a line/header before it is compared with the header set. */
    private static final Pattern NON_ALPHANUMERIC_PATTERN = Pattern.compile("[^0-9a-zA-Z]");

    /**
     * Noise removed from a collected affiliation, applied in this order; the text is
     * trimmed after every step.
     */
    private static final Pattern[] NOISE_PATTERNS = {
        Pattern.compile("[Cc]orresponding [Aa]uthor.*$"),
        Pattern.compile("[Cc]opyright is held by.*$"),
        Pattern.compile("Full list of author information.*$"),
        Pattern.compile("\\(?(January|February|March|April|May|June|July|August|September|October|November|December) .*$"),
        Pattern.compile(" and$"),
        Pattern.compile("\\S+@.*$"),
        Pattern.compile("\\(?[Ee]mails?:.*$"),
        Pattern.compile("\\(?[Ee]- *[Mm]ails?:.*$"),
        Pattern.compile("http://.*$"),
        Pattern.compile("www\\..*$"),
        Pattern.compile("Acknowledgements.*$"),
        Pattern.compile("[\\.,;]$"),
        Pattern.compile("^[-\\)]"),
    };

    /** A hyphen followed by a space between two lower-case letters (a word split across lines). */
    private static final Pattern HYPHENATION_PATTERN = Pattern.compile("(?<=[a-z])- (?=[a-z])");

    private static final Pattern HAS_UPPERCASE_PATTERN = Pattern.compile(".*[A-Z].*");
    private static final Pattern HAS_LOWERCASE_PATTERN = Pattern.compile(".*[a-z].*");

    /** An affiliation text that starts with a one-digit list number, e.g. {@code "1. Dept ..."}. */
    private static final Pattern NUMBERED_LIST_PATTERN = Pattern.compile("^[1-9]\\. .*");
    /** The list number at the very beginning of the text. */
    private static final Pattern LEADING_NUMBER_PATTERN = Pattern.compile("^[1-9]\\. *");
    /** Everything from the next list number up to the end of the text. */
    private static final Pattern NEXT_NUMBER_PATTERN = Pattern.compile(" *[1-9]\\. .*$");

    /** Prefix of the synthetic keys given to affiliations that have no index of their own. */
    private static final String SYNTHETIC_REF_PREFIX = "aff-";

    /** Combined deviation of y-position and height from the line mean above which a chunk is an index. */
    private static final double INDEX_GEOMETRY_TOLERANCE = 2.0d;

    private static final int MIN_AFFILIATION_LENGTH = 12;
    private static final int MAX_AFFILIATION_LENGTH = 500;
    /** A part of a numbered list is stored only when it is longer than this. */
    private static final int MIN_LIST_ITEM_LENGTH = 20;

    /** Normalised (lower-case, alphanumeric only) zone headers whose line is skipped. */
    private final Set<String> headers = Sets.newHashSet("authoraffiliations", "authordetails", "affiliations");

    /**
     * Accumulates the chunks of one page into a map from affiliation reference to
     * affiliation text.
     */
    private static class Processor {
        private static final Pattern NONAFFILIATION_PATTERN =
                Pattern.compile("Correspondence:.+|Contributed equally", Pattern.CASE_INSENSITIVE);

        /** Number used for the first synthetic reference; incremented for each further one. */
        private static final int FIRST_SYNTHETIC_INDEX = 100;

        private final Map<String, String> affiliations = new HashMap<String, String>();
        private final StringBuilder affiliationBuilder = new StringBuilder();
        private String affiliationRef = "";
        private int emptyIndex = FIRST_SYNTHETIC_INDEX;

        private Processor() {
        }

        /**
         * Closes the affiliation being built. It is stored when its text is not a known
         * non-affiliation phrase and its reference is either a valid index or empty (an
         * empty reference is replaced by a synthetic {@code "aff-N"} key). Nothing happens,
         * and the pending reference is kept, when no text has been collected yet.
         */
        private void endAffiliation() {
            if (this.affiliationBuilder.length() == 0) {
                return;
            }
            String text = this.affiliationBuilder.toString();
            boolean acceptableRef = this.affiliationRef.isEmpty() || isIndex(this.affiliationRef);
            if (acceptableRef && !NONAFFILIATION_PATTERN.matcher(text).matches()) {
                if (this.affiliationRef.isEmpty()) {
                    this.affiliationRef = SYNTHETIC_REF_PREFIX + this.emptyIndex;
                    this.emptyIndex++;
                }
                this.affiliations.put(this.affiliationRef, text);
            }
            this.affiliationBuilder.setLength(0);
            this.affiliationRef = "";
        }

        private boolean isIndex(String str) {
            return FULL_INDEX_PATTERN.matcher(str).matches();
        }

        /** Marks the end of a word by appending a separating space. */
        public void endWord() {
            this.affiliationBuilder.append(" ");
        }

        /** Marks the end of a zone; an affiliation never spans two zones. */
        public void endZone() {
            endAffiliation();
        }

        /** Appends a chunk of affiliation text to the affiliation being built. */
        public void addText(String str) {
            this.affiliationBuilder.append(str);
        }

        /** Closes the pending affiliation and returns everything collected so far. */
        public Map<String, String> fetchAffiliations() {
            endAffiliation();
            return this.affiliations;
        }

        /**
         * Registers an index chunk: closes the affiliation built so far and appends the
         * chunk to the reference of the next one. Consecutive index chunks are
         * concatenated because closing is a no-op while no text has been collected.
         */
        public void addAffIndex(String str) {
            endAffiliation();
            this.affiliationRef += str;
        }
    }

    public AffiliationGeometricEnhancer() {
        setSearchedZoneLabels(BxZoneLabel.MET_AFFILIATION);
    }

    /**
     * Replaces the set of zone headers whose line is skipped.
     *
     * <p>Each header is normalised in the same way as the line it is compared with
     * (lower-cased, non-alphanumeric characters removed), so that a header such as
     * {@code "Author Affiliations"} can actually match.
     *
     * @param collection the new headers
     */
    public void setHeaders(Collection<String> collection) {
        this.headers.clear();
        for (String header : collection) {
            this.headers.add(normalizeHeader(header));
        }
    }

    @Override
    protected Set<EnhancedField> getEnhancedFields() {
        return EnumSet.of(EnhancedField.AFFILIATION);
    }

    /**
     * Extracts the affiliations of every page returned by {@code filterPages} and
     * stores them in the metadata.
     *
     * @param bxDocument the document to read the zones from
     * @param documentMetadata the metadata to read author references from and to store
     *        the affiliations in
     * @return {@code true} when at least one collected affiliation passed the sanity checks
     */
    @Override
    public boolean enhanceMetadata(BxDocument bxDocument, DocumentMetadata documentMetadata) {
        Set<String> affiliationRefs = collectAffiliationRefs(documentMetadata);
        boolean enhanced = false;
        for (BxPage page : filterPages(bxDocument)) {
            Processor processor = new Processor();
            for (BxZone zone : filterZones(page)) {
                if (isCandidateZone(zone, page)) {
                    processZone(zone, processor, affiliationRefs);
                }
            }
            if (storeAffiliations(processor.fetchAffiliations(), documentMetadata, affiliationRefs)) {
                enhanced = true;
            }
        }
        return enhanced;
    }

    /** Lower-cases the text and removes everything that is not an ASCII letter or digit. */
    private static String normalizeHeader(String text) {
        return NON_ALPHANUMERIC_PATTERN.matcher(text.toLowerCase(Locale.ENGLISH)).replaceAll("");
    }

    /** Gathers the affiliation references of all authors present in the metadata. */
    private static Set<String> collectAffiliationRefs(DocumentMetadata documentMetadata) {
        Set<String> refs = new HashSet<String>();
        Iterator<DocumentAuthor> authors = documentMetadata.getAuthors().iterator();
        while (authors.hasNext()) {
            refs.addAll(authors.next().getAffiliationRefs());
        }
        return refs;
    }

    /**
     * Tells whether a zone should be processed. Only a zone that starts in the lower
     * half of the page and directly follows a zone whose text equals
     * {@code PdfProperties.KEYWORDS} is rejected.
     */
    private static boolean isCandidateZone(BxZone zone, BxPage page) {
        if (zone.getY() <= page.getHeight() / 2.0d) {
            return true;
        }
        if (!zone.hasPrev()) {
            return true;
        }
        // NOTE(review): the iText constant is presumably the literal "Keywords" that the
        // decompiler mapped onto a matching constant - confirm against the library version.
        return !zone.getPrev().toText().equals(PdfProperties.KEYWORDS);
    }

    /** Feeds every relevant line of the zone to the processor and then closes the zone. */
    private void processZone(BxZone zone, Processor processor, Set<String> affiliationRefs) {
        boolean firstLine = true;
        Iterator<BxLine> lines = zone.iterator();
        while (lines.hasNext()) {
            BxLine line = lines.next();
            String lineText = line.toText();
            if (firstLine) {
                firstLine = false;
                // A first line that is just a known header ("Affiliations", ...) carries
                // no affiliation text of its own.
                if (this.headers.contains(normalizeHeader(lineText))) {
                    continue;
                }
            }
            if (isSkippedLine(lineText)) {
                continue;
            }
            processLine(line, processor, affiliationRefs);
        }
        processor.endZone();
    }

    /** Tells whether the line holds a non-affiliation phrase or only an e-mail address. */
    private static boolean isSkippedLine(String lineText) {
        return SKIPPED_LINE_PATTERN.matcher(lineText).matches()
                || EMAIL_SIMPLE_LINE_PATTERN.matcher(lineText).matches()
                || EMAIL_LINE_PATTERN.matcher(lineText).matches();
    }

    /**
     * Passes the chunks of one line to the processor, each either as an affiliation
     * index or as plain text, closing every word with {@link Processor#endWord()}.
     */
    private static void processLine(BxLine line, Processor processor, Set<String> affiliationRefs) {
        // First pass: mean y-position and height of the chunks, the baseline against
        // which raised or smaller chunks are detected.
        double ySum = 0.0d;
        double heightSum = 0.0d;
        int chunkCount = 0;
        Iterator<BxWord> words = line.iterator();
        while (words.hasNext()) {
            Iterator<BxChunk> chunks = words.next().iterator();
            while (chunks.hasNext()) {
                BxChunk chunk = chunks.next();
                ySum += chunk.getY();
                heightSum += chunk.getHeight();
                chunkCount++;
            }
        }
        // With no chunks both means are NaN, but then the second pass has nothing to visit.
        double meanY = ySum / chunkCount;
        double meanHeight = heightSum / chunkCount;

        // Second pass: classify and forward every chunk.
        words = line.iterator();
        while (words.hasNext()) {
            Iterator<BxChunk> chunks = words.next().iterator();
            while (chunks.hasNext()) {
                BxChunk chunk = chunks.next();
                String chunkText = chunk.toText();
                if (isIndexChunk(chunk, chunkText, meanY, meanHeight, affiliationRefs)) {
                    processor.addAffIndex(chunkText);
                } else {
                    processor.addText(chunkText);
                }
            }
            processor.endWord();
        }
    }

    /**
     * Tells whether a chunk is an affiliation index: a known index symbol, a chunk whose
     * geometry deviates from the line mean, or a known author reference forming (part
     * of) a first word of fewer than three chunks.
     */
    private static boolean isIndexChunk(BxChunk chunk, String chunkText, double meanY, double meanHeight,
            Set<String> affiliationRefs) {
        double y = chunk.getY();
        double height = chunk.getHeight();
        if (SIMPLE_INDEX_PATTERN.matcher(chunkText).matches()) {
            return true;
        }
        if (Math.abs(y - meanY) + Math.abs(meanHeight - height) > INDEX_GEOMETRY_TOLERANCE) {
            return true;
        }
        return affiliationRefs.contains(chunkText)
                && chunk.getParent().childrenCount() < 3
                && chunk.getParent().equals(chunk.getParent().getParent().getFirstChild());
    }

    /**
     * Cleans the collected affiliations and stores the plausible ones in the metadata.
     *
     * @return {@code true} when at least one affiliation passed the sanity checks
     */
    private static boolean storeAffiliations(Map<String, String> affiliations, DocumentMetadata documentMetadata,
            Set<String> affiliationRefs) {
        boolean enhanced = false;
        for (Map.Entry<String, String> entry : affiliations.entrySet()) {
            String text = cleanAffiliation(entry.getValue());
            if (!isPlausibleAffiliation(text)) {
                continue;
            }
            String index = entry.getKey();
            if (index.startsWith(SYNTHETIC_REF_PREFIX)) {
                // Synthetic keys only keep the map entries apart; they are not real indexes.
                index = "";
            }
            if (!index.isEmpty()) {
                documentMetadata.setAffiliationByIndex(index, text);
            } else if (NUMBERED_LIST_PATTERN.matcher(text).matches()
                    && affiliationRefs.contains(text.substring(0, 1))) {
                storeNumberedAffiliations(text, documentMetadata, affiliationRefs);
            } else {
                documentMetadata.addAffiliationToAllAuthors(text);
            }
            enhanced = true;
        }
        return enhanced;
    }

    /** Strips the known noise from an affiliation and joins words hyphenated across lines. */
    private static String cleanAffiliation(String rawText) {
        String text = rawText.trim();
        for (Pattern noise : NOISE_PATTERNS) {
            text = noise.matcher(text).replaceFirst("").trim();
        }
        return HYPHENATION_PATTERN.matcher(text).replaceAll("");
    }

    /** Accepts texts of 12 to 500 characters containing both an upper- and a lower-case letter. */
    private static boolean isPlausibleAffiliation(String text) {
        return !text.isEmpty()
                && HAS_UPPERCASE_PATTERN.matcher(text).matches()
                && HAS_LOWERCASE_PATTERN.matcher(text).matches()
                && text.length() >= MIN_AFFILIATION_LENGTH
                && text.length() <= MAX_AFFILIATION_LENGTH;
    }

    /**
     * Splits a text of the form {@code "1. first 2. second ..."} into its items and stores
     * each under its one-digit number. The text is cut at a following number only when
     * that number is a known author reference; otherwise the rest is stored as one item.
     * Items of 20 characters or fewer are dropped.
     */
    private static void storeNumberedAffiliations(String text, DocumentMetadata documentMetadata,
            Set<String> affiliationRefs) {
        String remaining = text;
        while (!remaining.isEmpty()) {
            String index = remaining.substring(0, 1);
            String body = LEADING_NUMBER_PATTERN.matcher(remaining).replaceFirst("").trim();
            String firstItem = NEXT_NUMBER_PATTERN.matcher(body).replaceFirst("");
            String rest = body.substring(firstItem.length()).trim();
            if (rest.isEmpty() || !affiliationRefs.contains(rest.substring(0, 1))) {
                if (body.length() > MIN_LIST_ITEM_LENGTH) {
                    documentMetadata.setAffiliationByIndex(index, body);
                }
                remaining = "";
            } else {
                if (firstItem.length() > MIN_LIST_ITEM_LENGTH) {
                    documentMetadata.setAffiliationByIndex(index, firstItem);
                }
                remaining = rest;
            }
        }
    }
}
