package pl.edu.icm.cermine.metadata.extraction.enhancers;

import com.google.common.collect.Sets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import pl.edu.icm.cermine.metadata.model.DocumentMetadata;
import pl.edu.icm.cermine.structure.model.BxPage;
import pl.edu.icm.cermine.structure.model.BxZone;
import pl.edu.icm.cermine.structure.model.BxZoneLabel;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.13-SNAPSHOT.jar:pl/edu/icm/cermine/metadata/extraction/enhancers/TitleEnhancer.class */
/**
 * Enhancer that fills in the document title from zones labelled
 * {@link BxZoneLabel#MET_TITLE} on the first page.
 *
 * <p>Candidate zones are those returned by {@code filterZones} that are not a bare
 * article-type heading (e.g. "research article"), are not part of a
 * "sponsored document from" banner, and are not directly followed by a
 * "journal homepage" zone. The candidate whose first child is tallest is taken
 * as the core of the title and is then extended with adjacent zones whose first
 * child has (nearly) the same height.
 */
public class TitleEnhancer extends AbstractSimpleEnhancer {

    /** Lower-cased zone texts that denote an article type rather than a title. */
    private static final Set<String> TYPES = Sets.newHashSet("case report", "case study", "clinical study", "debate", "editorial", "forum", "full research paper", "methodology", "original article", "original research", "primary research", "research", "research article", "research paper", "review", "review article", "short article", "short paper", "study", "study protocol", "technical note");

    /** Lower-cased prefix of the banner zone that must not be treated as a title. */
    private static final String SPONSORED_PREFIX = "sponsored document from";

    /** Prefix (letters only, lower-cased) of the zone that follows a journal-name header. */
    private static final String JOURNAL_HOMEPAGE_PREFIX = "journalhomepage";

    /** Maximum difference in first-child height for two zones to be merged into one title. */
    private static final double HEIGHT_TOLERANCE = 0.5d;

    /** Orders zones by the height of their first child, tallest first. */
    private static final Comparator<BxZone> TALLEST_FIRST = new Comparator<BxZone>() {
        @Override
        public int compare(BxZone left, BxZone right) {
            return Double.compare(right.getChild(0).getHeight(), left.getChild(0).getHeight());
        }
    };

    public TitleEnhancer() {
        setSearchedZoneLabels(BxZoneLabel.MET_TITLE);
        setSearchedFirstPageOnly(true);
    }

    @Override
    protected Set<EnhancedField> getEnhancedFields() {
        return EnumSet.of(EnhancedField.TITLE);
    }

    /**
     * Finds the title on the given page and stores it in the metadata.
     *
     * @param bxPage page whose filtered zones are inspected
     * @param documentMetadata metadata object that receives the title
     * @return {@code true} if a non-blank title was set, {@code false} otherwise
     */
    @Override
    protected boolean enhanceMetadata(BxPage bxPage, DocumentMetadata documentMetadata) {
        List<BxZone> candidates = new ArrayList<BxZone>();
        for (BxZone zone : filterZones(bxPage)) {
            if (isTitleCandidate(zone)) {
                candidates.add(zone);
            }
        }
        if (candidates.isEmpty()) {
            return false;
        }
        // Collections.sort is stable, so among equally tall zones the first one found wins.
        Collections.sort(candidates, TALLEST_FIRST);

        BxZone core = candidates.get(0);
        double height = core.getChild(0).getHeight();

        // Trim before the emptiness check so a whitespace-only text is not stored as a title.
        String title = collectTitleText(core, height).trim().replaceAll("\n", " ");
        if (title.isEmpty()) {
            return false;
        }
        documentMetadata.setTitle(title);
        return true;
    }

    /** Decides whether a zone may be (part of) the title rather than a heading or banner. */
    private static boolean isTitleCandidate(BxZone zone) {
        // The first child's height is read later; presumably getChild(0) fails on a zone
        // without children, so such zones are skipped up front.
        if (zone.childrenCount() == 0) {
            return false;
        }
        String text = lowerCaseText(zone);
        if (TYPES.contains(text.trim()) || text.startsWith(SPONSORED_PREFIX)) {
            return false;
        }
        if (zone.hasPrev()) {
            BxZone previous = zone.getPrev();
            // A single-child "sponsored document from" zone right before this one marks it as banner content.
            if (previous.childrenCount() == 1 && lowerCaseText(previous).startsWith(SPONSORED_PREFIX)) {
                return false;
            }
        }
        if (zone.hasNext()) {
            String nextLetters = lowerCaseText(zone.getNext()).replaceAll("[^a-z]", "");
            // A zone directly followed by "journal homepage ..." is not accepted as a title.
            if (nextLetters.startsWith(JOURNAL_HOMEPAGE_PREFIX)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Rewinds from the core zone to the first adjacent title zone of the same height,
     * then concatenates the texts of it and all following zones that belong to the title.
     */
    private static String collectTitleText(BxZone core, double height) {
        BxZone current = core;
        while (current.hasPrev()
                && BxZoneLabel.MET_TITLE.equals(current.getPrev().getLabel())
                && hasSimilarHeight(current.getPrev(), height)) {
            current = current.getPrev();
        }

        StringBuilder text = new StringBuilder(current.toText());
        while (current.hasNext()) {
            BxZone next = current.getNext();
            if (!hasSimilarHeight(next, height)) {
                break;
            }
            // A following zone with another label is merged only if it has a single child
            // and uses the same fonts as the current zone.
            boolean labelledAsTitle = BxZoneLabel.MET_TITLE.equals(next.getLabel());
            if (!labelledAsTitle
                    && (next.childrenCount() != 1 || !next.getFontNames().equals(current.getFontNames()))) {
                break;
            }
            current = next;
            text.append(" ").append(current.toText());
        }
        return text.toString();
    }

    /** Tells whether the zone's first child is within {@link #HEIGHT_TOLERANCE} of the given height. */
    private static boolean hasSimilarHeight(BxZone zone, double height) {
        // A neighbouring zone without children has no height to compare, so it never matches.
        return zone.childrenCount() > 0
                && Math.abs(height - zone.getChild(0).getHeight()) < HEIGHT_TOLERANCE;
    }

    /** Returns the zone text lower-cased with the English locale. */
    private static String lowerCaseText(BxZone zone) {
        return zone.toText().toLowerCase(Locale.ENGLISH);
    }
}
