package pl.edu.icm.cermine.content.headers;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.cli.HelpFormatter;
import pl.edu.icm.cermine.content.model.BxContentStructure;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.structure.model.BxBounds;
import pl.edu.icm.cermine.structure.model.BxChunk;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.model.BxLine;
import pl.edu.icm.cermine.structure.model.BxPage;
import pl.edu.icm.cermine.structure.model.BxWord;
import pl.edu.icm.cermine.structure.model.BxZone;
import pl.edu.icm.cermine.structure.model.BxZoneLabel;
import pl.edu.icm.cermine.tools.CountMap;
import pl.edu.icm.cermine.tools.statistics.Population;
import pl.edu.icm.cermine.tools.timeout.TimeoutRegister;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.10.jar:pl/edu/icm/cermine/content/headers/HeuristicContentHeadersExtractor.class */
/**
 * Selects header lines among the lines of body zones using layout statistics.
 *
 * <p>Candidate lines are chosen by their text shape, then filtered by z-scores of
 * line height, font index, distance to the previous line, indentation and width,
 * all computed over the lines of zones labelled {@code BODY_CONTENT} or
 * {@code GEN_BODY}. The surviving lines are clustered and used as headers when
 * the resulting {@link BxContentStructure} is assembled.
 */
public class HeuristicContentHeadersExtractor implements ContentHeadersExtractor {
    /** Maximum width z-score of a candidate in the first filtering pass. */
    private static final double CAND_MAX_LENGTH_ZSCORE = -0.1d;
    /** Maximum width z-score of a candidate in the second filtering pass. */
    private static final double CAND_MAX_LENGTH_ZSCORE_2 = 1.0d;
    /** Candidates whose height z-score is below this value are rejected. */
    private static final double CAND_MIN_HEIGHT_ZSCORE = -1.0d;
    /** Height z-score below which a line is not considered an outlier. */
    private static final double OUTL_HEIGHT_ZSCORE = 0.5d;
    /** Absolute font-index z-score below which a line is not considered an outlier. */
    private static final double OUTL_FONT_ZSCORE = 0.5d;
    /** Line-gap z-score below which a line is not considered an outlier. */
    private static final double OUTL_DIST_ZSCORE = 0.4d;
    /** Absolute indentation z-score below which a line is not considered an outlier. */
    private static final double OUTL_INDENT_ZSCORE = 0.5d;
    /** Two lines in the same font are similar when their heights differ by less than this. */
    private static final double MAX_HEIGHT_SIMILARITY = 1.0d;
    /** A candidate with more similar candidates than this is rejected with its whole group. */
    private static final int MAX_SIMILAR_LINES_COUNT = 50;
    /** Index of the last following line inspected when looking for an upper-case continuation. */
    private static final int MAX_HEADER_LINE_COUNT = 5;
    /** Number of distinct clusters (in order of first appearance) whose lines are kept. */
    private static final int MAX_HEADER_CLUSTERS = 3;
    /** Text of the synthetic header created for body lines that precede the first detected header. */
    private static final String PLACEHOLDER_HEADER_TEXT = "--";

    // All patterns are applied with Matcher.matches(), i.e. against the whole line text,
    // exactly as String.matches() would.
    private static final Pattern STARTS_WITH_CAPITAL = Pattern.compile("^[A-Z].*");
    private static final Pattern NUMBERED_HEADER = Pattern.compile("^[1-9].*[a-zA-Z].*");
    private static final Pattern LETTERED_HEADER = Pattern.compile("^[a-h]\\).*[a-zA-Z].*");
    private static final Pattern FIGURE_ABBREVIATED = Pattern.compile("fig\\.? .*");
    private static final Pattern FIGURE_FULL = Pattern.compile("figure .*");
    private static final Pattern TABLE = Pattern.compile("table .*");
    private static final Pattern FOUR_LETTER_RUN = Pattern.compile(".*[a-z][a-z][a-z][a-z].*");
    private static final Pattern TWO_LEADING_DIGITS = Pattern.compile("[0-9][0-9].*");

    private final SimpleHeadersClusterizer headersClusterizer = new SimpleHeadersClusterizer();
    private final HeaderLinesCompletener headerLinesCompletener = new HeaderLinesCompletener();

    /** Per-document observations gathered from every line of the body zones. */
    private static final class LineStatistics {
        final Population heights = new Population();
        final Population fontIndices = new Population();
        final Population lineGaps = new Population();
        final Population widths = new Population();
        final Population indents = new Population();
    }

    /**
     * Builds the content structure of a document by detecting its header lines.
     *
     * <p>Steps: gather statistics and the initial candidates (first lines of body zones
     * that look like headers); filter them; find fonts typical for the remaining
     * candidates and add every header-like body line in those fonts; filter again with a
     * looser width limit; drop groups of similar lines that are singletons or too large;
     * keep only lines from the first {@value #MAX_HEADER_CLUSTERS} clusters; finally assign
     * every remaining body line to the most recent header.
     *
     * @param bxDocument the document to analyse
     * @return the structure holding header lines and the content lines assigned to them
     * @throws AnalysisException declared by the implemented interface
     */
    @Override // pl.edu.icm.cermine.content.headers.ContentHeadersExtractor
    public BxContentStructure extractHeaders(BxDocument bxDocument) throws AnalysisException {
        LineStatistics stats = new LineStatistics();
        Set<BxLine> candidates = new HashSet<BxLine>();

        collectStatisticsAndCandidates(bxDocument, stats, candidates);
        removeRejectedCandidates(candidates, stats, CAND_MAX_LENGTH_ZSCORE);

        Set<String> headerFonts = findHeaderFonts(candidates, stats);
        addLinesInHeaderFonts(bxDocument, headerFonts, candidates);
        removeRejectedCandidates(candidates, stats, CAND_MAX_LENGTH_ZSCORE_2);

        removeUnusualSimilarityGroups(candidates);
        keepLeadingClusters(bxDocument, candidates);

        BxContentStructure structure = buildStructure(bxDocument, candidates);
        this.headerLinesCompletener.completeLines(structure);
        return structure;
    }

    /** Tells whether a zone is labelled as body content; a null label counts as non-body. */
    private boolean isBodyZone(BxZone zone) {
        BxZoneLabel label = zone.getLabel();
        return BxZoneLabel.BODY_CONTENT.equals(label) || BxZoneLabel.GEN_BODY.equals(label);
    }

    /** Records the statistics of every body line and collects zone-leading, header-like lines. */
    private void collectStatisticsAndCandidates(BxDocument document, LineStatistics stats, Set<BxLine> candidates) {
        for (BxPage page : document) {
            for (BxZone zone : page) {
                if (!isBodyZone(zone)) {
                    continue;
                }
                for (BxLine line : zone) {
                    stats.heights.addObservation(line.getHeight());
                    stats.widths.addObservation(line.getWidth());
                    stats.indents.addObservation(line.getX());
                    if (line.hasPrev()) {
                        double gap = line.getY() - line.getPrev().getY();
                        // Only positive gaps are recorded; non-positive ones are ignored.
                        if (gap > 0.0d) {
                            stats.lineGaps.addObservation(gap);
                        }
                    }
                    stats.fontIndices.addObservation(getFontIndex(line));
                    if (isFirstInZone(line) && looksLikeHeader(line)) {
                        candidates.add(line);
                    }
                    TimeoutRegister.get().check();
                }
            }
        }
    }

    /** Removes candidates rejected by {@link #shouldBeRemoved} or wider than the given z-score. */
    private void removeRejectedCandidates(Set<BxLine> candidates, LineStatistics stats, double maxWidthZScore) {
        Iterator<BxLine> it = candidates.iterator();
        while (it.hasNext()) {
            BxLine line = it.next();
            if (shouldBeRemoved(line, stats) || stats.widths.getZScore(line.getWidth()) > maxWidthZScore) {
                it.remove();
            }
        }
    }

    /** Returns fonts of the candidates whose font index is an outlier among body lines. */
    private Set<String> findHeaderFonts(Set<BxLine> candidates, LineStatistics stats) {
        Set<String> documentFonts = Collections.<String>emptySet();
        if (!candidates.isEmpty()) {
            documentFonts = candidates.iterator().next().getParent().getParent().getParent().getFontNames();
        }
        CountMap<String> fontCounts = new CountMap<String>();
        for (BxLine line : candidates) {
            fontCounts.add(line.getMostPopularFontName());
        }
        Set<String> headerFonts = new HashSet<String>();
        // NOTE(review): the argument 3 is presumably a minimum occurrence count - confirm against CountMap.
        for (Map.Entry<String, ?> entry : fontCounts.getSortedEntries(3)) {
            double fontZScore = stats.fontIndices.getZScore(getFontIndex(entry.getKey(), documentFonts));
            if (Math.abs(fontZScore) > OUTL_FONT_ZSCORE) {
                headerFonts.add(entry.getKey());
            }
            TimeoutRegister.get().check();
        }
        return headerFonts;
    }

    /** Adds every header-like body line whose most popular font is one of the header fonts. */
    private void addLinesInHeaderFonts(BxDocument document, Set<String> headerFonts, Set<BxLine> candidates) {
        for (BxPage page : document) {
            for (BxZone zone : page) {
                if (!isBodyZone(zone)) {
                    continue;
                }
                for (BxLine line : zone) {
                    if (looksLikeHeader(line) && headerFonts.contains(line.getMostPopularFontName())) {
                        candidates.add(line);
                    }
                }
            }
        }
    }

    /** Removes candidates that have no similar candidate or too many, together with their similar lines. */
    private void removeUnusualSimilarityGroups(Set<BxLine> candidates) {
        Set<BxLine> rejected = new HashSet<BxLine>();
        for (BxLine line : candidates) {
            int similarCount = 0;
            for (BxLine other : candidates) {
                if (!line.equals(other) && areSimilar(line, other)) {
                    similarCount++;
                }
            }
            if (similarCount == 0 || similarCount > MAX_SIMILAR_LINES_COUNT) {
                rejected.add(line);
                for (BxLine other : candidates) {
                    if (areSimilar(line, other)) {
                        rejected.add(other);
                    }
                }
            }
        }
        candidates.removeAll(rejected);
    }

    /**
     * Clusters the candidates in document order and removes those that do not belong to one
     * of the first {@value #MAX_HEADER_CLUSTERS} distinct clusters encountered.
     */
    private void keepLeadingClusters(BxDocument document, Set<BxLine> candidates) throws AnalysisException {
        List<BxLine> ordered = new ArrayList<BxLine>();
        // All zones are scanned here, not only body zones.
        for (BxPage page : document) {
            for (BxZone zone : page) {
                for (BxLine line : zone) {
                    if (candidates.contains(line)) {
                        ordered.add(line);
                    }
                }
            }
        }
        int[] clusterIds = this.headersClusterizer.clusterLines(ordered);
        Set<Integer> keptClusters = new HashSet<Integer>();
        for (int i = 0; i < clusterIds.length; i++) {
            Integer clusterId = Integer.valueOf(clusterIds[i]);
            if (keptClusters.size() < MAX_HEADER_CLUSTERS) {
                keptClusters.add(clusterId);
            }
            if (!keptClusters.contains(clusterId)) {
                candidates.remove(ordered.get(i));
            }
        }
    }

    /** Registers headers and assigns each remaining body line to the most recent header. */
    private BxContentStructure buildStructure(BxDocument document, Set<BxLine> headers) {
        BxContentStructure structure = new BxContentStructure();
        BxLine currentHeader = null;
        for (BxPage page : document) {
            for (BxZone zone : page) {
                if (!isBodyZone(zone)) {
                    continue;
                }
                for (BxLine line : zone) {
                    if (headers.contains(line)) {
                        structure.addFirstHeaderLine(page, line);
                        currentHeader = line;
                        continue;
                    }
                    if (currentHeader == null) {
                        // Content before any header is attached to a synthetic placeholder header.
                        currentHeader = new BxLine().addWord(
                                new BxWord().addChunk(new BxChunk(new BxBounds(), PLACEHOLDER_HEADER_TEXT)));
                        structure.addFirstHeaderLine(page, currentHeader);
                    }
                    structure.addContentLine(currentHeader, line);
                }
            }
        }
        return structure;
    }

    /** Returns the font index of the line's most popular font among its document's font names. */
    private double getFontIndex(BxLine bxLine) {
        return getFontIndex(bxLine.getMostPopularFontName(), bxLine.getParent().getParent().getParent().getFontNames());
    }

    /**
     * Returns the position of a font name in the naturally sorted list of font names.
     *
     * @param str the font name to look up
     * @param set all font names
     * @return the zero-based index as a double, or -1 when the name is not in the set
     */
    private double getFontIndex(String str, Set<String> set) {
        List<String> sortedNames = Lists.newArrayList(set);
        Collections.sort(sortedNames);
        return sortedNames.indexOf(str);
    }

    /** A line is first in its zone when it has no previous line or the previous line has another parent. */
    private boolean isFirstInZone(BxLine bxLine) {
        return !bxLine.hasPrev() || bxLine.getParent() != bxLine.getPrev().getParent();
    }

    /** Header-like text starts with A-Z, with a digit 1-9 followed by a letter, or with "a)".."h)" followed by a letter. */
    private boolean looksLikeHeader(BxLine bxLine) {
        String text = bxLine.toText();
        return STARTS_WITH_CAPITAL.matcher(text).matches()
                || NUMBERED_HEADER.matcher(text).matches()
                || LETTERED_HEADER.matcher(text).matches();
    }

    /** Any line containing '=' is treated as an equation. */
    private boolean looksLikeEquation(BxLine bxLine) {
        return bxLine.toText().contains("=");
    }

    /** Matches lines starting (case-insensitively) with "fig ", "fig. " or "figure ". */
    private boolean looksLikeFigure(BxLine bxLine) {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
        String lowered = bxLine.toText().toLowerCase(Locale.ROOT);
        return FIGURE_ABBREVIATED.matcher(lowered).matches() || FIGURE_FULL.matcher(lowered).matches();
    }

    /** Matches lines starting (case-insensitively) with "table ". */
    private boolean looksLikeTable(BxLine bxLine) {
        return TABLE.matcher(bxLine.toText().toLowerCase(Locale.ROOT)).matches();
    }

    /** True when strictly more than half of the characters of the line are letters. */
    private boolean containsMostlyLetters(BxLine bxLine) {
        String text = bxLine.toText();
        int letters = 0;
        for (int i = 0; i < text.length(); i++) {
            if (Character.isLetter(text.charAt(i))) {
                letters++;
            }
        }
        return 2 * letters > text.length();
    }

    /** True when the lower-cased line contains four consecutive letters a-z. */
    private boolean containsWord(BxLine bxLine) {
        return FOUR_LETTER_RUN.matcher(bxLine.toText().toLowerCase(Locale.ROOT)).matches();
    }

    /** True when the line starts with at least two digits. */
    private boolean startsWithLargeNumber(BxLine bxLine) {
        return TWO_LEADING_DIGITS.matcher(bxLine.toText()).matches();
    }

    /** Two lines are similar when they share the most popular font and have nearly equal heights. */
    private boolean areSimilar(BxLine bxLine, BxLine bxLine2) {
        return bxLine.getMostPopularFontName().equals(bxLine2.getMostPopularFontName())
                && Math.abs(bxLine.getHeight() - bxLine2.getHeight()) < MAX_HEIGHT_SIMILARITY;
    }

    /**
     * Decides whether a candidate line should be rejected as a header.
     *
     * <p>A line is rejected when it has no font name, is too low, looks like an equation,
     * figure or table caption, is not mostly letters, contains no four-letter run, or starts
     * with two digits. It is also rejected when none of its height, font, gap to the previous
     * line and indentation stands out from the body statistics. Otherwise the following lines
     * are inspected: the line is kept as soon as one of them starts with A-Z or when the
     * lines run out, and rejected once the inspected-line limit is reached without a match.
     *
     * @param line the candidate line
     * @param stats statistics of all body lines
     * @return true when the line should be removed from the candidates
     */
    private boolean shouldBeRemoved(BxLine line, LineStatistics stats) {
        if (line.getMostPopularFontName() == null) {
            return true;
        }
        double heightZScore = stats.heights.getZScore(line.getHeight());
        if (heightZScore < CAND_MIN_HEIGHT_ZSCORE
                || looksLikeEquation(line)
                || looksLikeFigure(line)
                || looksLikeTable(line)
                || !containsMostlyLetters(line)
                || !containsWord(line)
                || startsWithLargeNumber(line)) {
            return true;
        }
        if (heightZScore < OUTL_HEIGHT_ZSCORE
                && Math.abs(stats.fontIndices.getZScore(getFontIndex(line))) < OUTL_FONT_ZSCORE
                && (!line.hasPrev()
                        || stats.lineGaps.getZScore(line.getY() - line.getPrev().getY()) < OUTL_DIST_ZSCORE)
                && Math.abs(stats.indents.getZScore(line.getX())) < OUTL_INDENT_ZSCORE) {
            return true;
        }
        BxLine following = line;
        for (int inspected = 0; following.hasNext(); inspected++) {
            following = following.getNext();
            if (STARTS_WITH_CAPITAL.matcher(following.toText()).matches()) {
                return false;
            }
            if (inspected == MAX_HEADER_LINE_COUNT) {
                return true;
            }
        }
        return false;
    }
}
