package ws.palladian.extraction.location;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.core.Annotation;
import ws.palladian.core.ImmutableAnnotation;
import ws.palladian.core.Tagger;
import ws.palladian.extraction.entity.StringTagger;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:ws/palladian/extraction/location/EntityPreprocessingTagger.class */
/**
 * {@link Tagger} which post-processes the candidates produced by {@link StringTagger} with the help of a
 * "case dictionary". The dictionary assigns a ratio to tokens (referred to as "lc/uc ratio" in the log output);
 * tokens whose ratio exceeds the configured threshold are removed from candidates which do not occur within a
 * sentence, either by dropping the candidate or by cutting off its leading tokens.
 */
public class EntityPreprocessingTagger implements Tagger {
    private static final Logger LOGGER = LoggerFactory.getLogger(EntityPreprocessingTagger.class);

    /** The tagger delivering the raw candidates which are filtered and corrected by this class. */
    private static final Tagger TAGGER = StringTagger.INSTANCE;

    /** Matches a left context ending with letters, digits or commas followed by one whitespace character. */
    private static final Pattern WITHIN_SENTENCE_PATTERN = Pattern.compile(".*[A-Za-z0-9,]+\\s");

    /** Splits a value into tokens at every single whitespace character. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");

    /** Number of characters in front of an annotation which are inspected by {@link #isWithinSentence}. */
    private static final int LEFT_CONTEXT_LENGTH = 10;

    /** Token to ratio; only contains entries whose ratio reached the threshold given at construction. */
    private final Map<String, Double> caseDictionary;

    private final double lowercaseThreshold;

    /** Minimum token count for the token-based splitting; values of zero or less disable all splitting. */
    private final int longAnnotationSplit;

    /**
     * Creates a new tagger and reads the case dictionary from the given stream.
     *
     * @param inputStream Stream providing the case dictionary, see {@link #loadCaseDictionary(InputStream, double)}
     *            for the expected line format.
     * @param d The threshold for the lowercase ratio; it is used both for filtering the dictionary while loading
     *            and for all ratio comparisons made by this tagger.
     * @param i The minimum number of tokens an annotation needs for the token-based splitting in
     *            {@link #getLongAnnotationSplit(List, int)}; if zero or negative, {@link #getAnnotations(String)}
     *            does not add any split annotations.
     */
    public EntityPreprocessingTagger(InputStream inputStream, double d, int i) {
        this.caseDictionary = loadCaseDictionary(inputStream, d);
        this.lowercaseThreshold = d;
        this.longAnnotationSplit = i;
    }

    /**
     * Reads the case dictionary. Every line is split at tabs: the first column is the token, the ratio is
     * calculated by dividing the second column by the third one (presumably the lowercase and uppercase counts --
     * TODO confirm against the dictionary file). Lines with less than three columns or with non-numeric counts make
     * the line action throw.
     *
     * @param inputStream The stream to read from.
     * @param threshold Entries with a ratio below this value are not kept.
     * @return A map from token to ratio for all entries which reached the threshold.
     */
    private static final Map<String, Double> loadCaseDictionary(InputStream inputStream, final double threshold) {
        final Map<String, Double> dictionary = new HashMap<>();
        FileHelper.performActionOnEveryLine(inputStream, new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                String[] columns = line.split("\t");
                double ratio = Double.parseDouble(columns[1]) / Double.parseDouble(columns[2]);
                if (ratio >= threshold) {
                    dictionary.put(columns[0], ratio);
                }
            }
        });
        return dictionary;
    }

    /**
     * Tags the given text and corrects the candidates using the case dictionary.
     * <ol>
     * <li>Candidates are retrieved from the underlying tagger.</li>
     * <li>The values of all candidates occurring within a sentence and having a ratio not above the threshold are
     * collected. If there is none, the candidates are returned as they are (no correction, no splitting).</li>
     * <li>Candidates within a sentence, or with a value from the collected set, are kept unchanged.</li>
     * <li>Remaining single-token candidates are dropped if their ratio exceeds the threshold; remaining
     * multi-token candidates lose their leading tokens exceeding the threshold.</li>
     * <li>If splitting is enabled, the result of {@link #getLongAnnotationSplit(List, int)} is appended.</li>
     * </ol>
     *
     * @param str The text to tag.
     * @return The corrected annotations, followed by the additional split annotations.
     */
    @Override // ws.palladian.core.Tagger
    public List<Annotation> getAnnotations(String str) {
        List<? extends Annotation> annotations = TAGGER.getAnnotations(str);
        List<Annotation> result = new ArrayList<>();
        Set<String> inSentence = CollectionHelper.filterSet(getInSentenceCandidates(str, annotations),
                candidate -> getLowercaseRatio(candidate) <= this.lowercaseThreshold);
        if (inSentence.isEmpty()) {
            result.addAll(annotations);
            return result;
        }
        for (Annotation annotation : annotations) {
            String value = annotation.getValue();
            if (isWithinSentence(str, annotation)) {
                result.add(annotation);
                continue;
            }
            if (inSentence.contains(value)) {
                LOGGER.trace("Skip '{}', because it appears within a sentence", value);
                result.add(annotation);
                continue;
            }
            String[] tokens = WHITESPACE_PATTERN.split(value);
            if (tokens.length == 1) {
                double lowercaseRatio = getLowercaseRatio(value);
                if (lowercaseRatio > this.lowercaseThreshold) {
                    LOGGER.debug("Drop '{}' because of lc/uc ratio of {}", value, lowercaseRatio);
                } else {
                    result.add(annotation);
                }
            } else {
                addWithoutLeadingLowercaseTokens(annotation, tokens, inSentence, result);
            }
        }
        LOGGER.debug("Reduced from {} to {} with case dictionary", annotations.size(), result.size());
        if (this.longAnnotationSplit > 0) {
            List<Annotation> splitAnnotations = getLongAnnotationSplit(result, this.longAnnotationSplit);
            LOGGER.debug("Extracted additional {} annotations by splitting", splitAnnotations.size());
            result.addAll(splitAnnotations);
        }
        return result;
    }

    /**
     * Cuts off leading tokens whose ratio exceeds the threshold and adds the outcome to the result: nothing, if
     * all tokens were cut off; a shortened annotation with adjusted start position, if some were cut off; the
     * original annotation otherwise. Cutting stops at the first token not exceeding the threshold, or as soon as
     * the remaining value is one of the given in-sentence values.
     *
     * @param annotation The annotation to correct.
     * @param tokens The whitespace-separated tokens of the annotation's value.
     * @param inSentence Values which were seen within a sentence.
     * @param result The list to which the outcome is added.
     */
    private void addWithoutLeadingLowercaseTokens(Annotation annotation, String[] tokens, Set<String> inSentence,
            List<Annotation> result) {
        String value = annotation.getValue();
        LOGGER.trace("Start correcting '{}'", value);
        int cutOffset = 0;
        String remainder = value;
        for (String token : tokens) {
            double lowercaseRatio = getLowercaseRatio(token);
            if (lowercaseRatio <= this.lowercaseThreshold) {
                LOGGER.trace("Stop correcting '{}' at '{}' because of lc/uc ratio of {}",
                        new Object[] {value, remainder, lowercaseRatio});
                break;
            }
            // skip the token and the single separator character following it
            cutOffset += token.length() + 1;
            if (cutOffset >= value.length()) {
                break;
            }
            remainder = value.substring(cutOffset);
            if (inSentence.contains(remainder)) {
                LOGGER.trace("Stop correcting '{}' as '{}' is contained within sentence", value, remainder);
                break;
            }
        }
        if (cutOffset >= value.length()) {
            LOGGER.debug("Drop '{}' completely because of lc/uc ratio", value);
        } else if (cutOffset > 0) {
            LOGGER.debug("Correct '{}' to '{}' because of lc/uc ratios", value, remainder);
            result.add(new ImmutableAnnotation(annotation.getStartPosition() + cutOffset, remainder,
                    annotation.getTag()));
        } else {
            result.add(annotation);
        }
    }

    /**
     * Creates additional, smaller annotations from the given ones; the given list is not modified.
     * <ul>
     * <li>Annotations with at least {@code i} tokens: each run of consecutive tokens with a ratio below the
     * threshold becomes an annotation, if it is longer than one character (the trailing run additionally must
     * differ from the complete value).</li>
     * <li>Annotations containing a hyphen or an ampersand (after quote normalization), regardless of their length:
     * each trimmed part starting uppercase becomes an annotation.</li>
     * </ul>
     * A part which cannot be located within the annotation's original value is skipped, because no valid start
     * position can be determined for it. Positions refer to the first occurrence of the part within the value.
     *
     * @param list The annotations to split.
     * @param i The minimum number of tokens for the token-based splitting.
     * @return The additional annotations, tagged with {@link StringTagger#CANDIDATE_TAG}.
     */
    List<Annotation> getLongAnnotationSplit(List<Annotation> list, int i) {
        List<Annotation> splitAnnotations = new ArrayList<>();
        for (Annotation annotation : list) {
            String value = annotation.getValue();
            String[] tokens = WHITESPACE_PATTERN.split(value);
            if (tokens.length >= i) {
                List<String> tokenRun = new ArrayList<>();
                for (String token : tokens) {
                    if (getLowercaseRatio(token) < this.lowercaseThreshold) {
                        tokenRun.add(token);
                    } else if (!tokenRun.isEmpty()) {
                        String joined = StringUtils.join(tokenRun, " ");
                        if (joined.length() > 1) {
                            addSplitAnnotation(splitAnnotations, annotation, joined);
                        }
                        tokenRun.clear();
                    }
                }
                if (!tokenRun.isEmpty()) {
                    String joined = StringUtils.join(tokenRun, " ");
                    if (!joined.equals(value) && joined.length() > 1) {
                        addSplitAnnotation(splitAnnotations, annotation, joined);
                    }
                }
            }
            String normalized = StringHelper.normalizeQuotes(value);
            if (normalized.contains("-") || normalized.contains("&")) {
                for (String part : normalized.split("[-&]")) {
                    String trimmed = part.trim();
                    if (StringHelper.startsUppercase(trimmed)) {
                        addSplitAnnotation(splitAnnotations, annotation, trimmed);
                    }
                }
            }
        }
        return splitAnnotations;
    }

    /**
     * Adds a candidate annotation for the given part of the source annotation's value. The check is made on the
     * offset relative to the value: checking the absolute position would let a part which was not found (-1)
     * slip through with a position one character in front of the source annotation.
     *
     * @param target The list to add to.
     * @param source The annotation from which the part was taken.
     * @param part The text of the new annotation.
     */
    private static void addSplitAnnotation(List<Annotation> target, Annotation source, String part) {
        int relativeOffset = source.getValue().indexOf(part);
        if (relativeOffset < 0) {
            LOGGER.trace("Skip split '{}', because it cannot be located in '{}'", part, source.getValue());
            return;
        }
        target.add(new ImmutableAnnotation(source.getStartPosition() + relativeOffset, part,
                StringTagger.CANDIDATE_TAG));
    }

    /**
     * Collects the values of all annotations which occur within a sentence.
     *
     * @param text The text from which the annotations were extracted.
     * @param annotations The annotations to check.
     * @return The distinct values of the annotations for which {@link #isWithinSentence(String, Annotation)} holds.
     */
    private static Set<String> getInSentenceCandidates(String text, List<? extends Annotation> annotations) {
        Set<String> inSentence = new HashSet<>();
        for (Annotation annotation : annotations) {
            if (isWithinSentence(text, annotation)) {
                String value = annotation.getValue();
                LOGGER.trace("Add '{}' to in-sentence candidates", value);
                inSentence.add(value);
            }
        }
        return inSentence;
    }

    /**
     * Checks the (at most) ten characters in front of the annotation: the annotation counts as being within a
     * sentence if this context ends with at least one letter, digit or comma followed by exactly one whitespace
     * character. As the pattern's dot does not match line terminators, a line break in front of that ending makes
     * the check fail.
     *
     * @param text The text from which the annotation was extracted.
     * @param annotation The annotation to check.
     * @return <code>true</code> if the left context matches, <code>false</code> otherwise.
     */
    private static boolean isWithinSentence(String text, Annotation annotation) {
        int start = annotation.getStartPosition();
        String leftContext = text.substring(Math.max(0, start - LEFT_CONTEXT_LENGTH), start);
        return WITHIN_SENTENCE_PATTERN.matcher(leftContext).matches();
    }

    /**
     * Looks up the ratio for the lowercased token. The lowercasing uses {@link Locale#ROOT}, so that the lookup
     * does not depend on the default locale of the JVM.
     *
     * @param token The token to look up, in arbitrary case.
     * @return The ratio from the dictionary, or zero if the dictionary has no entry (which is also the case for
     *         entries that were below the threshold when loading).
     */
    private double getLowercaseRatio(String token) {
        Double ratio = this.caseDictionary.get(token.toLowerCase(Locale.ROOT));
        return ratio == null ? 0.0d : ratio.doubleValue();
    }

    /**
     * Lowercases all tokens of the given text whose ratio exceeds the threshold. For the dictionary lookup, a
     * period at the end of the last token is ignored (the period itself is kept in the result). Tokens are split
     * at single whitespace characters and joined by single spaces in the result.
     *
     * @param str The text to correct.
     * @return The text with corrected capitalization.
     */
    public String correctCapitalization(String str) {
        String[] tokens = WHITESPACE_PATTERN.split(str);
        StringBuilder corrected = new StringBuilder();
        for (int idx = 0; idx < tokens.length; idx++) {
            String token = tokens[idx];
            if (idx > 0) {
                corrected.append(" ");
            }
            String lookupToken = token;
            if (idx == tokens.length - 1 && token.endsWith(".")) {
                lookupToken = token.substring(0, token.length() - 1);
            }
            if (getLowercaseRatio(lookupToken) > this.lowercaseThreshold) {
                token = token.toLowerCase(Locale.ROOT);
            }
            corrected.append(token);
        }
        return corrected.toString();
    }
}
