package pl.edu.icm.yadda.tools.textcat;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.PropertyAccessor;
import pl.edu.icm.yadda.common.YaddaException;

/* loaded from: input_file:WEB-INF/lib/yadda-common-3.4.0-SNAPSHOT.jar:pl/edu/icm/yadda/tools/textcat/LanguageIdentifierBean.class */
public class LanguageIdentifierBean implements ILanguageIdentifier {
    private static final Logger log = LoggerFactory.getLogger(LanguageIdentifierBean.class);
    private static final String PROFILE_PROPERTIES = "pl/edu/icm/yadda/tools/textcat/profiles.properties";
    private static final String PROFILE_PREFIX = "pl/edu/icm/yadda/tools/textcat/profile/";
    private static final String PROFILE_SUFFIX = ".txt";
    public static final String LANG_NONE = "**";
    private Map<LangVariant, Profile> profiles = new HashMap();
    private double uncertaintyThreshold = 0.05d;

    /* loaded from: input_file:WEB-INF/lib/yadda-common-3.4.0-SNAPSHOT.jar:pl/edu/icm/yadda/tools/textcat/LanguageIdentifierBean$LangVariant.class */
    /**
     * Value object pairing a language code with an optional variant name.
     *
     * <p>The textual form is either {@code "lang"} or {@code "lang.variant"}; only the
     * first dot separates the two parts, so the variant itself may contain dots.
     * Instances define {@link #equals(Object)} and {@link #hashCode()} over both parts.
     */
    public static class LangVariant implements Serializable {
        private static final long serialVersionUID = 4522179133152273109L;
        private String lang;
        private String variant;

        /**
         * Parses a {@code "lang"} or {@code "lang.variant"} string.
         *
         * @param str textual form; must not be {@code null}
         * @throws NullPointerException if {@code str} is {@code null}
         */
        public LangVariant(String str) {
            if (str == null) {
                throw new NullPointerException();
            }
            // Limit 2: everything after the first dot belongs to the variant.
            String[] parts = str.split("\\.", 2);
            if (parts.length == 2) {
                this.lang = parts[0];
                this.variant = parts[1];
            } else {
                this.lang = str;
                this.variant = null;
            }
        }

        /**
         * Creates an instance from already separated parts; neither value is validated.
         *
         * @param str  language code (stored as given, may be {@code null})
         * @param str2 variant name (stored as given, may be {@code null})
         */
        public LangVariant(String str, String str2) {
            this.lang = str;
            this.variant = str2;
        }

        /** @return the language part */
        public String getLang() {
            return lang;
        }

        /** @return the variant part, or {@code null} when none was given */
        public String getVariant() {
            return variant;
        }

        @Override
        public int hashCode() {
            int result = 1;
            result = 31 * result + (lang == null ? 0 : lang.hashCode());
            result = 31 * result + (variant == null ? 0 : variant.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (obj == this) {
                return true;
            }
            if (obj == null || obj.getClass() != getClass()) {
                return false;
            }
            LangVariant other = (LangVariant) obj;
            return sameText(lang, other.lang) && sameText(variant, other.variant);
        }

        /** Null-safe string equality: two nulls are equal, null never equals non-null. */
        private static boolean sameText(String left, String right) {
            if (left == null) {
                return right == null;
            }
            return left.equals(right);
        }

        /** @return {@code lang} alone when there is no variant, otherwise {@code lang.variant} */
        @Override
        public String toString() {
            if (variant == null) {
                return lang;
            }
            return lang + '.' + variant;
        }
    }

    /**
     * Sets the minimum overlap score a classification must reach to be reported.
     *
     * @param d new threshold; stored as given, without range checking
     */
    public void setUncertaintyThreshold(double d) {
        uncertaintyThreshold = d;
    }

    /**
     * @return the overlap score below which a classification result is discarded
     */
    public double getUncertaintyThreshold() {
        return uncertaintyThreshold;
    }

    /**
     * Creates an identifier without a language filter: initialization is invoked
     * with {@code null} as the requested language set.
     *
     * @throws IOException    propagated from initialization
     * @throws YaddaException propagated from initialization
     */
    public LanguageIdentifierBean() throws IOException, YaddaException {
        this.initialize(null);
    }

    /**
     * Creates an identifier, passing the given language codes on to initialization.
     *
     * @param strArr requested language codes; forwarded unchanged (may be {@code null})
     * @throws IOException    propagated from initialization
     * @throws YaddaException propagated from initialization
     */
    public LanguageIdentifierBean(String[] strArr) throws IOException, YaddaException {
        this.initialize(strArr);
    }

    /**
     * Loads the language profiles this bean classifies against.
     *
     * <p>Profile names are read from the {@code languages} property of the bundled
     * {@code profiles.properties} resource. Every name must carry a valid ISO 639-1
     * language code. When {@code strArr} is non-null, only profiles whose normalized
     * language is among the requested ones are kept. Each remaining profile is read
     * from a classpath resource and stored in {@code profiles}.
     *
     * @param strArr requested language codes, or {@code null} to load every listed profile
     * @throws IOException    if the properties resource or a profile resource cannot be found or read
     * @throws YaddaException if the {@code languages} property is missing or a listed
     *                        language is not a valid ISO 639-1 code
     */
    private void initialize(String[] strArr) throws IOException, YaddaException {
        String[] profileNames = readListedProfileNames();
        for (String profileName : profileNames) {
            String lang = new LangVariant(profileName).getLang();
            if (!LanguagesIso639_1.isValid(lang)) {
                throw new YaddaException("[" + lang + "] is not valid ISO 639-1 language code");
            }
        }
        if (strArr != null) {
            profileNames = selectProfilesForLanguages(profileNames, strArr);
        }
        this.profiles = new HashMap<LangVariant, Profile>();
        for (String profileName : profileNames) {
            LangVariant langVariant = new LangVariant(profileName);
            String resourceName = PROFILE_PREFIX + profileName.toLowerCase(Locale.ENGLISH) + PROFILE_SUFFIX;
            InputStream profileStream = LanguageIdentifierBean.class.getClassLoader().getResourceAsStream(resourceName);
            if (profileStream == null) {
                throw new IOException("Cannot find resource " + resourceName);
            }
            // NOTE(review): the profile stream is not closed here; confirm whether
            // Profile(InputStream) consumes and closes it before adding a close() call.
            this.profiles.put(langVariant, new Profile(profileStream));
        }
        // Sorted only to make the log line stable and readable.
        Arrays.sort(profileNames);
        log.info("LanguageIdentifierBean created with following profiles: " + StringUtils.join(profileNames, ", "));
    }

    /** Reads the space-separated profile names from the {@code languages} property of the bundled properties resource. */
    private static String[] readListedProfileNames() throws IOException, YaddaException {
        InputStream propertiesStream = LanguageIdentifierBean.class.getClassLoader().getResourceAsStream(PROFILE_PROPERTIES);
        if (propertiesStream == null) {
            throw new IOException("Cannot find resource " + PROFILE_PROPERTIES);
        }
        Properties properties = new Properties();
        try {
            properties.load(propertiesStream);
        } finally {
            // Properties.load leaves the stream open, so release it explicitly.
            propertiesStream.close();
        }
        String languages = properties.getProperty("languages");
        if (languages == null) {
            // StringUtils.split(null, ...) returns null, which previously surfaced as a bare NPE.
            throw new YaddaException("Property [languages] is missing in resource " + PROFILE_PROPERTIES);
        }
        return StringUtils.split(languages, ' ');
    }

    /** Keeps only the profile names whose normalized language is among the requested languages, logging what is skipped or unavailable. */
    private static String[] selectProfilesForLanguages(String[] availableProfiles, String[] requestedLanguages) throws YaddaException {
        Map<String, List<String>> profilesByLanguage = new HashMap<String, List<String>>();
        for (String profileName : availableProfiles) {
            String language = LanguagesIso639_1.checkAndNormalize(new LangVariant(profileName).getLang());
            List<String> names = profilesByLanguage.get(language);
            if (names == null) {
                names = new ArrayList<String>();
                profilesByLanguage.put(language, names);
            }
            names.add(profileName);
        }
        Set<String> requested = new HashSet<String>();
        for (String requestedLanguage : requestedLanguages) {
            String language = LanguagesIso639_1.checkAndNormalize(requestedLanguage);
            requested.add(language);
            if (!profilesByLanguage.containsKey(language)) {
                log.warn("Language [" + language + "] is not available for language identification");
            }
        }
        Set<String> selected = new HashSet<String>();
        for (Map.Entry<String, List<String>> entry : profilesByLanguage.entrySet()) {
            if (requested.contains(entry.getKey())) {
                selected.addAll(entry.getValue());
            } else {
                log.info("Profile(s) [" + StringUtils.join(entry.getValue(), ", ") + "] will not be created because its language does not belong to specified language set");
            }
        }
        // Size the array from the selection itself rather than from an unrelated collection.
        return selected.toArray(new String[selected.size()]);
    }

    /**
     * Collects the language part of every loaded profile key.
     *
     * @return a new set of language codes; variants of the same language collapse
     *         into a single entry
     */
    @Override // pl.edu.icm.yadda.tools.textcat.ILanguageIdentifier
    public Set<String> getRegisteredLanguages() {
        Set<String> languages = new HashSet<String>();
        for (LangVariant registered : this.profiles.keySet()) {
            languages.add(registered.getLang());
        }
        return languages;
    }

    /**
     * Identifies the language of the given text.
     *
     * @param str text to classify
     * @return the language of the best matching variant, or {@link #LANG_NONE} when
     *         variant classification yields {@code null}
     */
    @Override // pl.edu.icm.yadda.tools.textcat.ILanguageIdentifier
    public String classify(String str) {
        LangVariant best = classifyVariant(str);
        if (best == null) {
            return LANG_NONE;
        }
        return best.getLang();
    }

    /**
     * Finds the loaded profile with the smallest distance to the profile of the given text.
     *
     * <p>The winning distance is turned into an overlap score
     * {@code 1 - distance / 160000}; a score below the uncertainty threshold
     * discards the result.
     *
     * @param str text to classify
     * @return the best matching variant, or {@code null} when {@code str} is
     *         {@code null}, no profile has a distance below 160000, or the overlap
     *         is below the uncertainty threshold
     */
    @Override // pl.edu.icm.yadda.tools.textcat.ILanguageIdentifier
    public LangVariant classifyVariant(String str) {
        if (str == null) {
            return null;
        }
        Profile textProfile = new Profile(str);
        LangVariant best = null;
        // Starting bound and normalisation constant; presumably the largest
        // distance two profiles can have (confirm against Profile.distance).
        int bestDistance = 160000;
        for (Map.Entry<LangVariant, Profile> candidate : this.profiles.entrySet()) {
            int distance = candidate.getValue().distance(textProfile);
            if (distance < bestDistance) {
                bestDistance = distance;
                best = candidate.getKey();
            }
        }
        double overlap = 1.0d - (bestDistance / 160000.0d);
        log.debug("Identified " + best + " with overlap " + overlap);
        return overlap < this.uncertaintyThreshold ? null : best;
    }

    /**
     * Identifies the language of the given text, substituting a fallback when the
     * single-argument classification reports {@link #LANG_NONE}.
     *
     * @param str  text to classify
     * @param str2 value returned instead of {@link #LANG_NONE}
     * @return the identified language, or {@code str2} when none was identified
     */
    @Override // pl.edu.icm.yadda.tools.textcat.ILanguageIdentifier
    public String classify(String str, String str2) {
        String detected = classify(str);
        if (LANG_NONE.equals(detected)) {
            return str2;
        }
        return detected;
    }
}
