package pl.edu.icm.yadda.tools.textcat;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.builder.EqualsBuilder;
import org.apache.commons.lang.builder.HashCodeBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.edu.icm.ceon.commons.CeonGeneralException;

/* loaded from: input_file:WEB-INF/lib/ceon-commons-0.2.2.jar:pl/edu/icm/yadda/tools/textcat/LanguageIdentifierBean.class */
/**
 * {@link LanguageIdentifier} implementation that classifies a text by building a
 * {@link Profile} from it and selecting the registered language profile with the
 * smallest {@code Profile.distance(...)} to it.
 *
 * <p>The list of available profiles is read from the {@code languages} property of the
 * classpath resource {@code pl/edu/icm/yadda/tools/textcat/profiles.properties}; each
 * profile itself is loaded from
 * {@code pl/edu/icm/yadda/tools/textcat/profile/<name>.txt}. A profile name is either a
 * bare language code ({@code "en"}) or a language code followed by a dot and a variant
 * ({@code "en.something"}), see {@link LangVariant}.</p>
 */
public class LanguageIdentifierBean implements LanguageIdentifier {
    private static final double UNCERTAINTY_THRESHOLD_INIT_VALUE = 0.05d;
    private static final Logger LOG = LoggerFactory.getLogger(LanguageIdentifierBean.class);
    private static final String LOG_WITH_OVERLAP = " with overlap ";
    private static final String LOG_IDENTIFIED = "Identified ";
    private static final String LANGUAGE_IDENTIFIER_BEAN_CREATED_WITH_FOLLOWING_PROFILES = "LanguageIdentifierBean created with following profiles: ";
    private static final String LOG_SEPARATOR = ", ";
    private static final String WILL_NOT_BE_CREATED_BECAUSE_ITS_LANGUAGE_DOES_NOT_BELONG_TO_SPECIFIED_LANGUAGE_SET = "] will not be created because its language does not belong to specified language set";
    private static final String PROFILE_S = "Profile(s) [";
    private static final String IS_NOT_AVAILABLE_FOR_LANGUAGE_IDENTIFICATION = "] is not available for language identification";
    private static final String LANGUAGE = "Language [";
    private static final String IS_NOT_VALID_ISO_639_1_LANGUAGE_CODE = "] is not valid ISO 639-1 language code";
    private static final String LANGUAGES_PROPERTY_NAME = "languages";
    private static final String CANNOT_FIND_RESOURCE = "Cannot find resource ";
    private static final String PROFILE_PROPERTIES = "pl/edu/icm/yadda/tools/textcat/profiles.properties";
    private static final String PROFILE_PREFIX = "pl/edu/icm/yadda/tools/textcat/profile/";
    private static final String PROFILE_SUFFIX = ".txt";

    /**
     * Upper bound used both as the initial "best distance" and as the normalisation
     * factor when turning a distance into an overlap ratio in {@link #findArgMin(Profile)}.
     */
    private static final int MAX_DISTANCE = 160000;

    /** Value returned by {@link #classify(String)} when no language could be identified. */
    public static final String LANG_NONE = "**";

    /** Registered profiles keyed by language/variant; replaced wholesale by {@link #makeProfilesMap(String[])}. */
    private Map<LangVariant, Profile> profiles = new HashMap<LangVariant, Profile>();

    /** Minimal overlap (1 - normalised distance) required to accept the best match. */
    private double uncertaintyThreshold = UNCERTAINTY_THRESHOLD_INIT_VALUE;

    /* loaded from: input_file:WEB-INF/lib/ceon-commons-0.2.2.jar:pl/edu/icm/yadda/tools/textcat/LanguageIdentifierBean$LangVariant.class */
    /**
     * A language code with an optional variant, written as {@code lang} or
     * {@code lang.variant}.
     */
    public static class LangVariant implements Serializable {
        private static final String LANG_VARIANT_SPLIT_PATTERN = "\\.";
        private static final long serialVersionUID = 4522179133152273109L;
        private final String lang;
        private final String variant;

        /**
         * Parses a {@code lang} or {@code lang.variant} string. Only the first dot
         * separates language from variant; anything after it belongs to the variant.
         *
         * @param str textual form of the language variant, must not be {@code null}
         * @throws CategorizationException if {@code str} is {@code null}
         */
        public LangVariant(String str) {
            if (null == str) {
                throw new CategorizationException("Null pointer exception");
            }
            // Limit 2: split at the first dot only and keep a trailing empty variant ("en." -> "en", "").
            String[] parts = str.split(LANG_VARIANT_SPLIT_PATTERN, 2);
            if (2 == parts.length) {
                this.lang = parts[0];
                this.variant = parts[1];
            } else {
                this.lang = str;
                this.variant = null;
            }
        }

        /**
         * Creates a language variant from already separated parts; no validation is performed.
         *
         * @param str  the language code
         * @param str2 the variant, or {@code null} if there is none
         */
        public LangVariant(String str, String str2) {
            this.lang = str;
            this.variant = str2;
        }

        /** @return the language part */
        public String getLang() {
            return this.lang;
        }

        /** @return the variant part, or {@code null} if none was given */
        public String getVariant() {
            return this.variant;
        }

        @Override
        public int hashCode() {
            return HashCodeBuilder.reflectionHashCode(this);
        }

        @Override
        public boolean equals(Object obj) {
            return EqualsBuilder.reflectionEquals(this, obj);
        }

        /** @return {@code lang} when there is no variant, otherwise {@code lang.variant} */
        @Override
        public String toString() {
            return null == this.variant ? this.lang : this.lang + '.' + this.variant;
        }
    }

    /**
     * Sets the minimal overlap a best match must reach to be accepted.
     *
     * @param d the new threshold; matches with a lower overlap are reported as unidentified
     */
    public void setUncertaintyThreshold(double d) {
        this.uncertaintyThreshold = d;
    }

    /** @return the minimal overlap a best match must reach to be accepted */
    public double getUncertaintyThreshold() {
        return this.uncertaintyThreshold;
    }

    /**
     * Creates an identifier using every profile listed in the profiles properties resource.
     *
     * @throws IOException          if the properties resource or a profile resource cannot be read
     * @throws CeonGeneralException propagated from language code normalisation
     */
    public LanguageIdentifierBean() throws IOException, CeonGeneralException {
        initialize(null);
    }

    /**
     * Creates an identifier restricted to the given languages.
     *
     * @param strArr language codes to register, or {@code null} to register all available profiles
     * @throws IOException          if the properties resource or a profile resource cannot be read
     * @throws CeonGeneralException propagated from language code normalisation
     */
    public LanguageIdentifierBean(String[] strArr) throws IOException, CeonGeneralException {
        initialize(strArr);
    }

    /**
     * Loads the list of available profiles, optionally narrows it to the requested
     * languages and loads the selected profiles.
     *
     * @param strArr requested language codes, or {@code null} for no restriction
     */
    private void initialize(String[] strArr) throws IOException, CeonGeneralException {
        InputStream propertiesStream = LanguageIdentifierBean.class.getClassLoader().getResourceAsStream(PROFILE_PROPERTIES);
        if (propertiesStream == null) {
            throw new IOException(CANNOT_FIND_RESOURCE + PROFILE_PROPERTIES);
        }
        Properties properties = new Properties();
        try {
            properties.load(propertiesStream);
        } finally {
            // Properties.load leaves the stream open; release it explicitly.
            propertiesStream.close();
        }
        String[] languages = StringUtils.split(properties.getProperty(LANGUAGES_PROPERTY_NAME), ' ');
        if (languages == null) {
            // StringUtils.split(null, ...) yields null; fail with a diagnosable error instead of an NPE later on.
            throw new IOException("Property '" + LANGUAGES_PROPERTY_NAME + "' is missing in " + PROFILE_PROPERTIES);
        }
        validateLanguages(languages);
        if (null != strArr) {
            languages = prepareLanguagesArray(strArr, languages);
        }
        makeProfilesMap(languages);
        Arrays.sort(languages);
        LOG.info(LANGUAGE_IDENTIFIER_BEAN_CREATED_WITH_FOLLOWING_PROFILES + StringUtils.join(languages, LOG_SEPARATOR));
    }

    /**
     * @return a new set with the language part of every registered profile
     *         (variants of the same language collapse into one entry)
     */
    @Override
    public Set<String> getRegisteredLanguages() {
        Set<String> languages = new HashSet<String>();
        for (LangVariant langVariant : this.profiles.keySet()) {
            languages.add(langVariant.getLang());
        }
        return languages;
    }

    /**
     * Identifies the language of a text.
     *
     * @param str the text to classify
     * @return the identified language code, or {@link #LANG_NONE} if none was identified
     */
    @Override
    public String classify(String str) {
        LangVariant classifyVariant = classifyVariant(str);
        return null == classifyVariant ? LANG_NONE : classifyVariant.getLang();
    }

    /**
     * Identifies the language variant of a text.
     *
     * @param str the text to classify
     * @return the best matching variant, or {@code null} if {@code str} is {@code null}
     *         or no profile matched well enough
     */
    @Override
    public LangVariant classifyVariant(String str) {
        if (null == str) {
            return null;
        }
        return findArgMin(new Profile(str));
    }

    /**
     * Identifies the language of a text, falling back to a default.
     *
     * @param str  the text to classify
     * @param str2 value returned when no language was identified
     * @return the identified language code, or {@code str2}
     */
    @Override
    public String classify(String str, String str2) {
        String classify = classify(str);
        return LANG_NONE.equals(classify) ? str2 : classify;
    }

    /**
     * Checks that the language part of every profile name passes
     * {@code LanguagesIso639_1.isValid}.
     *
     * @param strArr profile names ({@code lang} or {@code lang.variant})
     * @throws CategorizationException on the first name whose language part is rejected
     */
    protected void validateLanguages(String[] strArr) {
        for (String str : strArr) {
            String lang = new LangVariant(str).getLang();
            if (!LanguagesIso639_1.isValid(lang)) {
                throw new CategorizationException("[" + lang + IS_NOT_VALID_ISO_639_1_LANGUAGE_CODE);
            }
        }
    }

    /**
     * Narrows the available profile names to those whose language was requested.
     * Profiles of languages that were not requested are skipped and reported at INFO level.
     *
     * @param strArr  requested language codes
     * @param strArr2 available profile names ({@code lang} or {@code lang.variant})
     * @return the profile names to load, in no particular order
     * @throws CeonGeneralException propagated from language code normalisation
     */
    protected String[] prepareLanguagesArray(String[] strArr, String[] strArr2) throws CeonGeneralException {
        Map<String, List<String>> profilesByLanguage = new HashMap<String, List<String>>();
        checkAndNormalize(strArr2, profilesByLanguage);
        Set<String> requestedLanguages = new HashSet<String>();
        checkAndNormalize(strArr, profilesByLanguage, requestedLanguages);
        Set<String> selectedProfiles = new HashSet<String>();
        for (Map.Entry<String, List<String>> entry : profilesByLanguage.entrySet()) {
            List<String> profileNames = entry.getValue();
            if (requestedLanguages.contains(entry.getKey())) {
                selectedProfiles.addAll(profileNames);
            } else {
                // The map is local and not used after this loop, so the entry is simply skipped;
                // removing it here would invalidate the iterator (ConcurrentModificationException).
                LOG.info(PROFILE_S + StringUtils.join(profileNames, LOG_SEPARATOR) + WILL_NOT_BE_CREATED_BECAUSE_ITS_LANGUAGE_DOES_NOT_BELONG_TO_SPECIFIED_LANGUAGE_SET);
            }
        }
        // Size the result from the selected set only; an oversized array would be padded with nulls.
        return selectedProfiles.toArray(new String[0]);
    }

    /**
     * Normalises the requested language codes via {@code LanguagesIso639_1.checkAndNormalize},
     * collects them into {@code set} and logs a warning for each one that has no profile in {@code map}.
     *
     * @param strArr requested language codes
     * @param map    available profile names keyed by normalised language code (read only here)
     * @param set    receives the normalised requested language codes
     * @throws CeonGeneralException propagated from {@code LanguagesIso639_1.checkAndNormalize}
     */
    protected void checkAndNormalize(String[] strArr, Map<String, List<String>> map, Set<String> set) throws CeonGeneralException {
        for (String str : strArr) {
            String normalized = LanguagesIso639_1.checkAndNormalize(str);
            set.add(normalized);
            if (!map.containsKey(normalized)) {
                LOG.warn(LANGUAGE + normalized + IS_NOT_AVAILABLE_FOR_LANGUAGE_IDENTIFICATION);
            }
        }
    }

    /**
     * Groups profile names by the normalised form of their language part.
     *
     * @param strArr profile names ({@code lang} or {@code lang.variant})
     * @param map    receives, per normalised language code, the list of profile names for it
     * @throws CeonGeneralException propagated from {@code LanguagesIso639_1.checkAndNormalize}
     */
    protected void checkAndNormalize(String[] strArr, Map<String, List<String>> map) throws CeonGeneralException {
        for (String str : strArr) {
            String normalized = LanguagesIso639_1.checkAndNormalize(new LangVariant(str).getLang());
            List<String> profileNames = map.get(normalized);
            if (null == profileNames) {
                profileNames = new ArrayList<String>();
                map.put(normalized, profileNames);
            }
            profileNames.add(str);
        }
    }

    /**
     * Replaces the registered profiles with profiles loaded from the classpath, one per
     * given name. The resource name is the lower-cased profile name between the profile
     * prefix and the {@code .txt} suffix.
     *
     * @param strArr profile names to load
     * @throws IOException if a profile resource is missing or cannot be read
     */
    protected void makeProfilesMap(String[] strArr) throws IOException {
        this.profiles = new HashMap<LangVariant, Profile>();
        for (String str : strArr) {
            LangVariant langVariant = new LangVariant(str);
            String resourceName = PROFILE_PREFIX + str.toLowerCase(Locale.ENGLISH) + PROFILE_SUFFIX;
            InputStream profileStream = LanguageIdentifierBean.class.getClassLoader().getResourceAsStream(resourceName);
            if (profileStream == null) {
                throw new IOException(CANNOT_FIND_RESOURCE + resourceName);
            }
            try {
                this.profiles.put(langVariant, new Profile(profileStream));
            } finally {
                // NOTE(review): assumes Profile(InputStream) reads the stream fully in its
                // constructor and does not keep it for later use - confirm against Profile.
                profileStream.close();
            }
        }
    }

    /**
     * Finds the registered profile with the smallest distance to the given profile.
     *
     * <p>Only distances strictly below {@code MAX_DISTANCE} are considered. The winning
     * distance is converted to an overlap {@code 1 - distance / MAX_DISTANCE}; if that
     * overlap is below the uncertainty threshold the result is discarded.</p>
     *
     * @param profile profile of the text being classified
     * @return the best matching language variant, or {@code null} if there is no
     *         sufficiently close match
     */
    protected LangVariant findArgMin(Profile profile) {
        LangVariant best = null;
        int bestDistance = MAX_DISTANCE;
        for (Map.Entry<LangVariant, Profile> entry : this.profiles.entrySet()) {
            int distance = entry.getValue().distance(profile);
            if (bestDistance > distance) {
                bestDistance = distance;
                best = entry.getKey();
            }
        }
        double overlap = 1.0d - ((1.0d * bestDistance) / MAX_DISTANCE);
        LOG.debug(LOG_IDENTIFIED + best + LOG_WITH_OVERLAP + overlap);
        if (overlap < this.uncertaintyThreshold) {
            best = null;
        }
        return best;
    }
}
