package pl.edu.icm.synat.content.categorization.lingpipe;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Required;
import pl.edu.icm.synat.common.ListingResult;
import pl.edu.icm.synat.content.categorization.CategorizationModule;
import pl.edu.icm.synat.content.categorization.corpus.CorpusQueryConditions;
import pl.edu.icm.synat.content.categorization.corpus.CorpusStorage;
import pl.edu.icm.synat.content.categorization.exception.CategorizationException;
import pl.edu.icm.synat.content.categorization.lingpipe.classifier.factory.ClassifierFactory;
import pl.edu.icm.synat.content.categorization.lingpipe.classifier.tfIdf.TfIdfClassifier;
import pl.edu.icm.synat.content.categorization.model.CategorizationResult;
import pl.edu.icm.synat.content.categorization.model.CorpusEntry;
import pl.edu.icm.synat.content.categorization.model.PublicationMetadataDocument;
import pl.edu.icm.synat.tools.language.identifier.impl.LanguageIdentifierBean;

/* loaded from: input_file:pl/edu/icm/synat/content/categorization/lingpipe/LingpipeCategorizationModule.class */
/**
 * {@link CategorizationModule} for {@link PublicationMetadataDocument}s that delegates to
 * per-language {@link TfIdfClassifier}s obtained from a {@link ClassifierFactory}.
 *
 * <p>The language of a document is taken from the document itself or, when it is blank,
 * detected from the document content with a {@link LanguageIdentifierBean}. Language codes
 * are trimmed and lower-cased before being used as classifier keys, for both training and
 * categorization, so that both paths address the same classifier.
 *
 * <p>{@link #setCorpusStorage(CorpusStorage)} and
 * {@link #setClassifierFactory(ClassifierFactory)} must be called before the module is used.
 */
public class LingpipeCategorizationModule implements CategorizationModule<PublicationMetadataDocument> {
    /** Pattern of {@link #BEGINNING_OF_TIME}; {@code HH} is the 24-hour field matching the literal. */
    private static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss.SS";
    /** Lower timestamp bound used to list every corpus entry. */
    private static final String BEGINNING_OF_TIME = "1979-07-12 00:00:00.00";
    private static final Logger LOG = LoggerFactory.getLogger(LingpipeCategorizationModule.class);

    private CorpusStorage corpusStorage = null;
    private final LanguageIdentifierBean languageIdentifierBean;
    private ClassifierFactory<PublicationMetadataDocument> classifierFactory;

    /**
     * Creates the module together with its own language identifier.
     *
     * @throws CategorizationException if the language identifier cannot be created
     */
    public LingpipeCategorizationModule() throws CategorizationException {
        try {
            this.languageIdentifierBean = new LanguageIdentifierBean();
        } catch (Exception e) {
            throw new CategorizationException("Error occurred while creating LingpipeCategorizationModule", e);
        }
    }

    /**
     * Categorizes a document with the classifier registered for the document's language.
     *
     * @param publicationMetadataDocument document to categorize
     * @return the classifier's result; when the language cannot be determined or no classifier
     *         exists for it, a result built from the (possibly {@code null}) language and a
     *         {@code null} second argument
     */
    public CategorizationResult categorize(PublicationMetadataDocument publicationMetadataDocument) {
        final String language = resolveLanguage(publicationMetadataDocument);
        if (language == null) {
            LOG.warn("Language of document could not be recognized (doc id:{})", publicationMetadataDocument.getId());
            return new CategorizationResult(language, (String) null);
        }
        TfIdfClassifier classifier = (TfIdfClassifier) this.classifierFactory.getClassifier2(language, true);
        if (classifier == null) {
            LOG.warn("Category classifier for language [{}] does not exist (docid:{})",
                    language, publicationMetadataDocument.getId());
            return new CategorizationResult(language, (String) null);
        }
        return classifier.categorize(publicationMetadataDocument);
    }

    /** Removes every corpus entry returned by {@link #listAllCorpusEntries()} from the corpus storage. */
    public void flushTrainingData() {
        for (String entryId : listAllCorpusEntries().getItems()) {
            this.corpusStorage.removeCorpusEntry(entryId);
        }
    }

    /**
     * Stores the given entries in the corpus storage and then re-trains the classifier of every
     * language present in the whole stored corpus (not only in {@code list}).
     *
     * @param list corpus entries to add to the training corpus
     */
    public void trainModule(List<CorpusEntry<PublicationMetadataDocument>> list) {
        for (CorpusEntry<PublicationMetadataDocument> corpusEntry : list) {
            this.corpusStorage.storeCorpusEntry(corpusEntry);
            // NOTE(review): the loaded entry is discarded; kept in case the storage relies on
            // this read-back - confirm whether it is still needed.
            this.corpusStorage.loadCorpusEntry(corpusEntry.getId());
        }
        Map<String, List<CorpusEntry<PublicationMetadataDocument>>> entriesByLanguage =
                listCorpusEntriesByLanguge(listAllCorpusEntries());
        for (Map.Entry<String, List<CorpusEntry<PublicationMetadataDocument>>> entry : entriesByLanguage.entrySet()) {
            TfIdfClassifier classifier = (TfIdfClassifier) this.classifierFactory.getClassifier2(entry.getKey(), true);
            if (classifier == null) {
                // Same guard as in categorize(): do not fail the whole training run on one language.
                LOG.warn("Category classifier for language [{}] does not exist; {} corpus entries not used for training",
                        entry.getKey(), entry.getValue().size());
                continue;
            }
            classifier.learn(entry.getValue());
        }
    }

    /**
     * Lists the ids of the corpus entries whose timestamp is not earlier than
     * {@link #BEGINNING_OF_TIME}, without a practical limit on the result size.
     *
     * @return listing of corpus entry ids as returned by the corpus storage
     */
    protected ListingResult<String> listAllCorpusEntries() {
        Date from = null;
        try {
            from = new SimpleDateFormat(DATE_FORMAT).parse(BEGINNING_OF_TIME);
        } catch (ParseException e) {
            // Cannot happen for the constant above; keep the previous fallback (null bound)
            // but make the failure visible together with its cause.
            LOG.warn("Could not parse lower timestamp bound [" + BEGINNING_OF_TIME + "]", e);
        }
        return this.corpusStorage.listCorpusEntries(new CorpusQueryConditions().withTimestampFrom(from), Integer.MAX_VALUE);
    }

    /**
     * Loads the listed corpus entries and groups them by normalized (trimmed, lower-cased)
     * language code. Entries that cannot be loaded, or whose language is neither declared nor
     * detectable, are skipped with a warning.
     *
     * @param listingResult listing of corpus entry ids
     * @return corpus entries grouped by language code; every entry with a resolvable language
     *         is contained in exactly one list
     */
    protected Map<String, List<CorpusEntry<PublicationMetadataDocument>>> listCorpusEntriesByLanguge(ListingResult<String> listingResult) {
        Map<String, List<CorpusEntry<PublicationMetadataDocument>>> entriesByLanguage =
                new HashMap<String, List<CorpusEntry<PublicationMetadataDocument>>>();
        for (String entryId : listingResult.getItems()) {
            CorpusEntry<PublicationMetadataDocument> corpusEntry = loadEntry(entryId);
            if (corpusEntry == null || corpusEntry.getDocument() == null) {
                LOG.warn("Corpus entry [{}] could not be loaded and is skipped", entryId);
                continue;
            }
            String language = resolveLanguage(corpusEntry.getDocument());
            if (language == null) {
                LOG.warn("Language of corpus entry [{}] could not be recognized; entry is skipped", entryId);
                continue;
            }
            List<CorpusEntry<PublicationMetadataDocument>> sameLanguage = entriesByLanguage.get(language);
            if (sameLanguage == null) {
                sameLanguage = new ArrayList<CorpusEntry<PublicationMetadataDocument>>();
                entriesByLanguage.put(language, sameLanguage);
            }
            // The entry is added for a new language as well; previously the first entry
            // of each language was silently dropped.
            sameLanguage.add(corpusEntry);
        }
        return entriesByLanguage;
    }

    /**
     * Detects the language of the given text.
     *
     * @param str text to analyse
     * @return detected language, or {@code null} when the identifier reports
     *         {@link LanguageIdentifierBean#LANG_NONE}
     */
    String classifyLanguage(String str) {
        String detected = this.languageIdentifierBean.classify(str);
        return LanguageIdentifierBean.LANG_NONE.equals(detected) ? null : detected;
    }

    /**
     * Returns the classifier key for a document: its declared language or, when that is blank,
     * the language detected from its content - trimmed and lower-cased.
     *
     * @return normalized language code, or {@code null} when no language could be determined
     */
    private String resolveLanguage(PublicationMetadataDocument document) {
        String language = document.getLanguage();
        if (StringUtils.isBlank(language)) {
            language = classifyLanguage(document.getContent());
        }
        return language == null ? null : language.trim().toLowerCase();
    }

    /** Loads a corpus entry and views it as an entry holding a {@link PublicationMetadataDocument}. */
    @SuppressWarnings("unchecked")
    private CorpusEntry<PublicationMetadataDocument> loadEntry(String entryId) {
        CorpusEntry<?> loaded = this.corpusStorage.loadCorpusEntry(entryId);
        // Unchecked: this module only ever stores PublicationMetadataDocument entries (see trainModule).
        return (CorpusEntry<PublicationMetadataDocument>) loaded;
    }

    /**
     * Sets the storage holding the training corpus.
     *
     * @param corpusStorage corpus storage to use
     */
    @Required
    public void setCorpusStorage(CorpusStorage corpusStorage) {
        this.corpusStorage = corpusStorage;
    }

    /**
     * Sets the classifier factory and initializes it immediately by calling {@code init()}.
     *
     * @param classifierFactory factory providing per-language classifiers
     */
    @Required
    public void setClassifierFactory(ClassifierFactory<PublicationMetadataDocument> classifierFactory) {
        this.classifierFactory = classifierFactory;
        this.classifierFactory.init();
    }
}
