package pl.edu.icm.synat.content.categorization.lingpipe.classifier.tfIdf;

import com.aliasi.classify.Classification;
import com.aliasi.classify.Classifier;
import com.aliasi.classify.ScoredClassification;
import com.aliasi.classify.TfIdfClassifierTrainer;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenFeatureExtractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.Date;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.FastDateFormat;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import pl.edu.icm.synat.content.categorization.BaseCategorizationModule;
import pl.edu.icm.synat.content.categorization.exception.CategorizationException;
import pl.edu.icm.synat.content.categorization.lingpipe.classifier.properties.ClassifierProperties;
import pl.edu.icm.synat.content.categorization.model.CategorizationResult;
import pl.edu.icm.synat.content.categorization.model.CorpusEntry;
import pl.edu.icm.synat.content.categorization.model.PublicationMetadataDocument;

/* loaded from: input_file:pl/edu/icm/synat/content/categorization/lingpipe/classifier/tfIdf/TfIdfClassifier.class */
/**
 * Categorization module backed by a LingPipe TF-IDF classifier for a single language.
 *
 * <p>The compiled classifier and its trainer are stored as serialized files inside the
 * storage directory; file names are derived from the date held in the
 * {@link ClassifierProperties}. The compiled classifier is loaded lazily on first use and
 * cached in memory until the next call to {@link #learn(List)}.
 *
 * <p>Training is serialized on {@code trainerMutex}; loading, reading and invalidating the
 * cached classifier is serialized on {@code classifierMutex}. When both are needed,
 * {@code trainerMutex} is always acquired first.
 */
public class TfIdfClassifier implements BaseCategorizationModule<PublicationMetadataDocument> {
    private static final Log LOG = LogFactory.getLog(TfIdfClassifier.class);
    /** Category value reported when a document could not be categorized. */
    private static final String COULD_NOT_CATEGORIZE = null;
    private static final String CLOSE_ROUND_BRACKET_STRING = ")";
    private static final String FAILED_FILE = "] failed (file: ";
    private static final String DESERIALIZATION_OF_CLASSIFIER_FOR_LANGUAGE = "Deserialization of classifier for language [";
    private static final String DOES_NOT_EXIST = "] does not exist";
    private static final String SERIALIZED_CLASSIFIER_FOR_LANGUAGE = "Serialized classifier for language [";
    private static final String CLOSEBRACKET_STRING = "]";
    private static final String COULD_NOT_DELETE_FILE = "Could not delete file [";
    private static final String YYYY_MM_DD_HH_MM_SS_SSS = "yyyy-MM-dd HH:mm:ss.SSS";
    private static final String TRAINER = "trainer-";
    private static final String SERIALIZED = ".serialized";
    private static final String CLASSIFIER = "classifier-";
    private static final String SERIALIZATION_OF_CLASSIFIER_FAILED_LANGUAGE = "Serialization of classifier failed (language=";
    private static final String FILE = ", file=";
    private static final String CLASSIFIER_FOR_LANGUAGE = "Classifier for language [";
    private static final String LANGUAGE_NOT_SPECIFIED = "Language not specified";
    private static final String TF_IDF_CLASSIFIER_COULD_NOT_CLASSIFY_DOCUMENT_DOC_ID = "TfIdf classifier could not classify document (doc id:";
    private static final String CREATED_LAST_UPDATE = "] created (last update: ";
    private static final String TF_IDF_CLASSIFIER_FOR_LANGUAGE = "TfIdf classifier for language [";
    private final ClassifierProperties properties;
    private final File storageDir;
    /** Lazily loaded compiled classifier; guarded by {@code classifierMutex}. */
    private Classifier<CharSequence, ScoredClassification> classifier;
    private final Object classifierMutex = new Object();
    private final Object trainerMutex = new Object();

    /**
     * Creates a classifier module for the language named in the given properties.
     *
     * @param classifierProperties properties providing the language and the date of the last update
     * @param file directory in which the serialized classifier and trainer files are kept
     * @throws CategorizationException if the properties do not specify a language
     */
    public TfIdfClassifier(ClassifierProperties classifierProperties, File file) throws CategorizationException {
        if (StringUtils.isEmpty(classifierProperties.getLanguage())) {
            throw new CategorizationException(LANGUAGE_NOT_SPECIFIED);
        }
        this.properties = classifierProperties;
        this.storageDir = file;
        LOG.info(TF_IDF_CLASSIFIER_FOR_LANGUAGE + classifierProperties.getLanguage() + CREATED_LAST_UPDATE + (null != classifierProperties.getDate() ? FastDateFormat.getInstance(YYYY_MM_DD_HH_MM_SS_SSS).format(classifierProperties.getDate()) : null) + CLOSE_ROUND_BRACKET_STRING);
    }

    /**
     * Categorizes the content of the given document.
     *
     * @param publicationMetadataDocument document whose content is classified
     * @return a result carrying this module's language and the best category, or a
     *         {@code null} category when no classifier is available or the classifier
     *         reports no best category
     * @throws CategorizationException if the serialized classifier exists but cannot be loaded
     */
    public CategorizationResult categorize(PublicationMetadataDocument publicationMetadataDocument) {
        Classifier<CharSequence, ScoredClassification> loaded = getClassifier();
        if (null == loaded) {
            LOG.warn(CLASSIFIER_FOR_LANGUAGE + getLanguage() + DOES_NOT_EXIST);
            return new CategorizationResult(getLanguage(), COULD_NOT_CATEGORIZE);
        }
        String bestCategory = loaded.classify(publicationMetadataDocument.getContent()).bestCategory();
        if (null != bestCategory) {
            return new CategorizationResult(getLanguage(), bestCategory);
        }
        LOG.debug(TF_IDF_CLASSIFIER_COULD_NOT_CLASSIFY_DOCUMENT_DOC_ID + publicationMetadataDocument.getId() + CLOSE_ROUND_BRACKET_STRING);
        return new CategorizationResult(getLanguage(), COULD_NOT_CATEGORIZE);
    }

    /**
     * Tells whether a compiled classifier is available.
     *
     * @return {@code true} when no classifier could be obtained from memory or from disk
     * @throws CategorizationException if the serialized classifier exists but cannot be loaded
     */
    public boolean isEmpty() {
        return getClassifier() == null;
    }

    /**
     * Trains a fresh TF-IDF classifier on the given corpus and stores both the compiled
     * classifier and the trainer in the storage directory. If the properties carry no date
     * yet, the current time is stored in them and used for the file names.
     *
     * <p>After the compiled classifier has been written, the in-memory classifier is dropped
     * so that the next categorization reloads the re-trained model instead of the old one.
     *
     * @param list corpus entries (document plus category) to train on
     * @throws CategorizationException if the classifier or the trainer cannot be serialized
     */
    public void learn(List<CorpusEntry<PublicationMetadataDocument>> list) {
        synchronized (this.trainerMutex) {
            TfIdfClassifierTrainer<CharSequence> trainer = new TfIdfClassifierTrainer<>(new TokenFeatureExtractor(new IndoEuropeanTokenizerFactory()));
            for (CorpusEntry<PublicationMetadataDocument> corpusEntry : list) {
                trainer.handle(corpusEntry.getDocument().getContent(), new Classification(corpusEntry.getCategory()));
            }
            Date date = this.properties.getDate();
            if (null == date) {
                date = new Date();
                this.properties.setDate(date);
            }
            serializeClassifier(trainer, getClassifierFile(date));
            // The file on disk now holds the new model; forget the cached one so it is reloaded.
            synchronized (this.classifierMutex) {
                this.classifier = null;
            }
            try {
                serializeObject(trainer, getTrainerFile(date));
                LOG.info("Classifier re-trained after corpus changes for language [" + getLanguage() + CLOSEBRACKET_STRING);
            } catch (Exception e) {
                throw new CategorizationException("Could not serialize.", e);
            }
        }
    }

    /**
     * Deletes the given file and logs a warning when it does not exist or cannot be deleted.
     *
     * @param file file to delete
     */
    private void deleteFile(File file) {
        if (file.exists() && file.delete()) {
            return;
        }
        LOG.warn(COULD_NOT_DELETE_FILE + file.getAbsolutePath() + CLOSEBRACKET_STRING);
    }

    /**
     * Returns the cached classifier, loading it from its serialized file on first use.
     *
     * @return the classifier, or {@code null} when the properties carry no date or the
     *         serialized file does not exist
     * @throws CategorizationException if the serialized file exists but cannot be deserialized
     */
    private Classifier<CharSequence, ScoredClassification> getClassifier() {
        synchronized (this.classifierMutex) {
            if (null != this.classifier) {
                return this.classifier;
            }
            Date date = this.properties.getDate();
            File file = null != date ? getClassifierFile(date) : null;
            if (null != file) {
                try {
                    if (file.exists()) {
                        // The file is written by serializeClassifier, which compiles a TF-IDF
                        // classifier over CharSequence input, so the cast matches what was stored.
                        @SuppressWarnings("unchecked")
                        Classifier<CharSequence, ScoredClassification> loaded =
                                (Classifier<CharSequence, ScoredClassification>) deserializeObject(file);
                        this.classifier = loaded;
                    }
                } catch (Exception e) {
                    throw new CategorizationException(DESERIALIZATION_OF_CLASSIFIER_FOR_LANGUAGE + getLanguage() + FAILED_FILE + file.getAbsolutePath() + CLOSE_ROUND_BRACKET_STRING, e);
                }
            }
            if (null == this.classifier) {
                // Only report a missing classifier when nothing was actually loaded.
                LOG.debug(SERIALIZED_CLASSIFIER_FOR_LANGUAGE + getLanguage() + DOES_NOT_EXIST);
            }
            return this.classifier;
        }
    }

    /**
     * Compiles the trainer into a classifier and writes it to the given file.
     *
     * @param tfIdfClassifierTrainer trainer to compile
     * @param file destination file
     * @throws CategorizationException if compiling, writing or closing the file fails
     */
    private void serializeClassifier(TfIdfClassifierTrainer<CharSequence> tfIdfClassifierTrainer, File file) {
        synchronized (this.trainerMutex) {
            // Both streams are declared as resources so the file stream is closed even when
            // the ObjectOutputStream constructor fails, and close/flush errors are reported.
            try (FileOutputStream fileStream = new FileOutputStream(file);
                    ObjectOutputStream objectStream = new ObjectOutputStream(fileStream)) {
                tfIdfClassifierTrainer.compileTo(objectStream);
            } catch (Exception e) {
                throw new CategorizationException(SERIALIZATION_OF_CLASSIFIER_FAILED_LANGUAGE + getLanguage() + FILE + file.getAbsolutePath() + CLOSE_ROUND_BRACKET_STRING, e);
            }
        }
    }

    /**
     * Builds the location of the compiled classifier file for the given date.
     *
     * @param date date whose millisecond value becomes part of the file name
     * @return file inside the storage directory
     */
    private File getClassifierFile(Date date) {
        return new File(this.storageDir, CLASSIFIER + date.getTime() + SERIALIZED);
    }

    /**
     * Builds the location of the serialized trainer file for the given date.
     *
     * @param date date whose millisecond value becomes part of the file name
     * @return file inside the storage directory
     */
    private File getTrainerFile(Date date) {
        return new File(this.storageDir, TRAINER + date.getTime() + SERIALIZED);
    }

    /**
     * Writes the object to the given file using Java serialization.
     *
     * @param obj object to write
     * @param file destination file
     * @throws FileNotFoundException if the file cannot be opened for writing
     * @throws IOException if writing or closing the stream fails
     */
    private void serializeObject(Object obj, File file) throws FileNotFoundException, IOException {
        try (FileOutputStream fileStream = new FileOutputStream(file);
                ObjectOutputStream objectStream = new ObjectOutputStream(fileStream)) {
            objectStream.writeObject(obj);
        }
    }

    /**
     * Reads a single object from the given file using Java serialization.
     *
     * <p>NOTE(review): native deserialization must only be used on trusted files; this
     * assumes the storage directory is written exclusively by this module - confirm.
     *
     * @param file file to read
     * @return the deserialized object
     * @throws FileNotFoundException if the file cannot be opened for reading
     * @throws IOException if reading or closing the stream fails
     * @throws ClassNotFoundException if the class of the stored object cannot be found
     */
    private Object deserializeObject(File file) throws FileNotFoundException, IOException, ClassNotFoundException {
        // The file stream is its own resource so it is closed even when the
        // ObjectInputStream constructor rejects a truncated or corrupt header.
        try (FileInputStream fileStream = new FileInputStream(file);
                ObjectInputStream objectStream = new ObjectInputStream(fileStream)) {
            return objectStream.readObject();
        }
    }

    /**
     * @return the language configured in the classifier properties
     */
    public String getLanguage() {
        return this.properties.getLanguage();
    }

    /**
     * @return the directory holding the serialized classifier and trainer files
     */
    public File getStorageDir() {
        return this.storageDir;
    }
}
