package pl.edu.icm.yadda.categorization.classifier.lingpipe.tfidf;

import com.aliasi.classify.Classification;
import com.aliasi.classify.Classifier;
import com.aliasi.classify.ScoredClassification;
import com.aliasi.classify.TfIdfClassifierTrainer;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenFeatureExtractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.lang.time.FastDateFormat;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import pl.edu.icm.yadda.categorization.classifier.CategoryClassifier;
import pl.edu.icm.yadda.categorization.classifier.impl.CategoriesInfoImpl;
import pl.edu.icm.yadda.categorization.classifier.impl.ClassifierProperties;
import pl.edu.icm.yadda.categorization.corpus.CategoryHistory;
import pl.edu.icm.yadda.categorization.corpus.Corpus;
import pl.edu.icm.yadda.categorization.corpus.CorpusChangedEvent;
import pl.edu.icm.yadda.categorization.errors.CategorizationException;
import pl.edu.icm.yadda.common.utils.Utils;
import pl.edu.icm.yadda.service2.categorization.CDocument;
import pl.edu.icm.yadda.service2.categorization.CategoriesInfo;
import pl.edu.icm.yadda.service2.categorization.CategorizationResult;
import pl.edu.icm.yadda.service2.categorization.CorpusDocument;

/* loaded from: input_file:pl/edu/icm/yadda/categorization/classifier/lingpipe/tfidf/TfIdfClassifier.class */
/**
 * {@link CategoryClassifier} for a single language, backed by a LingPipe TF-IDF classifier.
 *
 * <p>Two files are kept in the storage directory for the current state, both named after the
 * timestamp held in {@link ClassifierProperties#getDate()}:
 * <ul>
 *   <li>{@code classifier-&lt;millis&gt;.serialized} - the compiled classifier used by
 *       {@link #categorize(CDocument)};</li>
 *   <li>{@code trainer-&lt;millis&gt;.serialized} - the serialized trainer, reloaded so that later
 *       corpus changes can be added to it.</li>
 * </ul>
 *
 * <p>The compiled classifier is loaded lazily and cached; the cache is dropped whenever
 * {@link #corpusChanged(CorpusChangedEvent)} moves the state to a new date.
 */
public class TfIdfClassifier implements CategoryClassifier {
    private static final Log log = LogFactory.getLog(TfIdfClassifier.class);

    private final ClassifierProperties properties;
    private final File storageDir;

    /** Lazily loaded compiled classifier; guarded by {@link #classifierMutex}. */
    private Classifier<CharSequence, ScoredClassification> _classifier;
    private final Object classifierMutex = new Object();
    private final Object trainerMutex = new Object();
    private final Object categoriesMutex = new Object();

    /** Cached category names; guarded by {@link #categoriesMutex}. */
    private Set<String> _categories;

    /**
     * Creates a classifier for the language named in the given properties.
     *
     * @param classifierProperties properties holding the language and the date of the last update
     * @param file directory in which the serialized classifier and trainer are stored
     * @throws CategorizationException if the properties do not specify a language
     */
    public TfIdfClassifier(ClassifierProperties classifierProperties, File file) throws CategorizationException {
        if (Utils.emptyStr(classifierProperties.getLanguage())) {
            throw new CategorizationException("Language not specified");
        }
        this.properties = classifierProperties;
        this.storageDir = file;

        Date lastUpdate = classifierProperties.getDate();
        String formattedDate = lastUpdate != null
                ? FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss.SSS").format(lastUpdate)
                : null;
        log.info("TfIdf classifier for language [" + getLanguage() + "] created (last update: " + formattedDate + ")");
    }

    /**
     * Classifies the text of the given document.
     *
     * @param cDocument document to categorize
     * @return a scored result with the best category; a non-score result with status
     *         {@code EMPTY_LANGUAGE_CORPUS} when no compiled classifier is available, or
     *         {@code NOT_CATEGORIZED} when the classifier reports no best category
     * @throws CategorizationException if the stored classifier cannot be deserialized
     */
    public CategorizationResult categorize(CDocument cDocument) throws CategorizationException {
        Classifier<CharSequence, ScoredClassification> classifier = getClassifier();
        if (classifier == null) {
            log.warn("Classifier for language [" + getLanguage() + "] does not exist");
            return CategorizationResult.createNonScoreResult(cDocument.getId(), CategorizationResult.Status.EMPTY_LANGUAGE_CORPUS);
        }
        ScoredClassification classification = classifier.classify(cDocument.getText());
        String bestCategory = classification.bestCategory();
        if (bestCategory != null) {
            return new CategorizationResult(cDocument.getId(), bestCategory, getLanguage(), classification.score(0));
        }
        log.debug("TfIdf classifier could not classify document (doc id:" + cDocument.getId() + ")");
        return CategorizationResult.createNonScoreResult(cDocument.getId(), CategorizationResult.Status.NOT_CATEGORIZED);
    }

    /**
     * Tells whether a compiled classifier is available.
     *
     * @return {@code true} when no compiled classifier could be loaded for the current date
     * @throws CategorizationException if the stored classifier cannot be deserialized
     */
    public boolean isEmpty() throws CategorizationException {
        return getClassifier() == null;
    }

    /**
     * Re-trains the classifier after a corpus change.
     *
     * <p>If the corpus history between the last update and the event end date contains a
     * {@code DELETE} operation, the trainer is rebuilt from the whole corpus; otherwise the stored
     * trainer is reloaded and only the documents from that period are added. New classifier and
     * trainer files are written under the event end date, the cached classifier is dropped and the
     * files of the previous date are removed.
     *
     * <p>Any failure is logged and not propagated to the caller.
     *
     * @param corpusChangedEvent event carrying the corpus and the end date of the change window
     */
    public void corpusChanged(CorpusChangedEvent corpusChangedEvent) {
        try {
            log.debug("Processing of corpus changed event started...");
            Corpus corpus = corpusChangedEvent.getCorpus();
            Date endDate = corpusChangedEvent.getEndDate();
            Date fromDate = this.properties.getDate();

            // A deletion cannot be applied to an existing trainer, so it forces a full rebuild.
            boolean rebuildFromScratch = false;
            Iterator<?> history = corpus.getHistory(fromDate, endDate, getLanguage()).getHistory().iterator();
            while (history.hasNext()) {
                if (((CategoryHistory) history.next()).getOperation() == CategoryHistory.Operation.DELETE) {
                    rebuildFromScratch = true;
                    break;
                }
            }
            if (rebuildFromScratch) {
                // A null start date is passed on to corpus.iterate for the rebuild.
                fromDate = null;
            }

            Iterator<?> documents = corpus.iterate((String) null, getLanguage(), fromDate, endDate);
            synchronized (this.trainerMutex) {
                TfIdfClassifierTrainer<CharSequence> trainer = rebuildFromScratch ? null : deserializeTrainer();
                if (trainer == null) {
                    trainer = new TfIdfClassifierTrainer<>(new TokenFeatureExtractor(new IndoEuropeanTokenizerFactory()));
                }

                boolean trainedAny = false;
                while (documents.hasNext()) {
                    CorpusDocument document = (CorpusDocument) documents.next();
                    trainer.handle(document.getText(), new Classification(document.getCategory()));
                    trainedAny = true;
                }
                setCategories(trainer.categories());

                if (!rebuildFromScratch && !trainedAny) {
                    log.info("No changes in corpus for language [" + getLanguage() + "]");
                    return;
                }

                // Without any training document here the trainer is a fresh, empty one (a rebuild
                // over an empty corpus); nothing is written and the classifier becomes empty.
                if (trainedAny) {
                    serializeClassifier(trainer, getClassifierFile(endDate));
                    serializeObject(trainer, getTrainerFile(endDate));
                    log.info("Classifier re-trained after corpus changes for language [" + getLanguage() + "]");
                }

                Date previousDate = this.properties.getDate();
                this.properties.setDate(endDate);
                synchronized (this.classifierMutex) {
                    // Force a reload from the new file on next use.
                    this._classifier = null;
                }
                if (previousDate != null) {
                    // File names are derived from the timestamp: when the old and new dates are
                    // equal, the "old" files are the ones just written and must be kept.
                    boolean overwrittenInPlace = trainedAny && previousDate.getTime() == endDate.getTime();
                    if (!overwrittenInPlace) {
                        deleteFile(getClassifierFile(previousDate));
                        deleteFile(getTrainerFile(previousDate));
                    }
                }
                log.debug("Processing of corpus changed event finished");
            }
        } catch (Exception e) {
            log.error("Error occured while processing corpus changes", e);
        }
    }

    /**
     * Deletes the given file if it exists, logging a warning when the deletion fails.
     *
     * @param file file to remove
     */
    private void deleteFile(File file) {
        if (file.exists() && !file.delete()) {
            log.warn("Could not delete file [" + file.getAbsolutePath() + "]");
        }
    }

    /**
     * Returns the cached compiled classifier, loading it from the storage directory on first use.
     *
     * @return the classifier, or {@code null} when no date is set or the file does not exist
     * @throws CategorizationException if the file exists but cannot be deserialized
     */
    private Classifier<CharSequence, ScoredClassification> getClassifier() throws CategorizationException {
        synchronized (this.classifierMutex) {
            if (this._classifier == null) {
                File file = existingFileOrNull(this.properties.getDate() != null
                        ? getClassifierFile(this.properties.getDate())
                        : null);
                if (file == null) {
                    log.debug("Serialized classifier for language [" + getLanguage() + "] does not exist");
                } else {
                    try {
                        @SuppressWarnings("unchecked") // the file is written by serializeClassifier
                        Classifier<CharSequence, ScoredClassification> loaded =
                                (Classifier<CharSequence, ScoredClassification>) deserializeObject(file);
                        this._classifier = loaded;
                    } catch (Exception e) {
                        throw new CategorizationException("Deserialization of classifier for language [" + getLanguage() + "] failed (file: " + file.getAbsolutePath() + ")", e);
                    }
                }
            }
            return this._classifier;
        }
    }

    /**
     * Compiles the trainer into a classifier and writes it to the given file.
     *
     * @param tfIdfClassifierTrainer trainer to compile
     * @param file destination file
     * @throws CategorizationException if compiling or writing fails
     */
    private void serializeClassifier(TfIdfClassifierTrainer<CharSequence> tfIdfClassifierTrainer, File file) throws CategorizationException {
        synchronized (this.trainerMutex) {
            try (FileOutputStream fileOut = new FileOutputStream(file);
                    ObjectOutputStream objectOut = new ObjectOutputStream(fileOut)) {
                tfIdfClassifierTrainer.compileTo(objectOut);
            } catch (Exception e) {
                throw new CategorizationException("Serialization of classifier failed (language=" + getLanguage() + ", file=" + file.getAbsolutePath() + ")", e);
            }
        }
    }

    /**
     * Loads the serialized trainer stored for the current date.
     *
     * @return the trainer, or {@code null} when no date is set or the file does not exist
     * @throws CategorizationException if the file exists but cannot be deserialized
     */
    private TfIdfClassifierTrainer<CharSequence> deserializeTrainer() throws CategorizationException {
        File file = existingFileOrNull(this.properties.getDate() != null
                ? getTrainerFile(this.properties.getDate())
                : null);
        if (file == null) {
            log.debug("Serialized trainer for language [" + getLanguage() + "] does not exist");
            return null;
        }
        try {
            @SuppressWarnings("unchecked") // the file is written by serializeObject with a trainer
            TfIdfClassifierTrainer<CharSequence> trainer =
                    (TfIdfClassifierTrainer<CharSequence>) deserializeObject(file);
            return trainer;
        } catch (Exception e) {
            throw new CategorizationException("Deserialization of trainer failed (file: " + file.getAbsolutePath() + ")", e);
        }
    }

    /**
     * Returns the given file when it is non-null and exists on disk, otherwise {@code null}.
     */
    private static File existingFileOrNull(File file) {
        return file != null && file.exists() ? file : null;
    }

    /**
     * Builds the path of the compiled classifier file for the given date.
     */
    private File getClassifierFile(Date date) {
        return new File(this.storageDir, "classifier-" + date.getTime() + ".serialized");
    }

    /**
     * Builds the path of the serialized trainer file for the given date.
     */
    private File getTrainerFile(Date date) {
        return new File(this.storageDir, "trainer-" + date.getTime() + ".serialized");
    }

    /**
     * Writes the object to the file using Java serialization; streams are always closed.
     *
     * @param obj object to write
     * @param file destination file
     * @throws IOException if the file cannot be opened or written
     */
    private void serializeObject(Object obj, File file) throws IOException {
        try (FileOutputStream fileOut = new FileOutputStream(file);
                ObjectOutputStream objectOut = new ObjectOutputStream(fileOut)) {
            objectOut.writeObject(obj);
        }
    }

    /**
     * Reads a single object from the file using Java serialization; streams are always closed.
     *
     * <p>NOTE(review): native deserialization is only safe for trusted input - the storage
     * directory should contain nothing but files written by this class.
     *
     * @param file source file
     * @return the deserialized object
     * @throws IOException if the file cannot be opened or read
     * @throws ClassNotFoundException if the class of the stored object cannot be found
     */
    private Object deserializeObject(File file) throws IOException, ClassNotFoundException {
        try (FileInputStream fileIn = new FileInputStream(file);
                ObjectInputStream objectIn = new ObjectInputStream(fileIn)) {
            return objectIn.readObject();
        }
    }

    /**
     * @return the language this classifier was created for
     */
    public String getLanguage() throws CategorizationException {
        return this.properties.getLanguage();
    }

    /**
     * @return the directory holding the serialized classifier and trainer files
     */
    public File getStorageDir() {
        return this.storageDir;
    }

    /**
     * Replaces the cached category names with a copy of the given set.
     */
    private void setCategories(Set<String> set) {
        synchronized (this.categoriesMutex) {
            this._categories = new HashSet<String>(set);
        }
    }

    /**
     * Returns the categories known to the trainer, all registered under this classifier's language.
     *
     * <p>The category set is cached; on first use it is read from the stored trainer, and is empty
     * when no trainer file exists.
     *
     * @return categories info for this classifier's language
     * @throws CategorizationException if the stored trainer cannot be deserialized
     */
    public CategoriesInfo getCategoriesInfo() throws CategorizationException {
        synchronized (this.categoriesMutex) {
            if (this._categories == null) {
                TfIdfClassifierTrainer<CharSequence> trainer = deserializeTrainer();
                this._categories = trainer == null
                        ? new HashSet<String>()
                        : new HashSet<String>(trainer.categories());
            }
            CategoriesInfoImpl categoriesInfo = new CategoriesInfoImpl();
            String language = getLanguage();
            for (String category : this._categories) {
                categoriesInfo.addCategory(language, category);
            }
            return categoriesInfo;
        }
    }
}
