package ws.palladian.extraction.content.evaluation;

import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.extraction.content.PalladianContentExtractor;
import ws.palladian.extraction.content.ReadabilityContentExtractor;
import ws.palladian.extraction.content.WebPageContentExtractor;
import ws.palladian.extraction.content.evaluation.BoilerpipeDataset;
import ws.palladian.extraction.content.evaluation.ContentExtractionDataset;
import ws.palladian.helper.ProgressMonitor;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.nlp.CharacterNGramSimilarity;
import ws.palladian.helper.nlp.JaroWinklerSimilarity;
import ws.palladian.helper.nlp.LevenshteinSimilarity;
import ws.palladian.helper.nlp.StringMetric;

/* loaded from: input_file:ws/palladian/extraction/content/evaluation/ContentExtractorEvaluation.class */
/**
 * Runs every registered {@link WebPageContentExtractor} against every registered
 * {@link ContentExtractionDataset} and writes the results as CSV files.
 *
 * <p>
 * For each extractor/dataset combination the extracted text of every page is compared with the page's expected
 * text using all metrics in {@link #SIMILARITIES}. Additionally, it is checked whether the first and the last
 * {@value #BOUNDARY_LENGTH} characters of both texts are identical. Two kinds of files are produced through
 * {@link FileHelper}: one detail file per extractor/dataset combination and one summary file per call of
 * {@link #evaluate()} with one line per combination.
 * </p>
 */
public final class ContentExtractorEvaluation {
    private static final Logger LOGGER = LoggerFactory.getLogger(ContentExtractorEvaluation.class);

    /** Number of characters at the beginning and at the end of a text used for the start/end check. */
    private static final int BOUNDARY_LENGTH = 25;

    /** The string metrics applied to each (expected text, extracted text) pair, in output column order. */
    private static final List<StringMetric> SIMILARITIES = createSimilarities();

    private final List<WebPageContentExtractor> extractors = new ArrayList<WebPageContentExtractor>();
    private final List<ContentExtractionDataset> datasets = new ArrayList<ContentExtractionDataset>();

    /**
     * Creates the list of string metrics used by the evaluation.
     *
     * @return A new list holding a Levenshtein, a character 5-gram and a Jaro-Winkler similarity, in this order.
     */
    private static List<StringMetric> createSimilarities() {
        List<StringMetric> similarities = new ArrayList<StringMetric>();
        similarities.add(new LevenshteinSimilarity());
        similarities.add(new CharacterNGramSimilarity(5));
        similarities.add(new JaroWinklerSimilarity());
        return similarities;
    }

    /**
     * Registers an extractor to be evaluated.
     *
     * @param webPageContentExtractor The extractor to add, not <code>null</code>.
     */
    public void addExtractor(WebPageContentExtractor webPageContentExtractor) {
        Validate.notNull(webPageContentExtractor, "extractor must not be null", new Object[0]);
        this.extractors.add(webPageContentExtractor);
    }

    /**
     * Registers a dataset to evaluate the extractors on.
     *
     * @param contentExtractionDataset The dataset to add, not <code>null</code>.
     */
    public void addDataset(ContentExtractionDataset contentExtractionDataset) {
        Validate.notNull(contentExtractionDataset, "dataset must not be null", new Object[0]);
        this.datasets.add(contentExtractionDataset);
    }

    /**
     * Evaluates all registered extractors on all registered datasets.
     *
     * <p>
     * A summary file named <code>_contentExtractorEvaluation_&lt;timestamp&gt;.csv</code> receives a header line
     * and then one line per extractor/dataset combination. The per-page results of each combination go to a
     * separate detail file.
     * </p>
     */
    public void evaluate() {
        ProgressMonitor progressMonitor = new ProgressMonitor();

        // Total number of pages over all datasets; each page is processed once per extractor.
        long totalPages = 0;
        for (ContentExtractionDataset dataset : this.datasets) {
            totalPages += dataset.size();
        }
        progressMonitor.startTask("ContentExtractorEvaluation", totalPages * this.extractors.size());

        String summaryFileName = "_contentExtractorEvaluation_" + System.currentTimeMillis() + ".csv";

        StringBuilder header = new StringBuilder().append("extractor;dataset;");
        for (StringMetric similarity : SIMILARITIES) {
            header.append(similarity.toString()).append(';');
        }
        header.append("startCorrect;endCorrect;time\n");
        FileHelper.appendFile(summaryFileName, header);

        for (WebPageContentExtractor extractor : this.extractors) {
            for (ContentExtractionDataset dataset : this.datasets) {
                evaluateDataset(extractor, dataset, progressMonitor, summaryFileName);
            }
        }
    }

    /**
     * Evaluates a single extractor on a single dataset, writes the detail file for this combination and appends
     * one line to the summary file.
     *
     * @param extractor The extractor to evaluate.
     * @param dataset The dataset providing the pages and their expected texts.
     * @param progressMonitor The monitor which is incremented once per processed page.
     * @param summaryFileName Name of the summary file to which the aggregated line is appended.
     */
    private void evaluateDataset(WebPageContentExtractor extractor, ContentExtractionDataset dataset,
            ProgressMonitor progressMonitor, String summaryFileName) {
        StringBuilder pageRows = new StringBuilder();
        double[] similaritySums = new double[SIMILARITIES.size()];
        int startCorrectCount = 0;
        int endCorrectCount = 0;
        StopWatch stopWatch = new StopWatch();

        for (ContentExtractionDataset.ContentExtractionPage page : dataset) {
            File htmlFile = page.getHtmlFile();
            try {
                extractor.setDocument(htmlFile, true);
            } catch (Exception e) {
                // Best effort: the failure is logged and the page is still scored below.
                // NOTE(review): getResultText() then returns whatever the extractor holds after the failed call,
                // possibly content of a previous page -- confirm against the extractor implementations.
                LOGGER.warn("Encountered {} for {}", e, page);
            }
            String expectedText = page.getExpectedText().trim();
            String extractedText = extractor.getResultText().trim();

            pageRows.append(htmlFile.getName()).append(';');
            for (int i = 0; i < SIMILARITIES.size(); i++) {
                double similarity = SIMILARITIES.get(i).getSimilarity(expectedText, extractedText);
                similaritySums[i] += similarity;
                pageRows.append(similarity).append(';');
            }

            // The start/end check is only performed when both texts are longer than BOUNDARY_LENGTH characters;
            // otherwise both flags stay false.
            boolean startCorrect = false;
            boolean endCorrect = false;
            if (expectedText.length() > BOUNDARY_LENGTH && extractedText.length() > BOUNDARY_LENGTH) {
                String expectedStart = expectedText.substring(0, BOUNDARY_LENGTH);
                String extractedStart = extractedText.substring(0, BOUNDARY_LENGTH);
                String expectedEnd = expectedText.substring(expectedText.length() - BOUNDARY_LENGTH);
                String extractedEnd = extractedText.substring(extractedText.length() - BOUNDARY_LENGTH);
                startCorrect = expectedStart.equals(extractedStart);
                endCorrect = expectedEnd.equals(extractedEnd);
            }
            if (startCorrect) {
                startCorrectCount++;
            }
            if (endCorrect) {
                endCorrectCount++;
            }
            pageRows.append(startCorrect).append(';');
            pageRows.append(endCorrect);
            pageRows.append('\n');
            progressMonitor.increment();
        }

        int numPages = dataset.size();
        // Floating point division: an empty dataset yields NaN here instead of an ArithmeticException.
        double startCorrectPercent = (100.0d * startCorrectCount) / numPages;
        double endCorrectPercent = (100.0d * endCorrectCount) / numPages;

        // Detail file: human readable header followed by the per-page rows.
        StringBuilder details = new StringBuilder();
        details.append(extractor.getExtractorName()).append('\n');
        details.append(dataset.toString()).append("\n\n");
        details.append("Time: ").append(stopWatch.getElapsedTime());
        details.append(" (").append(stopWatch.getElapsedTimeString()).append(')').append('\n');
        details.append('\n');
        details.append("Average similarities:\n");
        for (int i = 0; i < SIMILARITIES.size(); i++) {
            details.append(SIMILARITIES.get(i).toString());
            details.append(": ");
            details.append(similaritySums[i] / numPages).append('\n');
        }
        details.append('\n');
        details.append("# pages: ").append(numPages).append('\n');
        details.append("% correct start: ").append(startCorrectPercent).append('\n');
        details.append("% correct end: ").append(endCorrectPercent).append('\n');
        details.append('\n');
        details.append("Individual similarities:\n");
        details.append((CharSequence) pageRows);
        String detailFileName = "ContentExtractorEvaluation_" + extractor.getExtractorName() + "_"
                + dataset.toString() + "_" + System.currentTimeMillis() + ".csv";
        FileHelper.writeToFile(detailFileName, details);

        // Summary line: same column order as the header written by evaluate().
        StringBuilder summaryLine = new StringBuilder();
        summaryLine.append(extractor.getExtractorName()).append(';');
        summaryLine.append(dataset.toString()).append(';');
        appendAverages(summaryLine, similaritySums, numPages);
        summaryLine.append(startCorrectPercent).append(';');
        summaryLine.append(endCorrectPercent).append(';');
        summaryLine.append(stopWatch.getElapsedTime()).append('\n');
        FileHelper.appendFile(summaryFileName, summaryLine);
    }

    /**
     * Appends the average of each summed similarity, each followed by a semicolon.
     *
     * @param target The builder to append to.
     * @param similaritySums The summed similarity values, one entry per metric.
     * @param numPages The number of pages by which each sum is divided.
     */
    private static void appendAverages(StringBuilder target, double[] similaritySums, int numPages) {
        for (double similaritySum : similaritySums) {
            target.append(similaritySum / numPages).append(';');
        }
    }

    /**
     * Runs the evaluation with the Readability and Palladian extractors on the CleanEval, TUD and Boilerpipe
     * datasets. The dataset locations are hard-coded local paths and must exist on the executing machine.
     *
     * @param strArr Command line arguments, not used.
     */
    public static void main(String[] strArr) {
        ContentExtractorEvaluation evaluation = new ContentExtractorEvaluation();
        evaluation.addExtractor(new ReadabilityContentExtractor());
        evaluation.addExtractor(new PalladianContentExtractor());
        evaluation.addDataset(new CleanevalDataset(new File("/Users/pk/Desktop/CleanEval")));
        evaluation.addDataset(new TudContentExtractionDataset(new File("/Users/pk/Desktop/TUD_ContentExtractionDataset_2014-01-28")));
        evaluation.addDataset(new BoilerpipeDataset(new File("/Users/pk/Desktop/L3S-GN1-20100130203947-00001"), BoilerpipeDataset.Mode.MAIN_CONTENT));
        evaluation.addDataset(new BoilerpipeDataset(new File("/Users/pk/Desktop/L3S-GN1-20100130203947-00001"), BoilerpipeDataset.Mode.WHOLE_CONTENT));
        evaluation.evaluate();
    }
}
