package ws.palladian.extraction.content.evaluation;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang3.Validate;
import ws.palladian.core.Instance;
import ws.palladian.extraction.content.evaluation.ContentExtractionDataset;
import ws.palladian.helper.collection.AbstractIterator;
import ws.palladian.helper.functional.Filters;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:ws/palladian/extraction/content/evaluation/CleanevalDataset.class */
/**
 * {@link ContentExtractionDataset} built from the <code>.txt</code> files that
 * {@link FileHelper#getFiles} returns for a given directory. Every such file provides the
 * expected text of one page; the HTML file of that page is looked up under the same path with
 * the <code>.txt</code> extension replaced by <code>.html</code>.
 */
public final class CleanevalDataset implements ContentExtractionDataset {
    /** Extension of the files holding the expected text. */
    private static final String TXT_EXTENSION = ".txt";

    /** Extension of the HTML file that belongs to an expected-text file. */
    private static final String HTML_EXTENSION = ".html";

    /** The expected-text files collected once at construction time. */
    private final List<File> txtFiles;

    /**
     * Creates the dataset by collecting all files with extension <code>txt</code> from the given
     * directory.
     *
     * @param file the directory containing the dataset files, not <code>null</code>
     * @throws NullPointerException if <code>file</code> is <code>null</code> (raised by
     *         {@link Validate#notNull})
     */
    public CleanevalDataset(File file) {
        Validate.notNull(file, "cleanEvalDirectory must not be null");
        this.txtFiles = FileHelper.getFiles(file, Filters.fileExtension(new String[]{"txt"}));
    }

    /**
     * Returns an iterator producing one {@link ContentExtractionDataset.ContentExtractionPage}
     * per collected <code>.txt</code> file. The expected text is read lazily, i.e. only when
     * {@link ContentExtractionDataset.ContentExtractionPage#getExpectedText()} is invoked.
     */
    @Override
    public Iterator<ContentExtractionDataset.ContentExtractionPage> iterator() {
        return new AbstractIterator<ContentExtractionDataset.ContentExtractionPage>() {
            private final Iterator<File> iterator = CleanevalDataset.this.txtFiles.iterator();

            @Override
            protected ContentExtractionDataset.ContentExtractionPage getNext() throws AbstractIterator.Finished {
                if (!this.iterator.hasNext()) {
                    throw FINISHED;
                }
                final File next = this.iterator.next();
                return new ContentExtractionDataset.ContentExtractionPage() {
                    @Override
                    public File getHtmlFile() {
                        return new File(CleanevalDataset.toHtmlPath(next.getAbsolutePath()));
                    }

                    @Override
                    public String getExpectedText() {
                        try {
                            return CleanevalDataset.cleanup(FileHelper.readFileToString(next));
                        } catch (IOException e) {
                            // Keep the IOException as cause so the actual I/O problem stays visible.
                            throw new IllegalStateException("Could not read " + next + ", make sure, that there is a .txt file for each .html file in the directory.", e);
                        }
                    }

                    @Override
                    public String toString() {
                        return getHtmlFile().getPath();
                    }

                    /** This dataset does not provide URLs; always returns <code>null</code>. */
                    @Override
                    public String getUrl() {
                        return null;
                    }
                };
            }
        };
    }

    /**
     * Derives the path of the HTML file from the path of an expected-text file by swapping only
     * the trailing <code>.txt</code> extension for <code>.html</code>. Occurrences of
     * <code>.txt</code> elsewhere in the path (e.g. in directory names) are left untouched. A
     * path that does not end with <code>.txt</code> is returned unchanged.
     *
     * @param txtPath path of the expected-text file
     * @return the corresponding HTML file path
     */
    private static String toHtmlPath(String txtPath) {
        if (!txtPath.endsWith(TXT_EXTENSION)) {
            return txtPath;
        }
        return txtPath.substring(0, txtPath.length() - TXT_EXTENSION.length()) + HTML_EXTENSION;
    }

    /**
     * Normalizes the raw content of an expected-text file: substrings matching
     * <code>URL: [^\s]+</code> and tag-like substrings matching <code>&lt;[^&gt;]*&gt;</code> are
     * replaced by {@link Instance#NO_CATEGORY_DUMMY} (presumably the empty string, which would
     * make this a removal; the constant's value is not visible here, TODO confirm). The result is
     * passed through {@link StringHelper#replaceProtectedSpace} and
     * {@link StringHelper#removeLineBreaks}, every run of whitespace is collapsed into a single
     * space, and leading/trailing whitespace is trimmed.
     *
     * @param str the raw file content, not <code>null</code>
     * @return the normalized text
     */
    public static final String cleanup(String str) {
        String stripped = str.replaceAll("URL: [^\\s]+", Instance.NO_CATEGORY_DUMMY)
                .replaceAll("<[^>]*>", Instance.NO_CATEGORY_DUMMY);
        String singleLine = StringHelper.removeLineBreaks(StringHelper.replaceProtectedSpace(stripped));
        return singleLine.replaceAll("\\s+", " ").trim();
    }

    /** Returns the number of <code>.txt</code> files collected at construction time. */
    @Override
    public int size() {
        return this.txtFiles.size();
    }

    @Override
    public String toString() {
        return "CleanEvalDataset";
    }
}
