package ws.palladian.extraction.content.evaluation;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.Validate;
import ws.palladian.classification.utils.ClassificationUtils;
import ws.palladian.core.Instance;
import ws.palladian.extraction.content.evaluation.ContentExtractionDataset;
import ws.palladian.helper.collection.AbstractIterator;
import ws.palladian.helper.functional.Predicates;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;

/* loaded from: input_file:ws/palladian/extraction/content/evaluation/TudContentExtractionDataset.class */
/**
 * A {@link ContentExtractionDataset} backed by a directory on disk.
 *
 * <p>The directory must contain an index file named {@code ___index.csv} and a set of
 * {@code .txt} files. Each {@code .txt} file holds the expected text of one page; the
 * HTML file of that page is looked up under the same path with the trailing {@code .txt}
 * replaced by {@code .html}. The index file supplies the value returned by
 * {@link ContentExtractionDataset.ContentExtractionPage#getUrl()}.
 */
public final class TudContentExtractionDataset implements ContentExtractionDataset {
    /** Name of the index file expected inside the dataset directory. */
    private static final String INDEX_FILE_NAME = "___index.csv";

    /** Extension of the files holding the expected text. */
    private static final String TEXT_EXTENSION = ".txt";

    /** Extension of the files holding the HTML pages. */
    private static final String HTML_EXTENSION = ".html";

    /** All {@code .txt} files found in the dataset directory; one per page. */
    private final List<File> txtFiles;

    /** Maps the first column of the index file to its second column. */
    private final Map<String, String> fileUrlMapping;

    /**
     * Creates a dataset from the given directory.
     *
     * @param tudDatasetDirectory the dataset directory, not {@code null}
     * @throws NullPointerException if the directory is {@code null}
     * @throws IllegalArgumentException if the given file is not a directory
     * @throws IllegalStateException if the directory contains no {@code ___index.csv} file
     */
    public TudContentExtractionDataset(File tudDatasetDirectory) {
        Validate.notNull(tudDatasetDirectory, "tudDatasetDirectory must not be null");
        if (!tudDatasetDirectory.isDirectory()) {
            throw new IllegalArgumentException(tudDatasetDirectory + " is not a directory.");
        }
        this.fileUrlMapping = readFileUrlMapping(tudDatasetDirectory);
        this.txtFiles = FileHelper.getFiles(tudDatasetDirectory, Predicates.fileExtension(new String[]{TEXT_EXTENSION}));
    }

    /**
     * Reads the index file of the dataset directory.
     *
     * <p>Every line is split at {@link ClassificationUtils#DEFAULT_SEPARATOR}; lines which do
     * not yield exactly four fields are skipped. For the remaining lines, the first field is
     * mapped to the second one.
     *
     * @param datasetDirectory the dataset directory
     * @return the mapping read from the index file
     * @throws IllegalStateException if the index file does not exist
     */
    private static Map<String, String> readFileUrlMapping(File datasetDirectory) {
        final Map<String, String> mapping = new HashMap<String, String>();
        File indexFile = new File(datasetDirectory, INDEX_FILE_NAME);
        if (!indexFile.isFile()) {
            throw new IllegalStateException(indexFile + " does not exist.");
        }
        FileHelper.performActionOnEveryLine(indexFile, new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                String[] split = line.split(ClassificationUtils.DEFAULT_SEPARATOR);
                if (split.length == 4) {
                    mapping.put(split[0], split[1]);
                }
            }
        });
        return mapping;
    }

    /**
     * Replaces a trailing extension of a file name or path.
     *
     * <p>Only the end of the string is touched, so an occurrence of the extension earlier in
     * the string (for instance in a parent directory name) is left alone.
     *
     * @param name the file name or path
     * @param oldExtension the extension to replace, including the leading dot
     * @param newExtension the text to put in its place
     * @return the name with the trailing extension replaced, or the unchanged name if it does
     *         not end with {@code oldExtension}
     */
    private static String replaceTrailingExtension(String name, String oldExtension, String newExtension) {
        if (!name.endsWith(oldExtension)) {
            return name;
        }
        return name.substring(0, name.length() - oldExtension.length()) + newExtension;
    }

    /**
     * Iterates over the pages of this dataset, one page per {@code .txt} file.
     *
     * @return an iterator over the pages
     */
    @Override
    public Iterator<ContentExtractionDataset.ContentExtractionPage> iterator() {
        return new AbstractIterator<ContentExtractionDataset.ContentExtractionPage>() {
            private final Iterator<File> iterator = TudContentExtractionDataset.this.txtFiles.iterator();

            @Override
            protected ContentExtractionDataset.ContentExtractionPage getNext() throws AbstractIterator.Finished {
                if (!this.iterator.hasNext()) {
                    throw FINISHED;
                }
                final File txtFile = this.iterator.next();
                return new ContentExtractionDataset.ContentExtractionPage() {
                    @Override
                    public File getHtmlFile() {
                        // Swap only the trailing extension; a plain replace() would also
                        // rewrite ".txt" occurring in a directory name.
                        return new File(replaceTrailingExtension(txtFile.getAbsolutePath(), TEXT_EXTENSION, HTML_EXTENSION));
                    }

                    @Override
                    public String getExpectedText() {
                        try {
                            return FileHelper.readFileToString(txtFile);
                        } catch (IOException e) {
                            // Keep the original exception as cause so the actual I/O problem is not lost.
                            throw new IllegalStateException("Could not read " + txtFile + ", make sure, that there is a .txt file for each .html file in the directory.", e);
                        }
                    }

                    @Override
                    public String toString() {
                        return getHtmlFile().getPath();
                    }

                    @Override
                    public String getUrl() {
                        // The index is keyed by the file name with its trailing ".txt" replaced by
                        // Instance.NO_CATEGORY_DUMMY (presumably the empty string -- TODO confirm).
                        // Yields null if the index has no entry for this file.
                        String key = replaceTrailingExtension(txtFile.getName(), TEXT_EXTENSION, Instance.NO_CATEGORY_DUMMY);
                        return TudContentExtractionDataset.this.fileUrlMapping.get(key);
                    }
                };
            }
        };
    }

    /**
     * @return the number of {@code .txt} files, i.e. pages, in this dataset
     */
    @Override
    public int size() {
        return this.txtFiles.size();
    }

    @Override
    public String toString() {
        return "TudContentExtractionDataset";
    }
}
