package ws.palladian.extraction.content.evaluation;

import java.io.File;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.Validate;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import ws.palladian.extraction.content.evaluation.ContentExtractionDataset;
import ws.palladian.helper.collection.AbstractIterator;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.retrieval.parser.DocumentParser;
import ws.palladian.retrieval.parser.ParserException;
import ws.palladian.retrieval.parser.ParserFactory;

/* loaded from: input_file:ws/palladian/extraction/content/evaluation/BoilerpipeDataset.class */
/**
 * {@link ContentExtractionDataset} backed by a directory on disk which contains a
 * {@code url-mapping.txt} index file plus the sub directories {@code original} (raw HTML pages)
 * and {@code annotated} (the same pages with {@code x-nc-sel*} class annotations).
 *
 * <p>
 * Each page is identified by the UUID listed in {@code url-mapping.txt}; the expected text of a
 * page is obtained by evaluating the XPath of the configured {@link Mode} on the annotated file.
 */
public final class BoilerpipeDataset implements ContentExtractionDataset {

    /** Name of the index file, located directly inside the dataset directory. */
    private static final String URL_MAPPING_FILE_NAME = "url-mapping.txt";

    /**
     * Format of one index line: {@code <urn:uuid:UUID> URL}. Group 1 is the UUID, group 2 the URL.
     * Compiled once, as the expression never changes.
     */
    private static final Pattern URL_MAPPING_PATTERN = Pattern.compile("<urn:uuid:([a-z0-9\\-]*?)>\\s(.*?)");

    /** Root directory of the dataset. */
    private final File datasetDirectory;

    /** UUID to URL mapping as read from the index file. */
    private final Map<String, String> uuidUrlMapping;

    /** Determines which annotated text nodes make up the expected text. */
    private final Mode mode;

    /** Parser used for reading the annotated HTML files. */
    private final DocumentParser htmlParser = ParserFactory.createHtmlParser();

    /**
     * Selects which annotated parts of a page are returned as expected text. Both modes look at
     * the nearest ancestor element whose {@code class} attribute contains {@code x-nc-sel}.
     */
    public enum Mode {
        /** Text nodes whose nearest annotated ancestor has the class {@code x-nc-sel2}. */
        MAIN_CONTENT("//text()[ancestor::*[contains(@class,'x-nc-sel')][1]/@class='x-nc-sel2']"),
        /** Text nodes whose nearest annotated ancestor has the class {@code x-nc-sel2} or {@code x-nc-sel5}. */
        WHOLE_CONTENT("//text()[ancestor::*[contains(@class,'x-nc-sel')][1]/@class='x-nc-sel2' or ancestor::*[contains(@class,'x-nc-sel')][1]/@class='x-nc-sel5']");

        /** XPath expression which selects the text nodes for this mode. */
        private final String xPath;

        Mode(String str) {
            this.xPath = str;
        }
    }

    /**
     * Creates a new dataset and eagerly reads the UUID/URL index file.
     *
     * @param file The dataset directory, not <code>null</code>.
     * @param mode The mode which determines the expected text, not <code>null</code>.
     * @throws NullPointerException In case one of the arguments is <code>null</code>.
     * @throws IllegalArgumentException In case the given file is not a directory.
     * @throws IllegalStateException In case the directory contains no {@code url-mapping.txt} file.
     */
    public BoilerpipeDataset(File file, Mode mode) {
        Validate.notNull(file, "boilerpipeDatasetDirectory must not be null");
        Validate.notNull(mode, "mode must not be null");
        if (!file.isDirectory()) {
            throw new IllegalArgumentException(file + " is not a directory.");
        }
        this.datasetDirectory = file;
        this.uuidUrlMapping = readFileUrlMapping(file);
        this.mode = mode;
    }

    /**
     * Reads the UUID to URL mapping from the index file inside the given directory. Lines which do
     * not match the expected format are skipped.
     *
     * @param file The dataset directory.
     * @return A map with UUIDs as keys and URLs as values.
     * @throws IllegalStateException In case the index file does not exist.
     */
    private static Map<String, String> readFileUrlMapping(File file) {
        File mappingFile = new File(file, URL_MAPPING_FILE_NAME);
        if (!mappingFile.isFile()) {
            throw new IllegalStateException(mappingFile + " does not exist.");
        }
        final Map<String, String> mapping = new HashMap<String, String>();
        FileHelper.performActionOnEveryLine(mappingFile, new LineAction() {
            @Override
            public void performAction(String str, int i) {
                Matcher matcher = URL_MAPPING_PATTERN.matcher(str);
                // a successful match always provides both groups, no further check necessary
                if (matcher.matches()) {
                    mapping.put(matcher.group(1), matcher.group(2));
                }
            }
        });
        return mapping;
    }

    /**
     * Iterates all pages listed in the index file. The returned pages are lightweight: files are
     * only resolved or parsed when the corresponding getter is invoked.
     */
    @Override
    public Iterator<ContentExtractionDataset.ContentExtractionPage> iterator() {
        return new AbstractIterator<ContentExtractionDataset.ContentExtractionPage>() {
            private final Iterator<String> uuidIterator = BoilerpipeDataset.this.uuidUrlMapping.keySet().iterator();

            @Override
            protected ContentExtractionDataset.ContentExtractionPage getNext() throws AbstractIterator.Finished {
                if (!this.uuidIterator.hasNext()) {
                    throw FINISHED;
                }
                final String uuid = this.uuidIterator.next();
                return new ContentExtractionDataset.ContentExtractionPage() {
                    @Override
                    public File getHtmlFile() {
                        return new File(BoilerpipeDataset.this.datasetDirectory, "/original/" + uuid + ".html");
                    }

                    /**
                     * Parses the annotated file and concatenates the content of all text nodes
                     * selected by the mode's XPath, each followed by a single space.
                     *
                     * @throws IllegalStateException In case the annotated file cannot be parsed.
                     */
                    @Override
                    public String getExpectedText() {
                        File annotatedFile = new File(BoilerpipeDataset.this.datasetDirectory, "/annotated/" + uuid + ".html");
                        try {
                            Document document = BoilerpipeDataset.this.htmlParser.parse(annotatedFile);
                            StringBuilder expectedText = new StringBuilder();
                            for (Node textNode : XPathHelper.getXhtmlNodes(document, BoilerpipeDataset.this.mode.xPath)) {
                                expectedText.append(textNode.getTextContent()).append(" ");
                            }
                            return expectedText.toString();
                        } catch (ParserException e) {
                            // keep the parser exception as cause, so that the actual reason is not lost
                            throw new IllegalStateException("Could not read or parse " + annotatedFile + ".", e);
                        }
                    }

                    @Override
                    public String toString() {
                        return getHtmlFile().getPath();
                    }

                    @Override
                    public String getUrl() {
                        return BoilerpipeDataset.this.uuidUrlMapping.get(uuid);
                    }
                };
            }
        };
    }

    /**
     * @return The number of entries read from the index file.
     */
    @Override
    public int size() {
        return this.uuidUrlMapping.size();
    }

    @Override
    public String toString() {
        return "L3S-GN1-Dataset(" + this.mode + ")";
    }
}
