package ws.palladian.classification.webpage;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.StringInputStream;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.retrieval.DocumentRetriever;
import ws.palladian.retrieval.parser.ParserException;
import ws.palladian.retrieval.parser.ParserFactory;
import ws.palladian.retrieval.resources.WebContent;
import ws.palladian.retrieval.resources.WebImage;

/* loaded from: input_file:ws/palladian/classification/webpage/ContentTypeClassifier.class */
public class ContentTypeClassifier extends RuleBasedPageClassifier<ContentType> {
    private static final Logger LOGGER = LoggerFactory.getLogger(ContentTypeClassifier.class);
    private static final String[] SEARCH_TRIGGERS = {"suchergebnis", "suchergebnisse", "search result", "search results"};

    public ContentType classify(Document document) {
        extractFeatures(document);
        LOGGER.info("starting to classify a new document");
        int i = 0;
        String[] strArr = {"mehr", "weiterlesen", "artikel lesen", "[...]"};
        ArrayList<WebContent> arrayList = new ArrayList();
        arrayList.addAll(getIngoingLinks());
        arrayList.addAll(getOutgoingLinks());
        if (getPageTitle().toLowerCase().indexOf("suche") > -1 || headlineContainsSearchTrigger()) {
            return ContentType.SEARCH_RESULTS;
        }
        if (getHighestNumberOfConsecutiveSentences() >= 4) {
            return ContentType.CONTENT;
        }
        for (WebContent webContent : arrayList) {
            for (String str : strArr) {
                if (webContent.getTitle().toLowerCase().indexOf(str) > -1) {
                    i++;
                }
                if (i >= 7) {
                    return ContentType.OVERVIEW;
                }
            }
        }
        int i2 = 0;
        Iterator<WebImage> it = getImages().iterator();
        while (it.hasNext()) {
            if (it.next().getSize() > 10000) {
                i2++;
            }
            if (i2 >= 10) {
                return ContentType.OVERVIEW;
            }
        }
        return getPaginationLinks().size() > 3 ? ContentType.OVERVIEW : (getHighestNumberOfConsecutiveSentences() < 4 || getPageSentences().length() < 1000 || getPageSentences().toLowerCase().indexOf("read the rest here:") > -1 || getPageSentences().toLowerCase().indexOf("read the original post:") > -1 || getPageSentences().toLowerCase().indexOf("continued here:") > -1 || getPageSentences().toLowerCase().indexOf("see the rest here:") > -1) ? ContentType.SPAM : ContentType.CONTENT;
    }

    private boolean headlineContainsSearchTrigger() {
        Iterator<String> it = getHeadlineContents().iterator();
        while (it.hasNext()) {
            String lowerCase = it.next().toLowerCase();
            for (String str : SEARCH_TRIGGERS) {
                if (lowerCase.equalsIgnoreCase(str) || lowerCase.indexOf(str + " ") > -1 || lowerCase.indexOf(str + ":") > -1) {
                    return true;
                }
            }
        }
        return false;
    }

    /* JADX WARN: Can't rename method to resolve collision */
    @Override // ws.palladian.classification.webpage.RuleBasedPageClassifier
    public ContentType classify(String str) {
        try {
            Document parse = ParserFactory.createHtmlParser().parse(new StringInputStream(str));
            parse.setDocumentURI("http://net-clipping.de");
            return classify(parse);
        } catch (ParserException e) {
            LOGGER.error(e.getMessage());
            return ContentType.UNKNOWN;
        }
    }

    public ContentType classify(File file) {
        return classify(new DocumentRetriever().getWebDocument(file.getPath()));
    }

    public ContentType classify(URL url) {
        return classify(new DocumentRetriever().getWebDocument(url.toString()));
    }

    public boolean isUseful(File file) {
        return isUseful(new DocumentRetriever().getWebDocument(file.getPath()));
    }

    public boolean isUseful(String str) {
        try {
            Document parse = ParserFactory.createHtmlParser().parse(new StringInputStream(str));
            parse.setDocumentURI("http://net-clipping.de");
            return isUseful(parse);
        } catch (ParserException e) {
            LOGGER.error(e.getMessage());
            return false;
        }
    }

    public boolean isUseful(Document document) {
        return classify(document).equals(ContentType.CONTENT);
    }

    public static void main(String[] strArr) throws IOException {
        StopWatch stopWatch = new StopWatch();
        ContentTypeClassifier contentTypeClassifier = new ContentTypeClassifier();
        HashMap hashMap = new HashMap();
        System.out.println(contentTypeClassifier.classify(new URL("http://www.openpr.de/news/508966/Hando-stellt-auf-der-Paracelsus-Messe-vom-11-13-02-2011-in-Wiesbaden-aus.html")));
        System.exit(0);
        HashMap hashMap2 = new HashMap();
        hashMap2.put("data/test/pagetype/overview/", ContentType.OVERVIEW);
        hashMap2.put("data/test/pagetype/spam/", ContentType.SPAM);
        hashMap2.put("data/test/pagetype/search/", ContentType.SEARCH_RESULTS);
        hashMap2.put("data/test/pagetype/content/", ContentType.CONTENT);
        for (Map.Entry entry : hashMap2.entrySet()) {
            for (File file : FileHelper.getFilesRecursive((String) entry.getKey())) {
                if (file.getAbsolutePath().indexOf(".svn") <= -1 && !file.isDirectory()) {
                    hashMap.put(file.getAbsolutePath(), entry.getValue());
                }
            }
        }
        int i = 0;
        int i2 = 0;
        int i3 = 0;
        int i4 = 0;
        for (Map.Entry entry2 : hashMap.entrySet()) {
            if (((String) entry2.getKey()).indexOf(".svn") <= -1) {
                if (contentTypeClassifier.isUseful(new File((String) entry2.getKey())) && ((ContentType) entry2.getValue()).equals(ContentType.CONTENT)) {
                    i2++;
                } else if (((ContentType) entry2.getValue()).equals(ContentType.CONTENT)) {
                    i3++;
                } else {
                    i4++;
                    i2++;
                }
                ContentType classify = contentTypeClassifier.classify(new File((String) entry2.getKey()));
                if (classify.equals(entry2.getValue())) {
                    i++;
                    LOGGER.info("CORRECT (as " + entry2.getValue() + "): " + ((String) entry2.getKey()));
                } else {
                    LOGGER.info("WRONG (as " + classify + ", should be " + entry2.getValue() + "): " + ((String) entry2.getKey()));
                }
            }
        }
        LOGGER.info("correctly classified: " + MathHelper.round((100 * i) / hashMap.size(), 2) + "%");
        LOGGER.info("correctly classified just useful: " + MathHelper.round((100 * i2) / hashMap.size(), 2) + "%");
        LOGGER.info("false negative rate: " + MathHelper.round((100 * i3) / hashMap.size(), 2) + "%");
        LOGGER.info("true negative rate: " + MathHelper.round((100 * i4) / hashMap.size(), 2) + "%");
        LOGGER.info("classification took " + stopWatch.getElapsedTimeString() + " on " + hashMap.size() + " documents");
    }
}
