package pl.edu.icm.coansys.coansys.io.blog.crawler;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLClassLoader;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.edu.icm.coansys.coansys.io.blog.crawler.outputters.HBaseOutputter;
import pl.edu.icm.coansys.coansys.io.blog.crawler.sitesCheckers.SciamBlogsProcessor;
import pl.edu.icm.coansys.coansys.io.blog.crawler.sitesCheckers.ScienceBlogsDotComBlogsProcessor;
import pl.edu.icm.coansys.transformers.impl.HBaseClientThriftWriteFailureResistant;

/* loaded from: input_file:pl/edu/icm/coansys/coansys/io/blog/crawler/BlogDownloader.class */
/**
 * Command-line entry point that crawls the configured blog sites and hands the
 * crawl configuration an {@link HBaseOutputter} bound to an HBase Thrift client.
 *
 * <p>Options: {@code --host} (required), {@code --table} (required),
 * {@code --port} (optional, defaults to {@value #DEFAULT_PORT}) and {@code --help}.
 */
public class BlogDownloader {
    static SiteDataProcessor[] processors = {new SciamBlogsProcessor(), new ScienceBlogsDotComBlogsProcessor()};
    private static final Logger log = LoggerFactory.getLogger(BlogDownloader.class);
    protected static final String O_HOST = "host";
    protected static final String O_PORT = "port";
    protected static final String O_TABLE = "table";
    protected static final String O_HELP = "help";

    /** Port used when {@code --port} is not given on the command line. */
    private static final String DEFAULT_PORT = "9090";
    /** Folder passed to {@link CrawlConfig#setCrawlStorageFolder(String)}. */
    private static final String CRAWL_STORAGE_FOLDER = "/tmp/crawl";
    /** Second argument passed to {@code CrawlController.start}. */
    private static final int NUMBER_OF_CRAWLERS = 2;

    /**
     * Builds the command-line option set understood by {@link #main(String[])}.
     *
     * @return options with required {@code host} and {@code table}, optional
     *         {@code port}, and the {@code help} flag
     */
    protected static Options defineOptions() {
        Options options = new Options();

        Option host = new Option("h", O_HOST, true, "host with hbase");
        host.setRequired(true);
        options.addOption(host);

        Option table = new Option("t", O_TABLE, true, "table ");
        table.setRequired(true);
        options.addOption(table);

        Option port = new Option("po", O_PORT, true, "port of hbase thrift");
        port.setRequired(false);
        options.addOption(port);

        options.addOption(new Option("he", O_HELP, false, "this help message"));
        return options;
    }

    /**
     * Prints the usage line and the option descriptions to standard output.
     *
     * @param options the options to describe
     */
    protected static void usage(Options options) {
        HelpFormatter helpFormatter = new HelpFormatter();
        PrintWriter printWriter = new PrintWriter(System.out);
        helpFormatter.printUsage(printWriter, 80, BlogDownloader.class.getSimpleName(), options);
        helpFormatter.printOptions(printWriter, 80, options, 1, 2);
        // Flush but do not close: closing the writer would close System.out.
        printWriter.flush();
    }

    /**
     * Parses the command line, then runs the crawl. Command-line problems print
     * the usage text; any other failure is logged.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        printClasspath();
        Options options = defineOptions();
        try {
            CommandLine commandLine = new GnuParser().parse(options, args);
            if (commandLine.hasOption(O_HELP)) {
                usage(options);
                return;
            }
            String host = commandLine.getOptionValue(O_HOST);
            String table = commandLine.getOptionValue(O_TABLE);
            int port = parsePort(commandLine.getOptionValue(O_PORT, DEFAULT_PORT));
            crawl(host, port, table);
        } catch (ParseException e) {
            // Must precede the generic handler: ParseException is an Exception,
            // so the reverse order makes this clause unreachable.
            String reason = e.getMessage();
            if (reason != null && !reason.isEmpty()) {
                log.error("Invalid command line: {}", reason);
            }
            usage(options);
        } catch (Exception e) {
            log.error(e.getMessage(), e);
        }
    }

    /**
     * Prints the classpath entries to standard output. Uses the system class
     * loader's URLs when it is a {@link URLClassLoader}; otherwise falls back to
     * the {@code java.class.path} system property, because the system class
     * loader is not guaranteed to be a {@code URLClassLoader}.
     */
    private static void printClasspath() {
        ClassLoader systemLoader = ClassLoader.getSystemClassLoader();
        if (systemLoader instanceof URLClassLoader) {
            for (URL url : ((URLClassLoader) systemLoader).getURLs()) {
                System.out.println(url.getFile());
            }
            return;
        }
        String classPath = System.getProperty("java.class.path", "");
        String separator = System.getProperty("path.separator", ":");
        for (String entry : classPath.split(java.util.regex.Pattern.quote(separator))) {
            if (!entry.isEmpty()) {
                System.out.println(entry);
            }
        }
    }

    /**
     * Converts the textual port option to an integer.
     *
     * @param value the raw option value
     * @return the parsed port number
     * @throws ParseException if {@code value} is not a valid integer
     */
    private static int parsePort(String value) throws ParseException {
        try {
            return Integer.parseInt(value.trim());
        } catch (NumberFormatException e) {
            throw new ParseException("port must be an integer, got '" + value + "'");
        }
    }

    /**
     * Opens the HBase Thrift connection, configures the crawl controller, seeds
     * it with one URL per processor and starts the crawl. The connection is
     * closed afterwards whether or not the crawl succeeded.
     *
     * @param host  HBase Thrift host
     * @param port  HBase Thrift port
     * @param table table name handed to the {@link HBaseOutputter}
     * @throws Exception if opening the connection or setting up/running the crawl fails
     */
    private static void crawl(String host, int port, String table) throws Exception {
        HBaseClientThriftWriteFailureResistant client = new HBaseClientThriftWriteFailureResistant(host, port);
        client.openConnection();
        try {
            CrawlConfig crawlConfig = new CrawlConfig();
            crawlConfig.setCrawlStorageFolder(CRAWL_STORAGE_FOLDER);
            crawlConfig.setResumableCrawling(true);

            PageFetcher pageFetcher = new PageFetcher(crawlConfig);
            RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
            CrawlController crawlController = new CrawlController(crawlConfig, pageFetcher, robotstxtServer);

            BlogCrawlConfig blogCrawlConfig = new BlogCrawlConfig();
            blogCrawlConfig.processors = processors;
            blogCrawlConfig.outputter = new HBaseOutputter(client, table);
            crawlController.setCustomData(blogCrawlConfig);

            for (SiteDataProcessor processor : processors) {
                crawlController.addSeed("http://" + processor.domain);
            }
            crawlController.start(BlogCrawler.class, NUMBER_OF_CRAWLERS);
        } finally {
            try {
                client.closeConnection();
            } catch (Exception closeFailure) {
                // Log instead of rethrowing so a close failure cannot mask the
                // exception (if any) that ended the crawl.
                log.warn("Failed to close HBase connection", closeFailure);
            }
        }
    }
}
