package pl.edu.icm.coansys.coansys.io.blog.crawler.sitesCheckers;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import pl.edu.icm.coansys.coansys.io.blog.crawler.SiteDataProcessor;
import pl.edu.icm.coansys.models.DocumentProtos;
import pl.edu.icm.model.bwmeta.utils.IdGenerator;

/* loaded from: input_file:pl/edu/icm/coansys/coansys/io/blog/crawler/sitesCheckers/ScienceBlogsDotComBlogsProcessor.class */
public class ScienceBlogsDotComBlogsProcessor extends SiteDataProcessor {
    Map<String, Map<String, String>> mapBlogToMapPersonId;
    IdGenerator idGen;

    public ScienceBlogsDotComBlogsProcessor() {
        this.domain = "scienceblogs.com";
        this.mapBlogToMapPersonId = new ConcurrentHashMap();
        this.idGen = new IdGenerator();
    }

    @Override // pl.edu.icm.coansys.coansys.io.blog.crawler.SiteDataProcessor
    public boolean isSiteEntry(WebURL webURL) {
        if (!webURL.getURL().contains(this.domain)) {
            return false;
        }
        ArrayList arrayList = new ArrayList();
        for (String str : webURL.getPath().split("/")) {
            if (StringUtils.isNotBlank(str)) {
                arrayList.add(str);
            }
        }
        return arrayList.size() == 5 && !((String) arrayList.get(1)).equals("about");
    }

    String parseAuthorName(HtmlParseData htmlParseData, Document document) {
        Iterator it = document.getElementsByClass("byline").iterator();
        while (it.hasNext()) {
            Element element = (Element) it.next();
            if (element.classNames().contains("vcard") && element.classNames().contains("author")) {
                Iterator it2 = element.getElementsByClass("fn").iterator();
                while (it2.hasNext()) {
                    Iterator it3 = ((Element) it2.next()).getElementsByTag("a").iterator();
                    if (it3.hasNext()) {
                        return ((Element) it3.next()).text();
                    }
                }
            }
        }
        return null;
    }

    public String blogUrlForPage(Page page) {
        String str = null;
        for (String str2 : page.getWebURL().getPath().split("/")) {
            if (str == null && StringUtils.isNotBlank(str2)) {
                str = str2;
            }
        }
        if (str == null) {
            return null;
        }
        return page.getWebURL().getURL().split(page.getWebURL().getPath())[0] + "/" + str;
    }

    @Override // pl.edu.icm.coansys.coansys.io.blog.crawler.SiteDataProcessor
    public DocumentProtos.DocumentWrapper.Builder processPageFromSite(Page page) {
        if (!(page.getParseData() instanceof HtmlParseData)) {
            return null;
        }
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        Document parse = Jsoup.parse(htmlParseData.getHtml());
        DocumentProtos.DocumentWrapper.Builder newBuilder = DocumentProtos.DocumentWrapper.newBuilder();
        String blogEntryUrlToRowId = blogEntryUrlToRowId(page.getWebURL().getURL());
        newBuilder.setRowId(blogEntryUrlToRowId);
        addContent(newBuilder, page);
        DocumentProtos.DocumentMetadata.Builder documentMetadataBuilder = newBuilder.getDocumentMetadataBuilder();
        documentMetadataBuilder.setType(DocumentProtos.DocumentMetadata.Type.BLOG_ENTRY);
        documentMetadataBuilder.setKey(blogEntryUrlToRowId);
        if (htmlParseData.getMetaTags().containsKey("og:description")) {
            documentMetadataBuilder.addDocumentAbstractBuilder().setText((String) htmlParseData.getMetaTags().get("og:description"));
        }
        DocumentProtos.BasicMetadata.Builder basicMetadataBuilder = documentMetadataBuilder.getBasicMetadataBuilder();
        if (htmlParseData.getMetaTags().containsKey("og:title")) {
            basicMetadataBuilder.addTitleBuilder().setText((String) htmlParseData.getMetaTags().get("og:title"));
        } else if (StringUtils.isNotBlank(htmlParseData.getTitle())) {
            basicMetadataBuilder.addTitleBuilder().setText(htmlParseData.getTitle());
        }
        String str = null;
        if (htmlParseData.getMetaTags().containsKey("og:site_name")) {
            str = (String) htmlParseData.getMetaTags().get("og:site_name");
        } else {
            System.err.println("BlogEntry: " + page.getWebURL().getURL() + " has no page Entry");
        }
        String blogUrlForPage = blogUrlForPage(page);
        if (str != null && blogUrlForPage != null) {
            basicMetadataBuilder.setJournal(str);
            basicMetadataBuilder.setParentType(DocumentProtos.BasicMetadata.ParentType.BLOG);
            basicMetadataBuilder.addParentId(blogEntryUrlToRowId(blogUrlForPage));
            basicMetadataBuilder.addParentUrl(blogUrlForPage);
        }
        String parseAuthorName = parseAuthorName(htmlParseData, parse);
        if (parseAuthorName != null) {
            DocumentProtos.Author.Builder name = basicMetadataBuilder.addAuthorBuilder().setDocId(blogEntryUrlToRowId).setKey(blogEntryUrlToRowId + "#c-1").setName(parseAuthorName);
            if (blogUrlForPage != null) {
                if (!this.mapBlogToMapPersonId.containsKey(blogUrlForPage)) {
                    this.mapBlogToMapPersonId.put(blogUrlForPage, new ConcurrentHashMap());
                }
                Map<String, String> map = this.mapBlogToMapPersonId.get(blogUrlForPage);
                String trim = parseAuthorName.toLowerCase().trim();
                String str2 = map.get(trim);
                if (str2 == null) {
                    str2 = blogUrlForPage + "-" + this.idGen.generateIdSuffix(new String[]{trim});
                    map.put(trim, str2);
                }
                name.addExtIdBuilder().setKey("blogPersonId").setValue(str2);
            }
            if (parseAuthorName.contains(" ")) {
                String[] split = parseAuthorName.split(" ");
                name.setSurname(split[split.length - 1]);
                name.setForenames(split[0]);
            }
            int i = 1 + 1;
        }
        return newBuilder;
    }
}
