package it.unimi.dsi.law.warc.parser;

import com.google.common.base.Charsets;
import it.unimi.dsi.fastutil.io.MeasurableInputStream;
import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.law.bubing.util.BURL;
import it.unimi.dsi.law.warc.io.InspectableBufferedInputStream;
import it.unimi.dsi.law.warc.parser.Parser;
import it.unimi.dsi.law.warc.util.ByteArrayCharSequence;
import it.unimi.dsi.law.warc.util.HttpResponse;
import it.unimi.dsi.law.warc.util.Response;
import it.unimi.dsi.law.warc.util.Util;
import it.unimi.dsi.util.TextPattern;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.EndTag;
import net.htmlparser.jericho.EndTagType;
import net.htmlparser.jericho.HTMLElements;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import net.htmlparser.jericho.StreamedSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:it/unimi/dsi/law/warc/parser/HTMLParser.class */
public class HTMLParser implements Parser {
    /** Number of chars in {@link #buffer}; the constructor allocates the buffer with this same size. */
    public static final int CHAR_BUFFER_SIZE = 65536;
    /** Character buffer handed to the Jericho {@code StreamedSource} on every call to {@code parse}. */
    public final char[] buffer;
    /** Charset name chosen by the most recent call to {@code parse}; exposed by {@link #guessedCharset()}. */
    private String guessedCharset;
    /** Digest passed at construction time, or {@code null} if no digest is computed; also used by {@link #clone()}. */
    private MessageDigest messageDigest;
    /** Sink feeding page content into {@link #messageDigest}; {@code null} exactly when no digest was supplied. */
    private final DigestAppendable digestAppendable;
    /** URI taken from the HTTP {@code Location} header during the last parse, if any. */
    private URI location;
    /** URI taken from a {@code <meta http-equiv="location">} element during the last parse, if any. */
    private URI metaLocation;
    private static final Logger LOGGER = LoggerFactory.getLogger(HTMLParser.class);
    /** Locates the {@code URL=} part of a META refresh content value (flag 1: presumably case-insensitive search; confirm against TextPattern). */
    private static final TextPattern URLEQUAL_PATTERN = new TextPattern("URL=", 1);
    /** Locates the start of a {@code <meta} element in raw bytes (flag 1: presumably case-insensitive search; confirm against TextPattern). */
    private static final TextPattern META_PATTERN = new TextPattern("<meta", 1);
    /** Matches META element text declaring {@code http-equiv=content-type}; flag 2 is {@link Pattern#CASE_INSENSITIVE}. */
    private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile(".*http-equiv\\s*=\\s*('|\")?content-type('|\")?.*", 2);
    /** Matches META element text with a quoted {@code content} attribute; group 2 is the attribute value. */
    private static final Pattern CONTENT_PATTERN = Pattern.compile(".*content\\s*=\\s*('|\")([^'\"]*)('|\").*", 2);
    /** Matches a {@code charset=...} parameter; group 1 is the (possibly quoted) charset name. */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(".*charset\\s*=\\s*(([\\041-\\0176&&[^<>\\{\\}\\\\/:,;@?=]])+|\"[^\"]*\").*", 2);

    /* loaded from: input_file:it/unimi/dsi/law/warc/parser/HTMLParser$DigestAppendable.class */
    /**
     * An {@link Appendable} that forwards everything appended to it to a {@link MessageDigest}.
     *
     * <p>Characters are written as two bytes each (high byte first); tags are written as
     * precomputed ASCII byte sequences. Bytes are staged in a small buffer and pushed to the
     * digest whenever the buffer fills up or {@link #digest()} is called.
     */
    private static final class DigestAppendable implements Appendable {
        /** Size in bytes of the staging buffer placed in front of the digest. */
        private static final int BYTE_BUFFER_SIZE = 1024;
        /** ASCII bytes of {@code <name>} for each known element name; defaults to {@code <unknown>}. */
        private static final Reference2ObjectOpenHashMap<String, byte[]> startTags;
        /** ASCII bytes of {@code </name>} for each known element name; defaults to {@code </unknown>}. */
        private static final Reference2ObjectOpenHashMap<String, byte[]> endTags;
        private final byte[] byteBuffer = new byte[BYTE_BUFFER_SIZE];
        private final MessageDigest digester;
        /** Number of valid bytes currently staged in {@link #byteBuffer}. */
        private int fill;

        /**
         * Creates a sink that updates the given digest.
         *
         * @param messageDigest the digest that will receive all appended content.
         */
        public DigestAppendable(MessageDigest messageDigest) {
            this.digester = messageDigest;
        }

        /**
         * Resets the digest and the staging buffer, then seeds the digest with the authority of
         * the given URI followed by a NUL character.
         *
         * @param uri the URI of the page about to be digested; if {@code null} nothing is seeded.
         */
        public void init(URI uri) {
            this.digester.reset();
            this.fill = 0;
            if (uri != null) {
                // URIs without an authority component return null here; seed only the separator.
                final String authority = uri.getAuthority();
                if (authority != null) {
                    append(authority);
                }
                append((char) 0);
            }
        }

        /**
         * Appends the characters of {@code charSequence} in the range {@code [i, i2)}.
         * As required by {@link Appendable}, a {@code null} sequence is treated as the
         * four characters {@code "null"}.
         */
        @Override // java.lang.Appendable
        public Appendable append(CharSequence charSequence, int i, int i2) {
            final CharSequence source = charSequence == null ? "null" : charSequence;
            for (int index = i; index < i2; index++) {
                append(source.charAt(index));
            }
            return this;
        }

        /** Appends a single character as two bytes, high byte first. */
        @Override // java.lang.Appendable
        public Appendable append(char c) {
            // Two bytes are about to be written: flush unless both of them still fit.
            if (this.fill >= BYTE_BUFFER_SIZE - 1) {
                flush();
            }
            this.byteBuffer[this.fill++] = (byte) (c >> 8);
            this.byteBuffer[this.fill++] = (byte) c;
            return this;
        }

        /**
         * Appends a whole character sequence; a {@code null} sequence is treated as
         * {@code "null"}, as required by {@link Appendable}.
         */
        @Override // java.lang.Appendable
        public Appendable append(CharSequence charSequence) {
            final CharSequence source = charSequence == null ? "null" : charSequence;
            return append(source, 0, source.length());
        }

        /**
         * Pushes any staged bytes to the digest and completes the digest computation.
         *
         * @return the digest of everything appended since the last {@link #init(URI)}.
         */
        public byte[] digest() {
            flush();
            return this.digester.digest();
        }

        /** Sends the staged bytes to the digest and empties the staging buffer. */
        private void flush() {
            this.digester.update(this.byteBuffer, 0, this.fill);
            this.fill = 0;
        }

        /** Stages raw bytes one at a time, flushing whenever the buffer is completely full. */
        private void update(byte[] bArr) {
            for (final byte b : bArr) {
                if (this.fill == BYTE_BUFFER_SIZE) {
                    flush();
                }
                this.byteBuffer[this.fill++] = b;
            }
        }

        /**
         * Digests a start tag. For {@code iframe} and {@code frame} elements that carry a
         * {@code src} attribute, the attribute value is digested too, wrapped in double quotes.
         *
         * @param startTag the start tag to digest.
         */
        public void startTag(StartTag startTag) {
            final String name = startTag.getName();
            // The map compares keys by reference; names it does not hold yield the "<unknown>" default.
            update(startTags.get(name));
            if ("iframe".equals(name) || "frame".equals(name)) {
                final String source = startTag.getAttributeValue("src");
                if (source != null) {
                    append('"');
                    append(source);
                    append('"');
                }
            }
        }

        /**
         * Digests an end tag.
         *
         * @param endTag the end tag to digest.
         */
        public void endTag(EndTag endTag) {
            update(endTags.get(endTag.getName()));
        }

        static {
            // Precompute the byte form of every element name known to the HTML parser library.
            final List<String> elementNames = HTMLElements.getElementNames();
            startTags = new Reference2ObjectOpenHashMap<>(elementNames.size());
            endTags = new Reference2ObjectOpenHashMap<>(elementNames.size());
            startTags.defaultReturnValue(Util.getASCIIBytes("<unknown>"));
            endTags.defaultReturnValue(Util.getASCIIBytes("</unknown>"));
            for (final String elementName : elementNames) {
                startTags.put(elementName, Util.getASCIIBytes("<" + elementName + ">"));
                endTags.put(elementName, Util.getASCIIBytes("</" + elementName + ">"));
            }
        }
    }

    /* loaded from: input_file:it/unimi/dsi/law/warc/parser/HTMLParser$SetLinkReceiver.class */
    /**
     * A link receiver that simply gathers every URI it is notified of into a linked hash set,
     * regardless of how the URI was discovered. Iterating over the receiver iterates over
     * that set.
     */
    public static final class SetLinkReceiver implements Parser.LinkReceiver {
        /** All URIs reported since the last call to {@link #init(URI)}. */
        private final Set<URI> urls = new ObjectLinkedOpenHashSet<>();

        /** Stores one discovered URI; shared by all notification callbacks. */
        private void collect(final URI found) {
            urls.add(found);
        }

        /** Forgets every URI gathered so far; the argument is not used. */
        @Override // it.unimi.dsi.law.warc.parser.Parser.LinkReceiver
        public void init(URI uri) {
            urls.clear();
        }

        /** Records a URI reported as a header location. */
        @Override // it.unimi.dsi.law.warc.parser.Parser.LinkReceiver
        public void location(URI uri) {
            collect(uri);
        }

        /** Records a URI reported as a META location. */
        @Override // it.unimi.dsi.law.warc.parser.Parser.LinkReceiver
        public void metaLocation(URI uri) {
            collect(uri);
        }

        /** Records a URI reported as a META refresh target. */
        @Override // it.unimi.dsi.law.warc.parser.Parser.LinkReceiver
        public void metaRefresh(URI uri) {
            collect(uri);
        }

        /** Records a URI reported as an ordinary link. */
        @Override // it.unimi.dsi.law.warc.parser.Parser.LinkReceiver
        public void link(URI uri) {
            collect(uri);
        }

        /** Returns an iterator over the URIs gathered since the last {@link #init(URI)}. */
        @Override // java.lang.Iterable
        public Iterator<URI> iterator() {
            return urls.iterator();
        }
    }

    /**
     * Creates a parser that digests page content with the given message digest.
     *
     * @param messageDigest the digest used to fingerprint pages, or {@code null} to disable
     *        digesting (in which case no digest sink is created).
     */
    public HTMLParser(MessageDigest messageDigest) {
        this.messageDigest = messageDigest;
        this.buffer = new char[CHAR_BUFFER_SIZE];
        if (messageDigest != null) {
            this.digestAppendable = new DigestAppendable(messageDigest);
        } else {
            this.digestAppendable = null;
        }
    }

    /**
     * Creates a parser that digests page content with the named message-digest algorithm.
     *
     * @param str the algorithm name, passed to {@link MessageDigest#getInstance(String)}.
     * @throws NoSuchAlgorithmException if {@link MessageDigest#getInstance(String)} cannot
     *         provide the requested algorithm.
     */
    public HTMLParser(String str) throws NoSuchAlgorithmException {
        this(MessageDigest.getInstance(str));
    }

    /**
     * Creates a parser that computes no digest. The cast selects the
     * {@link MessageDigest} constructor, since a bare {@code null} would be ambiguous
     * with the {@link String} one.
     */
    public HTMLParser() {
        this((MessageDigest) null);
    }

    /**
     * Parses an attribute value as a URL, resolves it against the given base URI and reports
     * the result to the receiver as a link. Nothing is reported when the value is
     * {@code null} or cannot be parsed.
     *
     * @param linkReceiver the receiver to notify.
     * @param uri the base URI used for resolution.
     * @param str the raw attribute value, possibly {@code null}.
     */
    private void process(Parser.LinkReceiver linkReceiver, URI uri, String str) {
        if (str == null) {
            return;
        }
        final URI target = BURL.parse(str);
        if (target == null) {
            return;
        }
        final URI resolved = uri.resolve(target);
        linkReceiver.link(resolved);
    }

    /**
     * Parses the content of an HTTP response as HTML.
     *
     * <p>The charset is guessed first from the {@code Content-Type} header and then, when the
     * content stream is inspectable, from a META element in the buffered bytes (the latter
     * wins); ISO-8859-1 is used when nothing is found or the guessed name is not usable.
     * The {@code Location} header, if present, is recorded and reported. The content is then
     * scanned segment by segment: start tags yield links for the receiver, and tags plus the
     * text outside {@code style}/{@code script} elements are fed to the digest.
     *
     * @param response the response to parse; it is cast to {@link HttpResponse}.
     * @param linkReceiver the receiver notified of the URLs found, or {@code null} if links
     *        are not of interest.
     * @return the digest of the page, or {@code null} if this parser was built without a digest.
     * @throws IOException if reading the content fails.
     */
    @Override // it.unimi.dsi.law.warc.parser.Parser
    public byte[] parse(Response response, Parser.LinkReceiver linkReceiver) throws IOException {
        final URI uri = response.uri();
        final HttpResponse httpResponse = (HttpResponse) response;

        // Charset guessing: default, then HTTP header, then META element in the buffered bytes.
        this.guessedCharset = "ISO-8859-1";
        final String contentType = httpResponse.headers().get("Content-Type");
        if (contentType != null) {
            final String headerCharset = getCharsetNameFromHeader(contentType);
            if (headerCharset != null) {
                this.guessedCharset = headerCharset;
            }
        }
        final MeasurableInputStream contentStream = httpResponse.contentAsStream();
        if (contentStream instanceof InspectableBufferedInputStream) {
            final InspectableBufferedInputStream inspectableStream = (InspectableBufferedInputStream) contentStream;
            final String metaCharset = getCharsetName(inspectableStream.buffer, inspectableStream.inspectable);
            if (metaCharset != null) {
                this.guessedCharset = metaCharset;
            }
        }
        LOGGER.debug("Guessing charset {} for URL {}", this.guessedCharset, uri);
        final Charset charset = resolveCharset(this.guessedCharset, uri);

        // The receiver is optional everywhere, not just inside the scanning loop.
        if (linkReceiver != null) {
            linkReceiver.init(uri);
        }
        this.location = null;
        this.metaLocation = null;
        final String locationHeader = httpResponse.headers().get("Location");
        if (locationHeader != null) {
            final URI headerLocation = BURL.parse(locationHeader);
            if (headerLocation != null) {
                if (!headerLocation.isAbsolute()) {
                    LOGGER.warn("Found relative header location URL: \"{}\"", headerLocation);
                }
                this.location = uri.resolve(headerLocation);
                if (linkReceiver != null) {
                    linkReceiver.location(this.location);
                }
            }
        }

        final StreamedSource streamedSource = new StreamedSource(new InputStreamReader(contentStream, charset));
        streamedSource.setBuffer(this.buffer);
        if (this.digestAppendable != null) {
            this.digestAppendable.init(uri);
        }

        URI base = uri;            // Replaced when an absolute <base href> is met.
        int lastSegmentEnd = 0;    // End of the last segment handled.
        int specialTextDepth = 0;  // Number of currently open style/script elements.
        for (final Segment segment : streamedSource) {
            // Skip segments that end inside a region that has already been handled.
            if (segment.getEnd() <= lastSegmentEnd) {
                continue;
            }
            lastSegmentEnd = segment.getEnd();

            if (segment instanceof StartTag) {
                final StartTag startTag = (StartTag) segment;
                if (startTag.getTagType() != StartTagType.NORMAL) {
                    continue;
                }
                final String name = startTag.getName();
                if (isStyleOrScript(name)) {
                    specialTextDepth++;
                }
                if (this.digestAppendable != null) {
                    this.digestAppendable.startTag(startTag);
                }
                if (linkReceiver != null) {
                    base = extractLinks(startTag, name, base, linkReceiver);
                }
            } else if (segment instanceof EndTag) {
                final EndTag endTag = (EndTag) segment;
                if (isStyleOrScript(endTag.getName())) {
                    // Ignore unmatched closing tags: a negative depth would silently
                    // exclude all the following text from the digest.
                    specialTextDepth = Math.max(0, specialTextDepth - 1);
                }
                if (this.digestAppendable != null && endTag.getTagType() == EndTagType.NORMAL) {
                    this.digestAppendable.endTag(endTag);
                }
            } else if (this.digestAppendable != null && specialTextDepth == 0) {
                if (segment instanceof CharacterReference) {
                    ((CharacterReference) segment).appendCharTo(this.digestAppendable);
                } else {
                    this.digestAppendable.append(segment);
                }
            }
        }

        return this.digestAppendable != null ? this.digestAppendable.digest() : null;
    }

    /** Returns the charset with the given name, or ISO-8859-1 (after a warning) if the name is illegal or unsupported. */
    private static Charset resolveCharset(String charsetName, URI uri) {
        try {
            return Charset.forName(charsetName);
        } catch (IllegalCharsetNameException e) {
            LOGGER.warn("Response for {} contained an illegal charset name: {}", uri, charsetName);
        } catch (UnsupportedCharsetException e) {
            LOGGER.warn("Response for {} contained an unsupported charset: {}", uri, charsetName);
        }
        return Charsets.ISO_8859_1;
    }

    /** Tells whether the given element name is {@code style} or {@code script}, whose text content is not digested. */
    private static boolean isStyleOrScript(String name) {
        return "style".equals(name) || "script".equals(name);
    }

    /** Reports the links carried by a normal start tag and returns the base URI to use from now on. */
    private URI extractLinks(StartTag startTag, String name, URI base, Parser.LinkReceiver linkReceiver) {
        if (name == null) {
            return base;
        }
        switch (name) {
            case "iframe":
            case "frame":
            case "embed":
            case "img":
            case "script":
                process(linkReceiver, base, startTag.getAttributeValue("src"));
                return base;
            case "object":
                process(linkReceiver, base, startTag.getAttributeValue("data"));
                return base;
            case "a":
            case "area":
            case "link":
                process(linkReceiver, base, startTag.getAttributeValue("href"));
                return base;
            case "base":
                return rebase(startTag, base);
            case "meta":
                processMeta(startTag, base, linkReceiver);
                return base;
            default:
                return base;
        }
    }

    /** Returns the absolute URL of a {@code base} element, or the current base if the element has no usable absolute URL. */
    private static URI rebase(StartTag startTag, URI base) {
        final String href = startTag.getAttributeValue("href");
        if (href == null) {
            return base;
        }
        final URI candidate = BURL.parse(href);
        if (candidate == null) {
            return base;
        }
        if (candidate.isAbsolute()) {
            return candidate;
        }
        LOGGER.warn("Found relative BASE URL: \"{}\"", candidate);
        return base;
    }

    /** Handles {@code <meta http-equiv="refresh">} and {@code <meta http-equiv="location">}, matching the http-equiv value case-insensitively. */
    private void processMeta(StartTag startTag, URI base, Parser.LinkReceiver linkReceiver) {
        final String httpEquiv = startTag.getAttributeValue("http-equiv");
        final String content = startTag.getAttributeValue("content");
        if (httpEquiv == null || content == null) {
            return;
        }
        // Locale-independent, case-insensitive comparison (the lower-cased value used to be discarded).
        if (httpEquiv.equalsIgnoreCase("refresh")) {
            final int urlEqualPosition = URLEQUAL_PATTERN.search(content);
            if (urlEqualPosition != -1) {
                final String urlText = content.substring(urlEqualPosition + URLEQUAL_PATTERN.length());
                final URI refreshTarget = BURL.parse(urlText);
                if (refreshTarget != null) {
                    if (!refreshTarget.isAbsolute()) {
                        LOGGER.warn("Found relative META refresh URL: \"{}\"", urlText);
                    }
                    linkReceiver.metaRefresh(base.resolve(refreshTarget));
                }
            }
        }
        if (httpEquiv.equalsIgnoreCase("location")) {
            final URI locationTarget = BURL.parse(content);
            if (locationTarget != null) {
                if (!locationTarget.isAbsolute()) {
                    LOGGER.warn("Found relative META location URL: \"{}\"", content);
                }
                this.metaLocation = base.resolve(locationTarget);
                linkReceiver.metaLocation(this.metaLocation);
            }
        }
    }

    /**
     * Returns the charset name selected by the most recent call to {@code parse}.
     *
     * @return the guessed charset name; {@code null} if the field has never been assigned.
     */
    @Override // it.unimi.dsi.law.warc.parser.Parser
    public String guessedCharset() {
        final String lastGuess = guessedCharset;
        return lastGuess;
    }

    /**
     * Returns the redirect location recorded by the last parse: the header location when
     * there is one, otherwise the META location.
     *
     * @return the recorded location, or {@code null} if neither was found.
     */
    public URI location() {
        // When the header location is absent the META location (possibly null) is the answer.
        return location != null ? location : metaLocation;
    }

    /**
     * Looks for a charset declaration in the META elements found in the first {@code i}
     * bytes of {@code bArr}.
     *
     * <p>Each {@code <meta ...>} element is examined in order; the first one that declares
     * {@code http-equiv=content-type} and has a quoted {@code content} attribute decides the
     * result, even if that content carries no charset.
     *
     * @param bArr the bytes to inspect.
     * @param i the number of valid bytes in {@code bArr}.
     * @return the charset name, or {@code null} if none is found or a META element is not
     *         closed within the inspected bytes.
     */
    public static String getCharsetName(byte[] bArr, int i) {
        int from = 0;
        int metaStart;
        while ((metaStart = META_PATTERN.search(bArr, from, i)) != -1) {
            // Locate the '>' closing this element.
            int metaEnd = metaStart;
            while (metaEnd < i && bArr[metaEnd] != '>') {
                metaEnd++;
            }
            if (metaEnd == i) {
                return null;
            }

            // The element text, excluding the leading "<meta" and the closing '>'.
            final int bodyStart = metaStart + META_PATTERN.length();
            final int bodyLength = metaEnd - metaStart - META_PATTERN.length();
            final ByteArrayCharSequence elementText = new ByteArrayCharSequence(bArr, bodyStart, bodyLength);

            if (HTTP_EQUIV_PATTERN.matcher(elementText).matches()) {
                final Matcher contentMatcher = CONTENT_PATTERN.matcher(elementText);
                if (contentMatcher.matches()) {
                    return getCharsetNameFromHeader(contentMatcher.group(2));
                }
            }
            from = metaEnd + 1;
        }
        return null;
    }

    /**
     * Extracts the charset name from a header-style value such as
     * {@code text/html; charset=utf-8}.
     *
     * <p>A leading and a trailing single or double quote, if present, are removed
     * independently of each other.
     *
     * @param str the value to examine.
     * @return the charset name, or {@code null} if the value carries no charset parameter or
     *         the name is empty once the quotes are removed.
     */
    public static String getCharsetNameFromHeader(String str) {
        final Matcher charsetMatcher = CHARSET_PATTERN.matcher(str);
        if (charsetMatcher.matches()) {
            final String raw = charsetMatcher.group(1);
            int begin = 0;
            int end = raw.length();
            if (end > 0) {
                final char first = raw.charAt(0);
                final char last = raw.charAt(end - 1);
                if (first == '"' || first == '\'') {
                    begin = 1;
                }
                if (last == '"' || last == '\'') {
                    end--;
                }
            }
            // A lone quote leaves begin past end: nothing is left to return.
            return begin < end ? raw.substring(begin, end) : null;
        }
        return null;
    }

    /**
     * Tells whether this parser should be used for the given response: the response must be
     * an {@link HttpResponse} whose {@code Content-Type} header starts with {@code text/}.
     *
     * @param response the response to test.
     * @return {@code true} if the response is an HTTP response with a textual content type.
     */
    public boolean apply(Response response) {
        if (!(response instanceof HttpResponse)) {
            return false;
        }
        final String contentType = ((HttpResponse) response).headers().get("Content-Type");
        if (contentType == null) {
            return false;
        }
        return contentType.startsWith("text/");
    }

    /**
     * Returns a new parser configured like this one.
     *
     * <p>The copy gets its own {@link MessageDigest} so that the two parsers do not share
     * mutable digest state: the digest is cloned when the implementation allows it, and
     * otherwise a fresh instance of the same algorithm is requested. Only if both attempts
     * fail is the original digest instance shared, with a warning.
     *
     * @return an independent parser using the same digest algorithm, if any.
     */
    @Override
    public Object clone() {
        if (this.messageDigest == null) {
            return new HTMLParser((MessageDigest) null);
        }
        return new HTMLParser(independentDigest(this.messageDigest));
    }

    /** Returns a reset digest equivalent to the given one but, whenever possible, not sharing state with it. */
    private static MessageDigest independentDigest(MessageDigest original) {
        try {
            final MessageDigest copy = (MessageDigest) original.clone();
            copy.reset();
            return copy;
        } catch (CloneNotSupportedException notCloneable) {
            try {
                return MessageDigest.getInstance(original.getAlgorithm());
            } catch (NoSuchAlgorithmException unavailable) {
                LOGGER.warn("Cannot create an independent {} digest; sharing the original instance", original.getAlgorithm());
                return original;
            }
        }
    }
}
