package pl.edu.icm.cermine.content.transformers;

import com.itextpdf.text.html.HtmlTags;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import pl.edu.icm.cermine.content.model.DocumentContentStructure;
import pl.edu.icm.cermine.content.model.DocumentHeader;
import pl.edu.icm.cermine.content.model.DocumentParagraph;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.tools.transformers.FormatToModelReader;

/* loaded from: input_file:WEB-INF/lib/cermine-impl-1.5-SNAPSHOT.jar:pl/edu/icm/cermine/content/transformers/HTMLToDocContentStructReader.class */
/**
 * Reads a flat, HTML-like XML document and rebuilds it as a nested
 * {@link DocumentContentStructure}.
 *
 * <p>Only the direct children of the document's root element are inspected, in
 * document order. An element named {@code p} becomes a paragraph; an element whose
 * name starts with {@code h} is treated as a section header, and the digits in its
 * name (for example {@code h2}) are used to decide where one section ends and the
 * next begins. Any other element is rejected with a {@link TransformationException}.
 *
 * <p>Tag names are matched case-insensitively.
 */
public class HTMLToDocContentStructReader implements FormatToModelReader<DocumentContentStructure> {

    /** SAX driver class handed to JDOM's {@link SAXBuilder}. */
    private static final String SAX_DRIVER = "org.apache.xerces.parsers.SAXParser";

    /** Name prefix that marks an element as a section header. */
    private static final String HEADER_PREFIX = "h";

    /** Name of a paragraph element. */
    private static final String PARAGRAPH_TAG = "p";

    /**
     * Parses the given markup and converts it into a content structure.
     *
     * @param str    the markup to parse
     * @param objArr unused
     * @return the root of the reconstructed structure
     * @throws TransformationException if the markup cannot be parsed or contains an
     *                                 unsupported top-level element
     */
    @Override
    public DocumentContentStructure read(String str, Object... objArr) throws TransformationException {
        Reader reader = new StringReader(str);
        return read(reader, objArr);
    }

    /**
     * Parses markup from the given reader and converts it into a content structure.
     * After the tree is built, {@code setParents()} is invoked on its root.
     *
     * @param reader source of the markup
     * @param objArr unused
     * @return the root of the reconstructed structure
     * @throws TransformationException if reading or parsing fails, or if the markup
     *                                 contains an unsupported top-level element
     */
    @Override
    public DocumentContentStructure read(Reader reader, Object... objArr) throws TransformationException {
        try {
            // JDOM 1.x declares getChildren() with a raw List; it holds Element objects.
            @SuppressWarnings("unchecked")
            List<Element> topLevel = getRoot(reader).getChildren();
            DocumentContentStructure structure = createDocContentStruct(topLevel, 0);
            structure.setParents();
            return structure;
        } catch (IOException e) {
            throw new TransformationException(e);
        } catch (JDOMException e) {
            throw new TransformationException(e);
        }
    }

    /**
     * Parses the reader's content and returns the document's root element.
     *
     * <p>External entities and external DTDs are switched off so that supplied markup
     * cannot make the parser read local files or open network connections (XXE).
     * As a consequence, entities that are only defined in an external DTD are not
     * expanded.
     */
    private Element getRoot(Reader reader) throws JDOMException, IOException {
        SAXBuilder builder = new SAXBuilder(SAX_DRIVER);
        builder.setFeature("http://xml.org/sax/features/external-general-entities", false);
        builder.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        return builder.build(reader).getRootElement();
    }

    /**
     * Recursively converts a run of sibling elements into one structure node.
     *
     * <p>The run is consumed in three phases: an optional leading header (only below
     * the top level), then consecutive paragraphs, then nested parts. The nested
     * parts are formed by cutting the remaining elements at every header whose level
     * equals the level of the first remaining header.
     *
     * @param list elements belonging to this node, in document order
     * @param i    nesting depth of this node; {@code 0} for the document root. It is
     *             also the level passed to the node's {@link DocumentHeader}.
     * @return the structure node built from {@code list}
     * @throws TransformationException if an element is neither a paragraph nor a
     *                                 numbered header
     */
    private DocumentContentStructure createDocContentStruct(List<Element> list, int i)
            throws TransformationException {
        DocumentContentStructure structure = new DocumentContentStructure();
        if (list.isEmpty()) {
            return structure;
        }

        int index = 0;
        Element element = list.get(0);

        // The root node (depth 0) has no header of its own; its first header opens a part.
        if (i > 0 && isHeader(element)) {
            structure.setHeader(new DocumentHeader(i, element.getValue(), structure));
            index++;
            element = getNext(index, list);
        }

        while (element != null && isParagraph(element)) {
            structure.addParagraph(new DocumentParagraph(element.getValue(), structure));
            index++;
            element = getNext(index, list);
        }
        if (element == null) {
            return structure;
        }

        // Every part must begin with a header: the recursive call consumes it, which
        // guarantees progress. Anything else here would otherwise recurse forever or
        // fail with an unchecked NumberFormatException.
        if (!isHeader(element)) {
            throw unsupportedElement(element, null);
        }

        int partLevel = getHeaderLevel(element);
        List<Element> part = new ArrayList<Element>();
        while (element != null) {
            boolean startsNewPart = isHeader(element) && partLevel == getHeaderLevel(element);
            if (startsNewPart && !part.isEmpty()) {
                structure.addPart(createDocContentStruct(part, i + 1));
                // Start a fresh list rather than clearing, so the finished part is never aliased.
                part = new ArrayList<Element>();
            }
            part.add(element);
            index++;
            element = getNext(index, list);
        }
        if (!part.isEmpty()) {
            structure.addPart(createDocContentStruct(part, i + 1));
        }
        return structure;
    }

    /** Returns whether the element's name starts with the header prefix, ignoring case. */
    private boolean isHeader(Element element) {
        // regionMatches(ignoreCase = true, ...) is independent of the default locale.
        return element.getName().regionMatches(true, 0, HEADER_PREFIX, 0, HEADER_PREFIX.length());
    }

    /** Returns whether the element is a paragraph, ignoring case like {@link #isHeader}. */
    private boolean isParagraph(Element element) {
        return PARAGRAPH_TAG.equalsIgnoreCase(element.getName());
    }

    /**
     * Extracts the numeric level from a header element's name by keeping only its digits.
     *
     * @throws TransformationException if the name contains no parsable number
     */
    private int getHeaderLevel(Element element) throws TransformationException {
        String digits = element.getName().replaceAll("[^0-9]+", "");
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            throw unsupportedElement(element, e);
        }
    }

    /** Builds the exception reported for an element this reader cannot place in the structure. */
    private TransformationException unsupportedElement(Element element, NumberFormatException cause) {
        String message = "Unsupported element <" + element.getName()
                + ">: expected <p> or a numbered header such as <h1>";
        return new TransformationException(new IllegalArgumentException(message, cause));
    }

    /** Returns the element at position {@code i}, or {@code null} once the list is exhausted. */
    private Element getNext(int i, List<Element> list) {
        return i < list.size() ? list.get(i) : null;
    }

    /**
     * Not implemented.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public List<DocumentContentStructure> readAll(String str, Object... objArr) throws TransformationException {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    /**
     * Not implemented.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public List<DocumentContentStructure> readAll(Reader reader, Object... objArr) throws TransformationException {
        throw new UnsupportedOperationException("Not supported yet.");
    }
}
