package it.unimi.di.big.mg4j.document;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory;
import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
import it.unimi.di.big.mg4j.util.MG4JClassParser;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectArrays;
import it.unimi.dsi.fastutil.objects.ObjectBigArrayBigList;
import it.unimi.dsi.fastutil.objects.ObjectBigListIterator;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.SegmentedInputStream;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Iterator;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;

/* loaded from: input_file:it/unimi/di/big/mg4j/document/TRECDocumentCollection.class */
/**
 * A document collection backed by a list of (optionally gzipped) files whose content is
 * delimited by {@code <DOC>}, {@code </DOCHDR>} and {@code </DOC>} marker lines.
 *
 * <p>At construction time every file is scanned once and, for each document found, a
 * {@link TRECDocumentDescriptor} recording the file index and the byte offsets of the
 * markers is stored. Documents are later served by re-opening the relevant file and
 * wrapping it in a {@link SegmentedInputStream} built from those offsets.
 *
 * <p>The descriptor list is {@code transient} and is (de)serialised by hand in
 * {@link #writeObject(ObjectOutputStream)} / {@link #readObject(ObjectInputStream)}.
 */
public class TRECDocumentCollection extends AbstractDocumentCollection implements Serializable {
    private static final Logger LOGGER = Logger.getLogger(TRECDocumentCollection.class);
    private static final long serialVersionUID = -4251461013312968454L;
    private static final boolean DEBUG = false;

    /** Default value of the {@code bufferSize} command-line option of {@link #main(String[])}. */
    public static final String DEFAULT_BUFFER_SIZE = "64Ki";

    /** The names of the files making up this collection. */
    private String[] file;

    /** Whether the files must be read through a {@link GZIPInputStream}. */
    private final boolean useGzip;

    /** The factory used to turn raw streams into documents. */
    protected DocumentFactory factory;

    /** One descriptor per document, in collection order; serialised manually. */
    protected transient ObjectBigArrayBigList<TRECDocumentDescriptor> descriptors;

    /** The size of the I/O buffer used while scanning files. */
    private final int bufferSize;

    /**
     * The stream returned by the last call to {@link #stream(long)}, if any.
     * NOTE(review): this field is not transient; serialising an instance while it is
     * non-null presumably fails unless SegmentedInputStream is serialisable -- confirm.
     */
    private SegmentedInputStream lastStream;

    /** The bytes of the line that opens a document. */
    protected static final byte[] DOC_OPEN = "<DOC>".getBytes(StandardCharsets.US_ASCII);
    /** The bytes of the line that closes a document. */
    protected static final byte[] DOC_CLOSE = "</DOC>".getBytes(StandardCharsets.US_ASCII);
    /** The bytes of the {@code <DOCNO>} opening tag. */
    protected static final byte[] DOCNO_OPEN = "<DOCNO>".getBytes(StandardCharsets.US_ASCII);
    /** The bytes of the {@code </DOCNO>} closing tag. */
    protected static final byte[] DOCNO_CLOSE = "</DOCNO>".getBytes(StandardCharsets.US_ASCII);
    /** The bytes of the line that opens a document header. */
    protected static final byte[] DOCHDR_OPEN = "<DOCHDR>".getBytes(StandardCharsets.US_ASCII);
    /** The bytes of the line that closes a document header. */
    protected static final byte[] DOCHDR_CLOSE = "</DOCHDR>".getBytes(StandardCharsets.US_ASCII);

    /** Line buffer used by {@link #parseContent(int, InputStream)}. */
    byte[] buffer;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:it/unimi/di/big/mg4j/document/TRECDocumentCollection$TRECDocumentDescriptor.class */
    /**
     * The position of a single document: the index of the file containing it, the absolute
     * offset of its start marker, and the offsets of the intermediate and stop markers
     * stored as differences from the start marker.
     */
    public static class TRECDocumentDescriptor implements Cloneable {
        /** Index into the collection's file array. */
        public int fileIndex;
        /** Absolute byte offset at which the document starts. */
        public long startMarker;
        /** Offset of the intermediate marker, relative to {@link #startMarker}. */
        public int intermediateMarkerDiff;
        /** Offset of the stop marker, relative to {@link #startMarker}. */
        public int stopMarkerDiff;

        /**
         * Builds a descriptor from three absolute offsets.
         *
         * <p>The differences are narrowed to {@code int} with a plain cast, so they are
         * silently truncated if a marker lies 2<sup>31</sup> bytes or more past the start.
         *
         * @param i the file index.
         * @param j the absolute offset of the start marker.
         * @param j2 the absolute offset of the intermediate marker.
         * @param j3 the absolute offset of the stop marker.
         */
        public TRECDocumentDescriptor(int i, long j, long j2, long j3) {
            this.fileIndex = i;
            this.startMarker = j;
            this.intermediateMarkerDiff = (int) (j2 - j);
            this.stopMarkerDiff = (int) (j3 - j);
        }

        /**
         * Builds a descriptor from a start offset and two already-relative offsets.
         *
         * @param i the file index.
         * @param j the absolute offset of the start marker.
         * @param i2 the intermediate marker, relative to {@code j}.
         * @param i3 the stop marker, relative to {@code j}.
         */
        public TRECDocumentDescriptor(int i, long j, int i2, int i3) {
            this.fileIndex = i;
            this.startMarker = j;
            this.intermediateMarkerDiff = i2;
            this.stopMarkerDiff = i3;
        }

        /**
         * Returns the three absolute offsets (start, intermediate, stop) of this document.
         *
         * @return a new three-element array of absolute offsets.
         */
        public final long[] toSegments() {
            return new long[] {
                this.startMarker,
                this.startMarker + this.intermediateMarkerDiff,
                this.startMarker + this.stopMarkerDiff
            };
        }

        /**
         * Returns an independent copy of this descriptor.
         *
         * @return a new descriptor with the same field values.
         */
        @Override
        public Object clone() {
            return new TRECDocumentDescriptor(this.fileIndex, this.startMarker, this.intermediateMarkerDiff, this.stopMarkerDiff);
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Checks whether the first {@code i} bytes of {@code bArr} are exactly {@code bArr2}.
     *
     * @param bArr the array holding the candidate bytes (must have at least {@code i} elements).
     * @param i the number of meaningful bytes in {@code bArr}.
     * @param bArr2 the pattern to compare against.
     * @return true iff {@code i == bArr2.length} and the first {@code i} bytes coincide.
     */
    public static boolean equals(byte[] bArr, int i, byte[] bArr2) {
        if (i != bArr2.length) {
            return false;
        }
        for (int k = i - 1; k >= 0; k--) {
            if (bArr[k] != bArr2[k]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Scans a file line by line and appends one descriptor per document found.
     *
     * <p>The stream is always closed on exit, including when an exception is thrown.
     *
     * @param i the index of the file being scanned (stored in the descriptors).
     * @param inputStream the content of the file; closed by this method.
     * @throws IOException if reading fails.
     */
    private void parseContent(int i, InputStream inputStream) throws IOException {
        LOGGER.debug("Processing file " + i + " (" + this.file[i] + ")");
        try (InputStream raw = inputStream;
                FastBufferedInputStream lines = new FastBufferedInputStream(raw, this.bufferSize)) {
            boolean insideDocument = false;
            boolean pastHeader = false;
            long documentStart = 0;
            // NOTE(review): headerEnd is not reset between documents, so a document lacking
            // a </DOCHDR> line inherits the value of the previous one -- confirm inputs
            // always carry a header.
            long headerEnd = 0;
            long lineStart = 0;
            int length;
            while ((length = lines.readLine(this.buffer)) != -1) {
                if (length == this.buffer.length) {
                    // The buffer was filled: the line is too long to be a marker, so its
                    // remainder is discarded.
                    while (lines.readLine(this.buffer) == this.buffer.length) {
                        // keep skipping
                    }
                } else if (!insideDocument && equals(this.buffer, length, DOC_OPEN)) {
                    documentStart = lineStart;
                    insideDocument = true;
                } else if (insideDocument && equals(this.buffer, length, DOC_CLOSE)) {
                    this.descriptors.add(new TRECDocumentDescriptor(i, documentStart, headerEnd, lineStart));
                    pastHeader = false;
                    insideDocument = false;
                } else if (insideDocument && !pastHeader && equals(this.buffer, length, DOCHDR_CLOSE)) {
                    headerEnd = lines.position();
                    pastHeader = true;
                }
                // Refreshed after over-long lines too, so that a marker following one of
                // them is recorded at its own offset rather than at a stale one.
                lineStart = lines.position();
            }
        }
    }

    /**
     * Builds a collection around already-computed descriptors, without scanning any file.
     *
     * @param strArr the file names.
     * @param documentFactory the factory used to build documents.
     * @param objectBigArrayBigList the document descriptors (stored by reference).
     * @param i the I/O buffer size.
     * @param z whether the files are gzipped.
     */
    protected TRECDocumentCollection(String[] strArr, DocumentFactory documentFactory, ObjectBigArrayBigList<TRECDocumentDescriptor> objectBigArrayBigList, int i, boolean z) {
        this.buffer = new byte[DocumentalCluster.DEFAULT_BUFFER_SIZE];
        this.useGzip = z;
        this.file = strArr;
        this.bufferSize = i;
        this.factory = documentFactory;
        this.descriptors = objectBigArrayBigList;
    }

    @Override // it.unimi.di.big.mg4j.document.DocumentCollection
    /* renamed from: copy, reason: merged with bridge method [inline-methods] */
    public TRECDocumentCollection m27copy() {
        // The copy shares the file array and the descriptor list; only the factory is copied.
        return new TRECDocumentCollection(this.file, this.factory.m17copy(), this.descriptors, this.bufferSize, this.useGzip);
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Opens a file of the collection, transparently decompressing it if required.
     *
     * @param str the name of the file to open.
     * @return a stream over the (decompressed) content of the file.
     * @throws IOException if the file cannot be opened or the gzip wrapper cannot be created.
     */
    public final InputStream openFileStream(String str) throws IOException {
        final FileInputStream raw = new FileInputStream(str);
        if (!this.useGzip) {
            return raw;
        }
        try {
            return new GZIPInputStream(raw);
        } catch (IOException | RuntimeException e) {
            // The gzip wrapper could not be created: do not leak the underlying file handle.
            try {
                raw.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Builds a collection by scanning the given files for documents.
     *
     * @param strArr the names of the files to scan, in collection order.
     * @param documentFactory the factory used to build documents.
     * @param i the I/O buffer size used while scanning.
     * @param z whether the files are gzipped.
     * @throws IOException if a file cannot be opened or read.
     */
    public TRECDocumentCollection(String[] strArr, DocumentFactory documentFactory, int i, boolean z) throws IOException {
        this.buffer = new byte[DocumentalCluster.DEFAULT_BUFFER_SIZE];
        this.file = strArr;
        this.factory = documentFactory;
        this.bufferSize = i;
        this.descriptors = new ObjectBigArrayBigList<>();
        this.useGzip = z;
        final ProgressLogger progressLogger = new ProgressLogger(LOGGER);
        progressLogger.expectedUpdates = strArr.length;
        progressLogger.itemsName = "files";
        progressLogger.start("Parsing " + (z ? "GZip" : "plain") + " files");
        for (int fileIndex = 0; fileIndex < strArr.length; fileIndex++) {
            parseContent(fileIndex, openFileStream(strArr[fileIndex]));
            progressLogger.update();
        }
        progressLogger.done();
    }

    /** Returns the number of documents, that is, the number of descriptors. */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollection
    public long size() {
        return this.descriptors.size64();
    }

    /**
     * Builds the document of given index by passing its stream and metadata to the factory.
     *
     * @param j the index of the document.
     * @return the document produced by the factory.
     * @throws IOException if the underlying file cannot be opened or read.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollection
    public Document document(long j) throws IOException {
        return this.factory.getDocument(stream(j), metadata(j));
    }

    /**
     * Returns a segmented stream over the document of given index.
     *
     * <p>The stream returned by the previous call, if any, is closed first.
     *
     * @param j the index of the document.
     * @return a stream over the segments of the document.
     * @throws IOException if the underlying file cannot be opened or read.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollection
    public InputStream stream(long j) throws IOException {
        ensureDocumentIndex(j);
        IOUtils.closeQuietly(this.lastStream);
        // Forget the closed stream so that a failure below does not leave it dangling.
        this.lastStream = null;
        final TRECDocumentDescriptor descriptor = this.descriptors.get(j);
        final InputStream raw = openFileStream(this.file[descriptor.fileIndex]);
        final SegmentedInputStream segmented;
        try {
            segmented = new SegmentedInputStream(raw, descriptor.toSegments());
        } catch (IOException | RuntimeException e) {
            // Nobody owns the raw stream yet: close it before propagating the failure.
            IOUtils.closeQuietly(raw);
            throw e;
        }
        this.lastStream = segmented;
        return segmented;
    }

    /**
     * Returns the metadata of the document of given index: a map associating
     * {@code MetadataKeys.URI} with the string {@code "Document #"} followed by the index.
     *
     * @param j the index of the document.
     * @return a newly created metadata map.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollection
    public Reference2ObjectMap<Enum<?>, Object> metadata(long j) {
        ensureDocumentIndex(j);
        final Reference2ObjectArrayMap<Enum<?>, Object> metadata = new Reference2ObjectArrayMap<>(4);
        metadata.put(PropertyBasedDocumentFactory.MetadataKeys.URI, "Document #" + j);
        return metadata;
    }

    /** Returns the factory used by this collection. */
    @Override // it.unimi.di.big.mg4j.document.DocumentSequence
    public DocumentFactory factory() {
        return this.factory;
    }

    /**
     * Closes this collection: the last stream handed out (if any) is closed and the
     * descriptor list is released, even if closing the superclass fails.
     *
     * @throws IOException if closing the superclass or the last stream fails.
     */
    @Override // it.unimi.di.big.mg4j.document.AbstractDocumentSequence, it.unimi.di.big.mg4j.document.DocumentSequence, java.io.Closeable, java.lang.AutoCloseable
    public void close() throws IOException {
        try {
            super.close();
        } finally {
            try {
                if (this.lastStream != null) {
                    this.lastStream.close();
                }
            } finally {
                this.lastStream = null;
                this.descriptors = null;
            }
        }
    }

    /**
     * Appends the files and the documents of another collection to this one.
     *
     * <p>Descriptors are cloned and their file index is shifted by the number of files
     * this collection had before the merge; the argument is left untouched.
     *
     * @param tRECDocumentCollection the collection to append (must not be this collection,
     * as its descriptor list is iterated while this one grows).
     */
    public void merge(TRECDocumentCollection tRECDocumentCollection) {
        final int oldFileCount = this.file.length;
        final String[] otherFiles = tRECDocumentCollection.file;
        this.file = ObjectArrays.ensureCapacity(this.file, oldFileCount + otherFiles.length);
        System.arraycopy(otherFiles, 0, this.file, oldFileCount, otherFiles.length);
        for (TRECDocumentDescriptor original : tRECDocumentCollection.descriptors) {
            final TRECDocumentDescriptor shifted = (TRECDocumentDescriptor) original.clone();
            shifted.fileIndex += oldFileCount;
            this.descriptors.add(shifted);
        }
    }

    /**
     * Returns an iterator that serves documents sequentially, opening each file once and
     * reading all of its documents as consecutive blocks of a single segmented stream.
     *
     * <p>Descriptors belonging to the same file are assumed to be contiguous in the
     * descriptor list.
     *
     * @return a new document iterator over this collection.
     */
    @Override // it.unimi.di.big.mg4j.document.AbstractDocumentCollection, it.unimi.di.big.mg4j.document.DocumentSequence
    public DocumentIterator iterator() throws IOException {
        return new AbstractDocumentIterator() {
            private final ObjectIterator<TRECDocumentDescriptor> descriptorIterator = TRECDocumentCollection.this.descriptors.iterator();
            /** The segmented stream over the file currently being served. */
            private SegmentedInputStream siStream;
            /** Index of the next document to return; long, as the collection size is a long. */
            private long currentDocument = 0;
            /** The last document returned, closed before the next one is served. */
            private Document last;
            /** First descriptor of the next file, already taken from the iterator, or null. */
            private TRECDocumentDescriptor firstNextDescriptor;

            /**
             * Opens the next file and registers one block per document it contains.
             *
             * @return false if there are no more documents.
             */
            private boolean nextFile() throws FileNotFoundException, IOException {
                if (TRECDocumentCollection.this.size() == 0) {
                    return false;
                }
                IOUtils.closeQuietly(this.siStream);
                TRECDocumentDescriptor descriptor = this.firstNextDescriptor;
                if (descriptor == null) {
                    // Nothing pending from the previous file: exhaustion depends on the iterator only.
                    if (!this.descriptorIterator.hasNext()) {
                        return false;
                    }
                    descriptor = this.descriptorIterator.next();
                }
                final int fileIndex = descriptor.fileIndex;
                this.siStream = new SegmentedInputStream(TRECDocumentCollection.this.openFileStream(TRECDocumentCollection.this.file[fileIndex]));
                // The pending descriptor is consumed by the loop below; it must not be served twice.
                this.firstNextDescriptor = null;
                while (true) {
                    this.siStream.addBlock(descriptor.toSegments());
                    if (!this.descriptorIterator.hasNext()) {
                        break;
                    }
                    descriptor = this.descriptorIterator.next();
                    if (descriptor.fileIndex != fileIndex) {
                        // Belongs to the next file: keep it for the next invocation.
                        this.firstNextDescriptor = descriptor;
                        break;
                    }
                }
                return true;
            }

            @Override // it.unimi.di.big.mg4j.document.DocumentIterator
            public Document nextDocument() throws IOException {
                if (this.last != null) {
                    this.last.close();
                    if (this.siStream.hasMoreBlocks()) {
                        this.siStream.nextBlock();
                    } else if (!nextFile()) {
                        this.last = null;
                        return null;
                    }
                } else if (!nextFile()) {
                    return null;
                }
                final Document document = TRECDocumentCollection.this.factory.getDocument(
                        this.siStream, TRECDocumentCollection.this.metadata(this.currentDocument++));
                this.last = document;
                return document;
            }

            @Override // it.unimi.di.big.mg4j.document.AbstractDocumentIterator, it.unimi.di.big.mg4j.document.DocumentIterator, java.io.Closeable, java.lang.AutoCloseable
            public void close() throws IOException {
                if (this.siStream != null) {
                    if (this.last != null) {
                        this.last.close();
                    }
                    super.close();
                    this.siStream.close();
                    this.siStream = null;
                }
            }
        };
    }

    /**
     * Restores the default-serialised fields, then the descriptor list written by
     * {@link #writeObject(ObjectOutputStream)}: a long count followed, for each descriptor,
     * by file index (int), start marker (long) and the two relative markers (int).
     */
    private void readObject(ObjectInputStream objectInputStream) throws IOException, ClassNotFoundException {
        objectInputStream.defaultReadObject();
        final long size = objectInputStream.readLong();
        final ObjectBigArrayBigList<TRECDocumentDescriptor> restored = new ObjectBigArrayBigList<>();
        restored.ensureCapacity(size);
        // The counter must be a long: an int would overflow before reaching a size above 2^31 - 1.
        for (long k = 0; k < size; k++) {
            restored.add(new TRECDocumentDescriptor(objectInputStream.readInt(), objectInputStream.readLong(), objectInputStream.readInt(), objectInputStream.readInt()));
        }
        this.descriptors = restored;
    }

    /**
     * Writes the default-serialised fields, then the number of descriptors followed by the
     * four fields of each descriptor.
     */
    private void writeObject(ObjectOutputStream objectOutputStream) throws IOException {
        objectOutputStream.defaultWriteObject();
        objectOutputStream.writeLong(this.descriptors.size64());
        for (TRECDocumentDescriptor descriptor : this.descriptors) {
            objectOutputStream.writeInt(descriptor.fileIndex);
            objectOutputStream.writeLong(descriptor.startMarker);
            objectOutputStream.writeInt(descriptor.intermediateMarkerDiff);
            objectOutputStream.writeInt(descriptor.stopMarkerDiff);
        }
    }

    /**
     * Command-line entry point: builds a collection from a list of files (given as
     * arguments or, if none is given, read one per line from standard input) and stores
     * it in serialised form.
     *
     * @param strArr the command-line arguments.
     */
    public static void main(String[] strArr) throws IOException, JSAPException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
        final SimpleJSAP simpleJSAP = new SimpleJSAP(
                TRECDocumentCollection.class.getName(),
                "Saves a serialised TREC document collection based on a set of file names (which will be sorted lexicographically).",
                new Parameter[] {
                    new FlaggedOption("factory", MG4JClassParser.getParser(), IdentityDocumentFactory.class.getName(), false, 'f', "factory", "A document factory with a standard constructor."),
                    new FlaggedOption("property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'p', "property", "A 'key=value' specification, or the name of a property file").setAllowMultipleDeclarations(true),
                    new Switch("gzipped", 'z', "gzipped", "The files are gzipped."),
                    new Switch("unsorted", 'u', "unsorted", "Keep the file list unsorted."),
                    new FlaggedOption("bufferSize", JSAP.INTSIZE_PARSER, DEFAULT_BUFFER_SIZE, false, 'b', "buffer-size", "The size of an I/O buffer."),
                    new UnflaggedOption("collection", JSAP.STRING_PARSER, true, "The filename for the serialised collection."),
                    new UnflaggedOption("file", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, true, "A list of files that will be indexed. If missing, a list of files will be read from standard input.")
                });
        final JSAPResult parse = simpleJSAP.parse(strArr);
        if (simpleJSAP.messagePrinted()) {
            return;
        }
        final PropertyBasedDocumentFactory userFactory = PropertyBasedDocumentFactory.getInstance((Class<?>) parse.getClass("factory"), parse.getStringArray("property"));
        String[] fileNames = parse.getStringArray("file");
        if (fileNames.length == 0) {
            // No file on the command line: read the list from standard input, one name per line.
            final ObjectArrayList<String> names = new ObjectArrayList<>();
            final BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
            String line;
            while ((line = stdin.readLine()) != null) {
                names.add(line);
            }
            fileNames = names.toArray(new String[0]);
        }
        if (!parse.getBoolean("unsorted")) {
            Arrays.sort(fileNames);
        }
        final DocumentFactory factory = CompositeDocumentFactory.getFactory(new TRECHeaderDocumentFactory(), userFactory);
        if (fileNames.length == 0) {
            System.err.println("WARNING: empty file set.");
        }
        BinIO.storeObject(new TRECDocumentCollection(fileNames, factory, parse.getInt("bufferSize"), parse.getBoolean("gzipped")), parse.getString("collection"));
    }
}
