package pl.edu.icm.coansys.kwdextraction.utils;

import java.io.DataInputStream;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Locale;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

/* loaded from: input_file:pl/edu/icm/coansys/kwdextraction/utils/SentenceInputFormat.class */
public class SentenceInputFormat extends FileInputFormat<IntStringPair, IntStringPair> {

    /* loaded from: input_file:pl/edu/icm/coansys/kwdextraction/utils/SentenceInputFormat$SentenceRecordReader.class */
    private static class SentenceRecordReader implements RecordReader<IntStringPair, IntStringPair> {
        private Path file;
        private int wordCount = 0;
        private int currentWordCount = 0;
        private ArrayList<String> content;
        private DataInputStream in;
        private BreakIterator sentenceIterator;
        private int sentenceStart;
        private int sentenceEnd;
        private int cur;

        public SentenceRecordReader(InputSplit inputSplit, JobConf jobConf) throws IOException {
            FileSplit fileSplit = (FileSplit) inputSplit;
            this.file = fileSplit.getPath();
            this.in = this.file.getFileSystem(jobConf).open(fileSplit.getPath());
            this.content = new XmlParser().getText(this.in.readUTF());
            for (int i = 0; i < this.content.size(); i++) {
                this.wordCount += this.content.get(i).split("\\s").length;
            }
            this.sentenceIterator = BreakIterator.getSentenceInstance(new Locale("en"));
            this.sentenceIterator.setText(this.content.get(0));
            this.sentenceStart = 0;
            this.sentenceEnd = this.sentenceIterator.next();
            this.cur = 0;
        }

        public void close() throws IOException {
            if (this.in != null) {
                this.in.close();
                this.in = null;
            }
        }

        /* renamed from: createKey, reason: merged with bridge method [inline-methods] */
        public IntStringPair m2createKey() {
            return new IntStringPair();
        }

        /* renamed from: createValue, reason: merged with bridge method [inline-methods] */
        public IntStringPair m1createValue() {
            return new IntStringPair();
        }

        public long getPos() throws IOException {
            return this.currentWordCount;
        }

        public float getProgress() throws IOException {
            if (this.wordCount == 0) {
                return 0.0f;
            }
            return this.currentWordCount / this.wordCount;
        }

        public boolean next(IntStringPair intStringPair, IntStringPair intStringPair2) throws IOException {
            if (this.sentenceEnd == -1) {
                this.cur++;
                if (this.cur == this.content.size()) {
                    return false;
                }
                this.sentenceIterator.setText(this.content.get(this.cur));
                this.sentenceStart = 0;
                this.sentenceEnd = this.sentenceIterator.next();
            }
            String substring = this.content.get(this.cur).substring(this.sentenceStart, this.sentenceEnd);
            intStringPair2.set(Integer.valueOf(this.wordCount), substring);
            intStringPair.set(Integer.valueOf(this.currentWordCount), this.file.toString());
            this.currentWordCount += substring.split("\\s").length;
            this.sentenceEnd = this.sentenceIterator.next();
            return true;
        }
    }

    public RecordReader<IntStringPair, IntStringPair> getRecordReader(InputSplit inputSplit, JobConf jobConf, Reporter reporter) throws IOException {
        return new SentenceRecordReader(inputSplit, jobConf);
    }
}
