package liner2.chunker;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import liner2.Main;
import liner2.structure.Annotation;
import liner2.structure.AnnotationSet;
import liner2.structure.Document;
import liner2.structure.Paragraph;
import liner2.structure.Sentence;
import liner2.structure.Token;
import org.chasen.crfpp.Tagger;
import weka.core.TestInstances;

/* loaded from: input_file:liner2/chunker/CrfppChunker.class */
/**
 * Chunker backed by a CRF++ model.
 *
 * <p>Training data is accumulated into an IOB-formatted text file via
 * {@link #addTrainingData(Document)}; {@link #train()} then runs the external
 * {@code crf_learn} command on that file and loads the resulting model into a
 * {@link Tagger}. A previously trained model can also be loaded directly with
 * {@link #deserialize(String)}.
 *
 * <p>Tagging of a single sentence is serialized on this instance because one
 * {@link Tagger} object is cleared, filled and parsed for every sentence.
 */
public class CrfppChunker extends Chunker implements TrainableChunkerInterface, DeserializableChunkerInterface, SerializableChunkerInterface {
    /** Sentences with more tokens than this are not tagged and yield an empty annotation set. */
    private static final int MAX_TOKENS = 1000;

    /** Matches a chunk label: group 1 is {@code B} or {@code I}, group 2 is the chunk type. */
    private static final Pattern TAG_PATTERN = Pattern.compile("([IB])-(.*)");

    /** Whitespace runs inside attribute values are collapsed to a single underscore. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    private File trainingFile = null;
    private PrintWriter trainingFileWriter = null;
    private Tagger tagger = null;
    private String template_filename = null;
    private String model_filename = null;
    private int threads;
    private HashSet<String> types;

    /**
     * Holder for the parameters of one {@code crf_learn} invocation.
     */
    public class CRFcmd {
        public String file_template = "";
        public String file_iob = "";
        public String file_model = "";
        public int threads = 1;

        CRFcmd() {
        }

        /**
         * Returns the {@code crf_learn} invocation as a single command-line string.
         *
         * <p>The string is not quoted, so it is only suitable for display or for
         * paths without whitespace; use {@link #get_crf_learn_args()} to start a process.
         */
        public String get_crf_learn() {
            return String.format("crf_learn %s %s %s -f 5 -c 1 -p %d", this.file_template, this.file_iob, this.file_model, Integer.valueOf(this.threads));
        }

        /**
         * Returns the same invocation as {@link #get_crf_learn()} as an argument vector,
         * one element per argument, so that file names containing spaces stay intact.
         */
        public String[] get_crf_learn_args() {
            return new String[] {
                "crf_learn", this.file_template, this.file_iob, this.file_model,
                "-f", "5", "-c", "1", "-p", String.valueOf(this.threads)
            };
        }
    }

    /**
     * Creates a chunker that trains with one thread and does not filter chunk types.
     */
    public CrfppChunker() {
        this(1, new HashSet<String>());
    }

    /**
     * Creates a chunker.
     *
     * @param i       value passed to {@code crf_learn} as its {@code -p} option
     * @param hashSet chunk types used when looking up training annotations; an empty
     *                set (or {@code null}, treated as empty) means no type filter
     */
    public CrfppChunker(int i, HashSet<String> hashSet) {
        this.threads = i;
        // A null set would otherwise fail later at types.size() in addTrainingData.
        this.types = hashSet != null ? hashSet : new HashSet<String>();
    }

    /**
     * Tags one sentence and converts the tagger labels into annotations.
     *
     * @return the recognized chunks, or an empty set when the sentence has more than
     *         {@link #MAX_TOKENS} tokens
     * @throws IllegalStateException if no model has been loaded yet
     */
    private synchronized AnnotationSet chunkSentence(Sentence sentence) {
        if (sentence.getTokenNumber() > MAX_TOKENS) {
            return new AnnotationSet(sentence);
        }
        if (this.tagger == null) {
            throw new IllegalStateException("No CRF model loaded: call train() or deserialize() first");
        }
        sendDataToTagger(sentence);
        return readTaggerOutput(sentence);
    }

    /**
     * Builds the feature line of one token: every attribute value preceded by the
     * separator, with whitespace runs replaced by {@code _} and empty values replaced
     * by {@code NULL}. A {@code null} attribute value is emitted as the text {@code null}.
     * The result is not trimmed.
     */
    private static String buildFeatureLine(Token token, int attributeCount) {
        StringBuilder line = new StringBuilder();
        for (int a = 0; a < attributeCount; a++) {
            line.append(TestInstances.DEFAULT_SEPARATORS);
            String value = token.getAttributeValue(a);
            if (value != null) {
                value = WHITESPACE.matcher(value).replaceAll("_");
                if (value.length() == 0) {
                    value = "NULL";
                }
            }
            line.append(value);
        }
        return line.toString();
    }

    /**
     * Clears the tagger, feeds it one feature line per token and runs the parse.
     */
    private void sendDataToTagger(Sentence sentence) {
        this.tagger.clear();
        int attributeCount = sentence.getAttributeIndexLength();
        for (Token token : sentence.getTokens()) {
            this.tagger.add(buildFeatureLine(token, attributeCount).trim());
        }
        this.tagger.parse();
    }

    /**
     * Reads the labels of the last parse and groups them into annotations.
     *
     * <p>A chunk starts at a {@code B-} label and extends until the next {@code B-}
     * label, the next label that does not match {@link #TAG_PATTERN}, or the end of
     * the output. The chunk keeps the type of its {@code B-} label.
     */
    private AnnotationSet readTaggerOutput(Sentence sentence) {
        AnnotationSet result = new AnnotationSet(sentence);
        int size = (int) this.tagger.size();
        String openType = null;
        int openBegin = 0;
        for (int pos = 0; pos < size; pos++) {
            Matcher label = TAG_PATTERN.matcher(this.tagger.y2(pos));
            boolean isChunkLabel = label.matches();
            boolean startsChunk = isChunkLabel && "B".equals(label.group(1));
            if (openType != null && (!isChunkLabel || startsChunk)) {
                // The open chunk ended on the previous token.
                result.addChunk(new Annotation(openBegin, pos - 1, openType, sentence));
                openType = null;
            }
            if (startsChunk) {
                openBegin = pos;
                openType = label.group(2);
            }
        }
        if (openType != null) {
            result.addChunk(new Annotation(openBegin, size - 1, openType, sentence));
        }
        return result;
    }

    /**
     * Closes the training file, runs {@code crf_learn} on it and loads the new model.
     *
     * @throws IllegalStateException if no training data was added, or the template or
     *                               model file name has not been set
     * @throws IOException           if the training file could not be written completely,
     *                               or {@code crf_learn} could not be started or read
     * @throws Error                 if {@code crf_learn} wrote anything to its error stream
     */
    @Override // liner2.chunker.TrainableChunkerInterface
    public void train() throws Exception {
        if (this.trainingFileWriter == null) {
            throw new IllegalStateException("No training data: call addTrainingData() before train()");
        }
        // PrintWriter never throws on write; checkError() flushes and reports earlier failures.
        boolean writeFailed = this.trainingFileWriter.checkError();
        this.trainingFileWriter.close();
        if (writeFailed) {
            throw new IOException("Failed to write CRF training data to " + this.trainingFile.getAbsolutePath());
        }
        compileTagger();
    }

    /**
     * Appends every sentence of the document to the training file in IOB format:
     * one line per token (feature line followed by {@code O}, {@code B-type} or
     * {@code I-type}) and an empty line after each sentence. The file is created on
     * the first call.
     *
     * @throws IllegalStateException if the training file cannot be created
     */
    @Override // liner2.chunker.TrainableChunkerInterface
    public void addTrainingData(Document document) {
        if (this.trainingFileWriter == null) {
            openTrainingFile();
        }
        for (Iterator<Paragraph> paragraphs = document.getParagraphs().iterator(); paragraphs.hasNext();) {
            for (Iterator<Sentence> sentences = paragraphs.next().getSentences().iterator(); sentences.hasNext();) {
                writeTrainingSentence(sentences.next());
            }
        }
        this.trainingFileWriter.flush();
    }

    /** Creates {@code crf_iob.txt} in the working directory and opens a writer on it. */
    private void openTrainingFile() {
        File file = new File("crf_iob.txt");
        try {
            this.trainingFileWriter = new PrintWriter(file);
            this.trainingFile = file;
        } catch (IOException e) {
            // Continuing would only fail with a NullPointerException on the first write.
            throw new IllegalStateException("Cannot create CRF training file " + file.getAbsolutePath(), e);
        }
    }

    /** Writes the labelled token lines of one sentence followed by an empty line. */
    private void writeTrainingSentence(Sentence sentence) {
        int attributeCount = sentence.getAttributeIndexLength();
        ArrayList<Token> tokens = sentence.getTokens();
        for (int index = 0; index < tokens.size(); index++) {
            Annotation chunk = this.types.size() == 0
                    ? sentence.getChunkAt(index)
                    : sentence.getChunkAt(index, this.types);
            String label;
            if (chunk == null) {
                label = "O";
            } else {
                label = (chunk.getBegin() == index ? "B-" : "I-") + chunk.getType();
            }
            String line = buildFeatureLine(tokens.get(index), attributeCount) + " " + label;
            this.trainingFileWriter.write(line.trim() + "\n");
        }
        this.trainingFileWriter.write("\n");
    }

    /**
     * Runs {@code crf_learn} on the training file and loads the produced model.
     *
     * <p>Standard output is forwarded line by line to {@link Main#log}. Standard error
     * is drained on a separate thread so the child process cannot block on a full
     * pipe while this thread is still reading standard output; its lines are printed
     * afterwards and any such line is treated as a training failure.
     */
    private void compileTagger() throws IOException {
        if (this.template_filename == null || this.model_filename == null) {
            throw new IllegalStateException("Template and model file names must be set before training");
        }
        CRFcmd cmd = new CRFcmd();
        cmd.file_template = this.template_filename;
        cmd.file_model = this.model_filename;
        cmd.file_iob = this.trainingFile.getAbsolutePath();
        cmd.threads = this.threads;

        // An argument vector (instead of one command string) keeps paths with spaces intact.
        final Process process = new ProcessBuilder(cmd.get_crf_learn_args()).start();

        final ArrayList<String> errorLines = new ArrayList<String>();
        final IOException[] drainFailure = new IOException[1];
        Thread errorDrainer = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    BufferedReader stderr = new BufferedReader(new InputStreamReader(process.getErrorStream()));
                    try {
                        String line;
                        while ((line = stderr.readLine()) != null) {
                            errorLines.add(line);
                        }
                    } finally {
                        stderr.close();
                    }
                } catch (IOException e) {
                    drainFailure[0] = e;
                }
            }
        }, "crf_learn-stderr");
        errorDrainer.start();

        boolean finished = false;
        try {
            BufferedReader stdout = new BufferedReader(new InputStreamReader(process.getInputStream()));
            try {
                String line;
                while ((line = stdout.readLine()) != null) {
                    Main.log(line);
                }
            } finally {
                stdout.close();
            }
            // join() also makes the drainer's writes to errorLines visible to this thread.
            errorDrainer.join();
            process.waitFor();
            finished = true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while waiting for crf_learn", e);
        } finally {
            if (!finished) {
                process.destroy();
            }
        }

        if (drainFailure[0] != null) {
            throw drainFailure[0];
        }
        for (String line : errorLines) {
            System.out.println(">> Error: " + line);
        }
        if (!errorLines.isEmpty()) {
            // Error type and message are kept as before so existing callers see no change.
            throw new Error("There was a problem with training CRF");
        }
        deserialize(cmd.file_model);
    }

    /**
     * Loads a CRF++ model and makes it the active tagger. A previously loaded tagger
     * is released once the new one has been created.
     *
     * @param str path of the model file, passed to the tagger as its {@code -m} option
     */
    @Override // liner2.chunker.DeserializableChunkerInterface
    public void deserialize(String str) {
        Tagger loaded = new Tagger(String.format("-m %s -v 3 -n 64", str));
        Tagger previous = this.tagger;
        this.tagger = loaded;
        if (previous != null) {
            previous.delete();
        }
    }

    /**
     * Does nothing: this implementation writes no data of its own to {@code str}.
     */
    @Override // liner2.chunker.SerializableChunkerInterface
    public void serialize(String str) {
    }

    /**
     * Releases the tagger, if one is loaded. Safe to call more than once.
     */
    @Override // liner2.chunker.Chunker
    public void close() {
        if (this.tagger != null) {
            this.tagger.delete();
            this.tagger = null;
        }
    }

    /** Sets the template file passed to {@code crf_learn} by {@link #train()}. */
    public void setTemplateFilename(String str) {
        this.template_filename = str;
    }

    /** Sets the model file that {@link #train()} produces and then loads. */
    public void setModelFilename(String str) {
        this.model_filename = str;
    }

    /**
     * Tags every sentence of the document.
     *
     * @return a map from each sentence to the annotations recognized in it
     * @throws IllegalStateException if no model has been loaded yet
     */
    @Override // liner2.chunker.Chunker
    public HashMap<Sentence, AnnotationSet> chunk(Document document) {
        HashMap<Sentence, AnnotationSet> chunkings = new HashMap<>();
        for (Iterator<Paragraph> paragraphs = document.getParagraphs().iterator(); paragraphs.hasNext();) {
            for (Iterator<Sentence> sentences = paragraphs.next().getSentences().iterator(); sentences.hasNext();) {
                Sentence sentence = sentences.next();
                chunkings.put(sentence, chunkSentence(sentence));
            }
        }
        return chunkings;
    }
}
