package edu.umn.biomedicus.acronym;

import edu.umn.biomedicus.acronym.AcronymExpansionsModel;
import edu.umn.biomedicus.exc.BiomedicusException;
import edu.umn.biomedicus.tokenization.Token;
import edu.umn.nlpengine.AbstractTextRange;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.FileVisitOption;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/*  JADX ERROR: NullPointerException in pass: ClassModifier
    java.lang.NullPointerException: Cannot invoke "java.util.List.forEach(java.util.function.Consumer)" because "blocks" is null
    	at jadx.core.utils.BlockUtils.collectAllInsns(BlockUtils.java:1017)
    	at jadx.core.dex.visitors.ClassModifier.removeBridgeMethod(ClassModifier.java:239)
    	at jadx.core.dex.visitors.ClassModifier.removeSyntheticMethods(ClassModifier.java:154)
    	at java.base/java.util.ArrayList.forEach(ArrayList.java:1596)
    	at jadx.core.dex.visitors.ClassModifier.visit(ClassModifier.java:64)
    */
/* loaded from: input_file:edu/umn/biomedicus/acronym/AcronymVectorOfflineTrainer.class */
public class AcronymVectorOfflineTrainer {
    /** Dictionary size that {@link #main} passes to the constructor when no fourth argument is given. */
    public static final int DEFAULT_N_WORDS = 100000;
    // Assigned in the static initializer at the bottom of the class.
    private static final Logger LOGGER;
    // Token separator used by tokenize(): one or more characters that are not a word character, '-' or '/'.
    private static final String TEXTBREAK = "[^\\w\\-/]+";
    // Compiled in the static initializer as "^\\W+"; tokenize() strips the first match from the start of the text.
    private static final Pattern initialJunk;
    // Compiled in the static initializer as "\\W+$"; tokenize() strips the first match from the end of the text.
    private static final Pattern finalJunk;
    // Not referenced by any decompiled method in this file.
    // NOTE(review): presumably compared against bytesWordCounted inside FileVectorizer.visitFile - confirm.
    private static final long maxBytesToCountWords = 5000000000L;
    // Acronym/expansion model loaded by the constructor; also handed to the AcronymVectorModel on write.
    final AcronymExpansionsModel aem;
    // Alternate phrase -> senses it stands for; filled by the constructor from the optional alternates file.
    private final Map<String, Set<String>> alternateFormOf;
    // Upper bound on the number of dictionary entries created by precountWords().
    private final int nWords;

    // Created by precountWords(); null until then.
    @Nullable
    WordVectorSpace vectorSpace;

    // Sense -> accumulated vector; created by the constructor and added to by vectorizeForWord().
    @Nullable
    private Map<String, SparseVector> senseVectors;

    // Token -> occurrence count; created by precountWords() and incremented by countChunk().
    @Nullable
    private Map<String, Integer> wordFrequency;

    // Graph of all sense phrases and alternate forms; built at the end of the constructor.
    @Nullable
    private PhraseGraph phraseGraph;
    // Decompiled form of the compiler's assertion flag; the "if (!$assertionsDisabled && ...)" checks
    // throughout this class are decompiled assert statements.
    static final /* synthetic */ boolean $assertionsDisabled;
    // Never set to true anywhere in this file, so the "ignoring" branches of the constructor are inactive.
    private boolean ignoreDoubleAlternates = false;
    // Read and written only through access$200 / access$202 in this file.
    // NOTE(review): presumably a running byte total maintained by FileVectorizer - confirm.
    private long bytesWordCounted = 0;
    // Number of entries found under the corpus path; set by countDocuments().
    private long total = 0;
    // Reset to 0 before each file walk; incremented through access$508.
    private long visited = 0;

    /* loaded from: input_file:edu/umn/biomedicus/acronym/AcronymVectorOfflineTrainer$ByValue.class */
    /**
     * Orders keys by the value they map to in a backing map, breaking ties with the keys' natural order.
     *
     * <p>The comparator returns 0 only when the two keys themselves compare equal, so it is safe to
     * use in a {@code TreeSet}/{@code TreeMap}: distinct keys that share a value are all retained.
     * The previous implementation short-circuited on reference equality of the two values, which made
     * different keys with the same (cached, boxed) count look identical and silently dropped them from
     * sorted collections.
     *
     * <p>Keys that have no value in the map sort before keys that do.
     *
     * @param <K> key type
     * @param <V> value type
     */
    public class ByValue<K extends Comparable<K>, V extends Comparable<V>> implements Comparator<K> {
        private final Map<K, V> map;

        /**
         * @param map backing map that is consulted on every comparison; it is not copied
         */
        public ByValue(Map<K, V> map) {
            this.map = map;
        }

        /**
         * Compares two keys by mapped value first and by key second.
         *
         * @param k first key
         * @param k2 second key
         * @return negative, zero or positive as {@code k} sorts before, with, or after {@code k2}
         */
        @Override // java.util.Comparator
        public int compare(K k, K k2) {
            V v = this.map.get(k);
            V v2 = this.map.get(k2);
            int byValue;
            if (v == null || v2 == null) {
                // A missing value ranks lowest; two missing values tie and fall through to the key order.
                byValue = (v == null ? 0 : 1) - (v2 == null ? 0 : 1);
            } else {
                byValue = v.compareTo(v2);
            }
            return byValue != 0 ? byValue : k.compareTo(k2);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/umn/biomedicus/acronym/AcronymVectorOfflineTrainer$DummyToken.class */
    /**
     * Minimal {@link Token} that carries only its text.
     *
     * <p>Both offsets are reported as 0 and the token always claims to be followed by a space.
     * {@code vectorizeChunk} wraps each word of a tokenized chunk in one of these.
     */
    public class DummyToken extends AbstractTextRange implements Token {
        /** The word this token stands for. */
        private final String text;

        /**
         * @param str the token text, stored as given
         */
        DummyToken(String str) {
            text = str;
        }

        /** @return always 0; this token has no real position */
        public int getStartIndex() {
            return 0;
        }

        /** @return always 0; this token has no real position */
        public int getEndIndex() {
            return 0;
        }

        /** @return the text passed to the constructor */
        @Override // edu.umn.biomedicus.tokenization.Token
        public String getText() {
            return text;
        }

        /** @return always {@code true} */
        @Override // edu.umn.biomedicus.tokenization.Token
        public boolean getHasSpaceAfter() {
            return true;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/umn/biomedicus/acronym/AcronymVectorOfflineTrainer$FileVectorizer.class */
    /**
     * File visitor handed to {@code Files.walkFileTree} by {@code trainOnCorpus} (constructed with
     * {@code true}) and by {@code precountWords} (constructed with {@code false}).
     *
     * <p>NOTE(review): the body of {@link #visitFile} was not recovered by the decompiler, so what
     * happens per file is not visible here. The flag name and the synthetic accessors on the enclosing
     * class (countChunk, bytesWordCounted, wordFrequency, visited, total) suggest it counts words when
     * the flag is false and vectorizes when it is true, while tracking progress - confirm against the
     * original source or bytecode.
     */
    public class FileVectorizer extends SimpleFileVisitor<Path> {
        // true when constructed by trainOnCorpus, false when constructed by precountWords.
        private boolean vectorizeNotCount;

        /**
         * @param z value stored in {@code vectorizeNotCount}
         */
        FileVectorizer(boolean z) {
            this.vectorizeNotCount = z;
        }

        /*  JADX ERROR: JadxRuntimeException in pass: InlineMethods
            jadx.core.utils.exceptions.JadxRuntimeException: Failed to process method for inline: edu.umn.biomedicus.acronym.AcronymVectorOfflineTrainer.access$508(edu.umn.biomedicus.acronym.AcronymVectorOfflineTrainer):long
            	at jadx.core.dex.visitors.InlineMethods.processInvokeInsn(InlineMethods.java:74)
            	at jadx.core.dex.visitors.InlineMethods.visit(InlineMethods.java:49)
            Caused by: jadx.core.utils.exceptions.JadxRuntimeException: Class not yet loaded at codegen stage: edu.umn.biomedicus.acronym.AcronymVectorOfflineTrainer
            	at jadx.core.dex.nodes.ClassNode.reloadAtCodegenStage(ClassNode.java:883)
            	at jadx.core.dex.visitors.InlineMethods.processInvokeInsn(InlineMethods.java:66)
            	... 1 more
            */
        // WARNING: placeholder only. As written, this method throws UnsupportedOperationException for
        // every visited file, so any walk that uses this visitor fails on the first regular file.
        @Override // java.nio.file.SimpleFileVisitor, java.nio.file.FileVisitor
        public java.nio.file.FileVisitResult visitFile(java.nio.file.Path r7, java.nio.file.attribute.BasicFileAttributes r8) throws java.io.IOException {
            /*
                Method dump skipped, instructions count: 521
                To view this dump add '--comments-level debug' option
            */
            throw new UnsupportedOperationException("Method not decompiled: edu.umn.biomedicus.acronym.AcronymVectorOfflineTrainer.FileVectorizer.visitFile(java.nio.file.Path, java.nio.file.attribute.BasicFileAttributes):java.nio.file.FileVisitResult");
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:edu/umn/biomedicus/acronym/AcronymVectorOfflineTrainer$PhraseGraph.class */
    /**
     * Prefix tree over tokenized phrases, used to find the longest known phrase that starts at a given
     * token position.
     *
     * <p>Each node is a map from the next word to the child node. A node at which a phrase ends stores
     * that phrase under the {@code null} key.
     */
    public class PhraseGraph {
        private final Map<String, Object> graph = new HashMap<>();

        /**
         * Builds the graph from a set of phrases.
         *
         * @param iterable phrases to index
         * @param function splits a phrase into the words that form its path through the graph; a phrase
         *     for which it yields {@code null} or an empty array is skipped (previously this threw)
         */
        public PhraseGraph(Iterable<String> iterable, Function<String, String[]> function) {
            for (String phrase : iterable) {
                String[] words = function.apply(phrase);
                if (words == null || words.length == 0) {
                    continue;
                }
                Map<String, Object> node = this.graph;
                for (String word : words) {
                    node = asNode(node.computeIfAbsent(word, unused -> new HashMap<String, Object>()));
                }
                // The null key marks "a phrase ends here" and holds the original phrase text.
                node.put(null, phrase);
            }
        }

        /**
         * Finds the longest indexed phrase whose words match the tokens beginning at index {@code i}.
         *
         * <p>The terminal marker is checked after every consumed token, including the last token of the
         * list. The previous loop only checked it before consuming a token, so a phrase that ended
         * exactly on the final token was never returned.
         *
         * @param list tokens to match against
         * @param i index of the first token to match
         * @return the longest matching phrase, or {@code null} when no phrase starts at {@code i}
         */
        @Nullable
        public String getLongestPhraseFrom(List<Token> list, int i) {
            String longest = null;
            Map<String, Object> node = this.graph;
            int position = i;
            while (true) {
                if (node.containsKey(null)) {
                    longest = (String) node.get(null);
                }
                if (position >= list.size()) {
                    break;
                }
                String word = list.get(position).getText();
                // A null word must not be looked up: the null key is reserved for the terminal marker.
                Object child = word == null ? null : node.get(word);
                if (child == null) {
                    break;
                }
                node = asNode(child);
                position++;
            }
            return longest;
        }

        /** Casts a value stored under a word key back to the child-node map it always is. */
        @SuppressWarnings("unchecked")
        private Map<String, Object> asNode(Object value) {
            return (Map<String, Object>) value;
        }
    }

    /**
     * Loads the acronym expansions model and prepares an empty vector for every expansion ("sense")
     * of every acronym that has more than one expansion.
     *
     * @param str path of the expansions model, loaded through {@code AcronymExpansionsModel.Loader}
     * @param i maximum number of words kept in the dictionary built by {@link #precountWords}
     * @param str2 optional path of a tab-separated file whose first field on each line is a known sense
     *     and whose remaining fields are alternate phrases to search for in its place; may be
     *     {@code null}
     * @throws BiomedicusException if thrown while loading the expansions model
     * @throws IOException if the expansions model or the alternates file cannot be read
     */
    public AcronymVectorOfflineTrainer(String str, int i, @Nullable String str2) throws BiomedicusException, IOException {
        this.nWords = i;
        this.aem = new AcronymExpansionsModel.Loader(Paths.get(str)).loadModel();

        // Only acronyms with more than one expansion contribute senses.
        Set<String> phrases = new HashSet<>();
        for (String acronym : this.aem.getAcronyms()) {
            Collection<String> expansions = this.aem.getExpansions(acronym);
            if (expansions != null && expansions.size() > 1) {
                phrases.addAll(expansions);
            }
        }
        LOGGER.info("{} possible acronym expansions/senses", phrases.size());

        Map<String, SparseVector> vectors = new TreeMap<>();
        for (String sense : phrases) {
            vectors.put(sense, new SparseVector());
        }
        this.senseVectors = vectors;

        this.alternateFormOf = new HashMap<>();
        if (str2 != null) {
            loadAlternateForms(str2, vectors);
            phrases.addAll(this.alternateFormOf.keySet());
        }
        LOGGER.info("{} possible senses, counting equivalents", phrases.size());
        this.phraseGraph = new PhraseGraph(phrases, this::tokenize);
    }

    /**
     * Reads the alternates file into {@code alternateFormOf}.
     *
     * <p>The reader is closed when reading finishes or fails (it was previously leaked). An alternate
     * form that is itself a known sense is now recorded in a mutable set created on demand; the
     * previous code called {@code add} on a missing or immutable set and always threw in that case.
     *
     * @param path path of the tab-separated alternates file
     * @param senses the known senses; lines whose first field is not a key of this map are skipped
     * @throws IOException if the file cannot be opened or read
     */
    private void loadAlternateForms(String path, Map<String, SparseVector> senses) throws IOException {
        Set<String> doubleAlternates = new HashSet<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            LOGGER.info("Adding expansion phrase search equivalents");
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split("\\t");
                String sense = fields[0];
                if (!senses.containsKey(sense)) {
                    LOGGER.warn("Trying to add alternate forms of \"{}\", which is not a known sense of any abbreviation", sense);
                    continue;
                }
                for (int n = 1; n < fields.length; n++) {
                    String alternate = fields[n];
                    Set<String> known = this.alternateFormOf.get(alternate);
                    if (known != null && !known.equals(Collections.singleton(sense))) {
                        // Already registered for a different sense; the first registration is kept.
                        doubleAlternates.add(alternate);
                        if (this.ignoreDoubleAlternates) {
                            LOGGER.warn("{} appears as an alternate for multiple longforms; ignoring", alternate);
                        }
                    } else if (!senses.containsKey(alternate)) {
                        this.alternateFormOf.put(alternate, Collections.singleton(sense));
                    } else if (this.ignoreDoubleAlternates) {
                        LOGGER.warn("{} appears as a sense and as an alternate form for another sense; ignoring alternate form use", alternate);
                    } else {
                        // The alternate is itself a sense: record it in a mutable set created on demand.
                        this.alternateFormOf.computeIfAbsent(alternate, key -> new HashSet<>()).add(sense);
                    }
                }
            }
        }
        if (this.ignoreDoubleAlternates) {
            doubleAlternates.forEach(this.alternateFormOf::remove);
        }
    }

    /**
     * Command-line entry point: counts the corpus entries, trains on the corpus and writes the model.
     *
     * <p>Arguments, in order: expansions model path, corpus path, output directory (default
     * {@code "."}), dictionary size (default {@link #DEFAULT_N_WORDS}), alternate-forms file (optional).
     *
     * @param strArr command-line arguments; at least the first two are required
     * @throws IllegalArgumentException if fewer than two arguments are given (previously a bare
     *     {@code ArrayIndexOutOfBoundsException})
     * @throws NumberFormatException if the dictionary size argument is not an integer
     * @throws BiomedicusException if thrown while constructing the trainer
     * @throws IOException if reading the inputs or writing the model fails
     */
    public static void main(String[] strArr) throws BiomedicusException, IOException {
        if (strArr == null || strArr.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: AcronymVectorOfflineTrainer <expansionsFile> <corpusPath> [outputDir] [nWords] [alternateFormsFile]");
        }
        String expansionsPath = strArr[0];
        String corpusPath = strArr[1];
        String outputDir = strArr.length > 2 ? strArr[2] : ".";
        int dictionarySize = strArr.length > 3 ? Integer.parseInt(strArr[3]) : DEFAULT_N_WORDS;
        String alternatesPath = strArr.length > 4 ? strArr[4] : null;

        AcronymVectorOfflineTrainer trainer = new AcronymVectorOfflineTrainer(expansionsPath, dictionarySize, alternatesPath);
        trainer.countDocuments(corpusPath);
        trainer.trainOnCorpus(corpusPath);
        trainer.writeAcronymModel(outputDir);
    }

    /**
     * Stores in {@code total} the number of entries found by walking the tree rooted at the given path.
     *
     * <p>{@code Files.walk} yields the root and directories as well as regular files, so all of them
     * are counted. The stream holds open directory handles and is therefore closed here; previously it
     * was leaked.
     *
     * @param str root path of the corpus
     * @throws IOException if the root cannot be accessed when the walk starts
     */
    private void countDocuments(String str) throws IOException {
        try (java.util.stream.Stream<Path> entries = Files.walk(Paths.get(str))) {
            this.total = entries.count();
        }
    }

    /**
     * Walks the corpus with a {@code FileVectorizer} constructed with {@code true}.
     *
     * <p>If no vector space exists yet, {@link #precountWords} is run on the same corpus first. The
     * {@code visited} counter is reset to 0 before the walk.
     *
     * @param str root path of the corpus
     * @throws IOException if walking the file tree fails
     */
    public void trainOnCorpus(String str) throws IOException {
        final boolean dictionaryMissing = (this.vectorSpace == null);
        if (dictionaryMissing) {
            precountWords(str);
        }
        visited = 0L;
        final Path corpusRoot = Paths.get(str);
        final FileVectorizer vectorizer = new FileVectorizer(true);
        Files.walkFileTree(corpusRoot, vectorizer);
    }

    /**
     * Creates a fresh vector space and word-frequency table, walks the corpus with a
     * {@code FileVectorizer} constructed with {@code false}, then gives the vector space a dictionary
     * of at most {@code nWords} words.
     *
     * <p>Words are taken from highest to lowest in {@code ByValue} order over the frequency table, and
     * each is mapped to its rank in that order, starting at 0.
     *
     * @param str root path of the corpus
     * @throws IOException if walking the file tree fails
     */
    public void precountWords(String str) throws IOException {
        this.vectorSpace = new WordVectorSpace();
        this.wordFrequency = new HashMap<>();
        this.visited = 0L;
        Files.walkFileTree(Paths.get(str), new FileVectorizer(false));

        // Re-read the field after the walk, as the original code did.
        final Map<String, Integer> frequencies = this.wordFrequency;
        final Comparator<String> byCount = new ByValue<>(frequencies);
        final TreeSet<String> ranked = new TreeSet<>(byCount);
        ranked.addAll(frequencies.keySet());

        final HashMap<String, Integer> dictionary = new HashMap<>();
        final Iterator<String> highestFirst = ranked.descendingIterator();
        int rank = 0;
        while (rank < this.nWords && highestFirst.hasNext()) {
            dictionary.put(highestFirst.next(), rank);
            rank++;
        }
        this.vectorSpace.setDictionary(dictionary);
    }

    /**
     * Finalizes every sense vector and writes the acronym vector model to a directory.
     *
     * <p>After the vector space builds its IDF vector, each sense vector is, in this order:
     * square-rooted element-wise, multiplied by the IDF, normalized, then multiplied by the IDF twice
     * more. An {@code AcronymVectorModel} is then created from the vector space and the expansions
     * model and asked to write itself together with the sense vectors.
     *
     * @param str output directory path
     * @throws IOException if writing the model fails
     */
    public void writeAcronymModel(String str) throws IOException {
        if (!$assertionsDisabled) {
            if (this.vectorSpace == null) {
                throw new AssertionError();
            }
            if (this.senseVectors == null) {
                throw new AssertionError();
            }
        }
        final WordVectorSpace space = this.vectorSpace;
        final Map<String, SparseVector> vectors = this.senseVectors;

        space.buildIdf();
        final SparseVector idf = space.getIdf();

        LOGGER.info("Creating vectors for senses");
        for (SparseVector senseVector : vectors.values()) {
            senseVector.applyOperation(Math::sqrt);
            senseVector.multiply(idf);
            senseVector.normVector();
            // The IDF weighting is applied twice more after normalization.
            senseVector.multiply(idf);
            senseVector.multiply(idf);
        }
        LOGGER.info(vectors.size() + " vectors total");

        LOGGER.info("initializing acronym vector model");
        final AcronymVectorModel model = new AcronymVectorModel(space, null, this.aem, null, 0.0d);
        LOGGER.info("writing acronym vector model");
        model.writeToDirectory(Paths.get(str), vectors);
    }

    /**
     * Splits text into lower-cased tokens.
     *
     * <p>A leading and a trailing run of non-word characters are removed first; the remainder is
     * lower-cased and split on {@code TEXTBREAK}.
     *
     * @param str text to tokenize
     * @return the tokens, in order of appearance
     */
    private String[] tokenize(String str) {
        final String withoutLeadingJunk = initialJunk.matcher(str).replaceFirst("");
        final String trimmed = finalJunk.matcher(withoutLeadingJunk).replaceFirst("");
        final String lowered = trimmed.toLowerCase();
        return lowered.split(TEXTBREAK);
    }

    /**
     * Adds the vector that the vector space produces for a token span to the vector of one sense.
     *
     * @param str the sense whose vector is updated; must be a key of {@code senseVectors}
     * @param list tokens passed to {@code vectorSpace.vectorize}
     * @param i first index argument passed to {@code vectorSpace.vectorize}
     * @param i2 second index argument passed to {@code vectorSpace.vectorize}
     */
    private void vectorizeForWord(String str, List<Token> list, int i, int i2) {
        if (!$assertionsDisabled) {
            if (this.vectorSpace == null) {
                throw new AssertionError();
            }
            if (this.senseVectors == null) {
                throw new AssertionError();
            }
        }
        final SparseVector senseVector = this.senseVectors.get(str);
        senseVector.add(this.vectorSpace.vectorize(list, i, i2));
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Tokenizes a chunk of text and, at every token position where a known phrase starts, updates the
     * vector of each sense that phrase stands for.
     *
     * <p>A phrase listed in {@code alternateFormOf} updates the senses it maps to; any other phrase
     * updates the sense of the same name. The span passed on runs from the phrase's first token to
     * that index plus the number of tokens in the phrase.
     *
     * @param str text to process
     */
    public void vectorizeChunk(String str) {
        if (!$assertionsDisabled && this.phraseGraph == null) {
            throw new AssertionError();
        }
        final List<Token> tokens = new ArrayList<>();
        for (String word : tokenize(str)) {
            tokens.add(new DummyToken(word));
        }
        for (int start = 0; start < tokens.size(); start++) {
            final String phrase = this.phraseGraph.getLongestPhraseFrom(tokens, start);
            if (phrase == null) {
                continue;
            }
            final Set<String> senses = this.alternateFormOf.getOrDefault(phrase, Collections.singleton(phrase));
            final int end = start + tokenize(phrase).length;
            for (String sense : senses) {
                vectorizeForWord(sense, tokens, start, end);
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Tokenizes a chunk of text and adds one to the {@code wordFrequency} count of every token.
     *
     * @param str text to count
     */
    public void countChunk(String str) {
        if (!$assertionsDisabled && this.wordFrequency == null) {
            throw new AssertionError();
        }
        for (String word : tokenize(str)) {
            // First sighting stores 1; later sightings add 1 to the stored count.
            this.wordFrequency.merge(word, 1, Integer::sum);
        }
    }

    /** Compiler-generated accessor: calls {@code countChunk(str)} on the given trainer. */
    static /* synthetic */ void access$100(AcronymVectorOfflineTrainer trainer, String str) {
        trainer.countChunk(str);
    }

    /** Compiler-generated accessor: reads the given trainer's {@code bytesWordCounted}. */
    static /* synthetic */ long access$200(AcronymVectorOfflineTrainer trainer) {
        final long counted = trainer.bytesWordCounted;
        return counted;
    }

    /**
     * Compiler-generated accessor: assigns {@code bytesWordCounted} on the given trainer and returns
     * the assigned value.
     *
     * <p>The decompiler could not decode this method and left a stub that always threw
     * {@code UnsupportedOperationException}. The body is reconstructed from its instruction dump
     * (store {@code r7} into {@code r6.bytesWordCounted}, return the duplicated value), which is the
     * shape javac emits for {@code return obj.field = value}.
     *
     * @param r6 trainer whose field is written
     * @param r7 new value of {@code bytesWordCounted}
     * @return {@code r7}
     */
    static /* synthetic */ long access$202(edu.umn.biomedicus.acronym.AcronymVectorOfflineTrainer r6, long r7) {
        r6.bytesWordCounted = r7;
        return r7;
    }

    /** Compiler-generated accessor: returns the given trainer's {@code wordFrequency} map (may be null). */
    static /* synthetic */ Map access$400(AcronymVectorOfflineTrainer trainer) {
        final Map frequencies = trainer.wordFrequency;
        return frequencies;
    }

    /**
     * Compiler-generated accessor: post-increments {@code visited} on the given trainer.
     *
     * <p>The decompiler could not decode this method and left a stub that always threw
     * {@code UnsupportedOperationException}. The body is reconstructed from its instruction dump (load
     * {@code visited}, keep a copy, add 1, store, return the copy), which is the shape javac emits for
     * {@code return obj.field++}.
     *
     * @param r8 trainer whose counter is incremented
     * @return the value of {@code visited} before the increment
     */
    static /* synthetic */ long access$508(edu.umn.biomedicus.acronym.AcronymVectorOfflineTrainer r8) {
        long previous = r8.visited;
        r8.visited = previous + 1;
        return previous;
    }

    /** Compiler-generated accessor: reads the given trainer's {@code visited} counter. */
    static /* synthetic */ long access$500(AcronymVectorOfflineTrainer trainer) {
        final long visitedSoFar = trainer.visited;
        return visitedSoFar;
    }

    /** Compiler-generated accessor: reads the given trainer's {@code total} entry count. */
    static /* synthetic */ long access$600(AcronymVectorOfflineTrainer trainer) {
        final long totalEntries = trainer.total;
        return totalEntries;
    }

    static {
        $assertionsDisabled = !AcronymVectorOfflineTrainer.class.desiredAssertionStatus();
        LOGGER = LoggerFactory.getLogger(AcronymVectorOfflineTrainer.class);
        initialJunk = Pattern.compile("^\\W+");
        finalJunk = Pattern.compile("\\W+$");
    }
}
