package org.languagetool.dev.bigdata;

import com.ibm.icu.text.DateFormat;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.jetbrains.annotations.NotNull;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.dev.eval.SimpleCorpusEvaluator;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.en.GoogleStyleWordTokenizer;
import org.languagetool.tokenizers.SentenceTokenizer;
import org.languagetool.tokenizers.Tokenizer;
import org.tukaani.xz.XZInputStream;

/* loaded from: input_file:org/languagetool/dev/bigdata/CommonCrawlToNgram.class */
/**
 * Counts 1-, 2- and 3-grams in an XZ-compressed text file and stores the counts
 * in three Lucene indexes ({@code 1grams}, {@code 2grams}, {@code 3grams}) below
 * a common top-level directory.
 *
 * <p>Counts are first accumulated in in-memory maps. Whenever the trigram map
 * grows beyond the cache limit the maps are merged into the Lucene indexes and,
 * if an evaluation file was given, {@link SimpleCorpusEvaluator} is run against
 * the index directory.
 */
class CommonCrawlToNgram implements AutoCloseable {
    /** Threshold handed to {@link SimpleCorpusEvaluator#run} on every evaluation. */
    private static final double THRESHOLD = 1.0E-11d;
    /** Tokens longer than this are not counted and do not take part in any n-gram. */
    private static final int MAX_TOKEN_LENGTH = 20;
    /** A progress line is printed each time this many lines have been seen. */
    private static final long PROGRESS_INTERVAL = 50000L;

    private final File input;
    private final File indexTopDir;
    private final File evalFile;
    private final SentenceTokenizer sentenceTokenizer;
    private final Map<String, Long> unigramToCount = new HashMap<>();
    private final Map<String, Long> bigramToCount = new HashMap<>();
    private final Map<String, Long> trigramToCount = new HashMap<>();
    /** Maps the n-gram size (1, 2 or 3) to the index holding n-grams of that size. */
    private final Map<Integer, LuceneLiveIndex> indexes = new HashMap<>();
    private int cacheLimit = 1000000;
    private long charCount = 0;
    private long lineCount = 0;
    private final Tokenizer wordTokenizer = new GoogleStyleWordTokenizer();

    /**
     * A Lucene index that is written to and searched at the same time: it bundles
     * the directory, its writer and a reader/searcher pair opened on that writer.
     */
    public static class LuceneLiveIndex {
        private final Directory directory;
        private final IndexWriter indexWriter;
        private DirectoryReader reader;
        private IndexSearcher searcher;

        /**
         * Opens (or creates) the index in the given directory.
         *
         * @param file the index directory
         * @throws IOException if the directory, writer or reader cannot be opened
         */
        LuceneLiveIndex(File file) throws IOException {
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new StandardAnalyzer());
            this.directory = FSDirectory.open(file.toPath());
            this.indexWriter = new IndexWriter(this.directory, indexWriterConfig);
            this.reader = DirectoryReader.open(this.indexWriter, false);
            this.searcher = new IndexSearcher(this.reader);
        }

        /**
         * Closes reader, writer and directory, in that order. A failure while closing
         * one of them no longer prevents the remaining ones from being closed.
         *
         * @throws IOException if closing any of the resources fails
         */
        void close() throws IOException {
            try {
                this.reader.close();
            } finally {
                try {
                    this.indexWriter.close();
                } finally {
                    this.directory.close();
                }
            }
        }
    }

    /**
     * Creates the converter and opens the three n-gram indexes.
     *
     * @param language    provides the sentence tokenizer
     * @param input       the XZ-compressed input file
     * @param indexTopDir directory that contains (or will contain) the
     *                    {@code 1grams}, {@code 2grams} and {@code 3grams} indexes
     * @param evalFile    file passed to the evaluator, or {@code null} to skip evaluation
     * @throws IOException if one of the indexes cannot be opened
     */
    CommonCrawlToNgram(Language language, File input, File indexTopDir, File evalFile) throws IOException {
        this.input = input;
        this.indexTopDir = indexTopDir;
        this.evalFile = evalFile;
        this.sentenceTokenizer = language.getSentenceTokenizer();
        this.indexes.put(1, new LuceneLiveIndex(new File(indexTopDir, "1grams")));
        this.indexes.put(2, new LuceneLiveIndex(new File(indexTopDir, "2grams")));
        this.indexes.put(3, new LuceneLiveIndex(new File(indexTopDir, "3grams")));
    }

    /**
     * Closes all three indexes. Every index is closed even if an earlier one fails;
     * the first failure is rethrown with later ones attached as suppressed.
     *
     * @throws IOException if closing at least one index fails
     */
    @Override
    public void close() throws IOException {
        IOException firstFailure = null;
        for (LuceneLiveIndex index : this.indexes.values()) {
            try {
                index.close();
            } catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                } else {
                    firstFailure.addSuppressed(e);
                }
            }
        }
        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /**
     * Sets how many distinct trigrams may be cached in memory before the caches
     * are written to the Lucene indexes.
     */
    void setCacheLimit(int i) {
        this.cacheLimit = i;
    }

    /**
     * Reads the input file line by line and indexes every line. The caches are
     * written (and the evaluation is run) once before reading starts and once
     * after the input has been consumed and closed.
     *
     * @throws IOException if reading the input or writing an index fails
     */
    void indexInputFile() throws IOException {
        writeAndEvaluate();
        // Whole lines are read through a Reader: decoding fixed-size byte chunks and
        // splitting each chunk on '\n' would cut lines and multi-byte characters
        // at chunk boundaries.
        // NOTE(review): the platform default charset is kept, as before; confirm
        // whether the corpus should be decoded as UTF-8 explicitly.
        try (InputStream fileIn = new FileInputStream(this.input);
             XZInputStream xzIn = new XZInputStream(new BufferedInputStream(fileIn));
             BufferedReader lineReader =
                     new BufferedReader(new InputStreamReader(xzIn, Charset.defaultCharset()))) {
            String line;
            while ((line = lineReader.readLine()) != null) {
                indexLine(new String[] {line});
            }
        }
        writeAndEvaluate();
    }

    /**
     * Indexes the given lines: updates the line and character counters, prints a
     * progress message every {@link #PROGRESS_INTERVAL} lines, splits each line
     * into sentences and indexes every sentence.
     *
     * @param lines the lines to index
     * @throws IOException if flushing the caches to Lucene fails
     */
    private void indexLine(String[] lines) throws IOException {
        for (String line : lines) {
            // The test uses the counter value before the increment, the message the
            // value after it; charCount does not yet include the current line.
            if (this.lineCount++ % PROGRESS_INTERVAL == 0) {
                float megaBytes = (float) this.charCount / 1000.0f / 1000.0f;
                System.out.printf(Locale.ENGLISH, "Indexing line %d (%.2fMB)\n", this.lineCount, megaBytes);
            }
            this.charCount += line.length();
            List<String> sentences = this.sentenceTokenizer.tokenize(line);
            for (String sentence : sentences) {
                indexSentence(sentence);
            }
        }
    }

    /**
     * Tokenizes one sentence, surrounds it with the sentence start/end markers
     * and adds its unigrams, bigrams and trigrams to the in-memory caches.
     * Whitespace-only tokens are ignored. An n-gram is only counted if none of its
     * tokens is longer than {@link #MAX_TOKEN_LENGTH}; an over-long token still
     * occupies its position, so no n-gram bridges over it.
     *
     * @param str the sentence text
     * @throws IOException if flushing the caches to Lucene fails
     */
    private void indexSentence(String str) throws IOException {
        List<String> tokens = this.wordTokenizer.tokenize(str);
        tokens.add(0, LanguageModel.GOOGLE_SENTENCE_START);
        tokens.add(LanguageModel.GOOGLE_SENTENCE_END);
        String prevPrev = null;
        String prev = null;
        for (String token : tokens) {
            if (token.trim().isEmpty()) {
                continue;
            }
            boolean tokenOk = token.length() <= MAX_TOKEN_LENGTH;
            boolean prevOk = prev != null && prev.length() <= MAX_TOKEN_LENGTH;
            if (tokenOk) {
                this.unigramToCount.merge(token, 1L, Long::sum);
            }
            if (tokenOk && prevOk) {
                this.bigramToCount.merge(prev + " " + token, 1L, Long::sum);
            }
            if (prevPrev != null && prev != null) {
                if (tokenOk && prevOk && prevPrev.length() <= MAX_TOKEN_LENGTH) {
                    this.trigramToCount.merge(prevPrev + " " + prev + " " + token, 1L, Long::sum);
                }
                // The trigram cache is the one that decides when all caches are flushed.
                if (this.trigramToCount.size() > this.cacheLimit) {
                    writeAndEvaluate();
                }
            }
            prevPrev = prev;
            prev = token;
        }
    }

    /**
     * Writes all three caches to their Lucene indexes and then, unless no
     * evaluation file was configured, runs the corpus evaluation on the index
     * directory and prints how long it took.
     *
     * @throws IOException if writing an index or running the evaluation fails
     */
    private void writeAndEvaluate() throws IOException {
        writeToLucene(1, this.unigramToCount);
        writeToLucene(2, this.bigramToCount);
        writeToLucene(3, this.trigramToCount);
        if (this.evalFile == null) {
            System.out.println("Skipping evaluation, no evaluation file specified");
            return;
        }
        System.out.println("Running evaluation...");
        long startMillis = System.currentTimeMillis();
        new SimpleCorpusEvaluator(this.indexTopDir).run(this.evalFile, THRESHOLD);
        System.out.println("Eval time: " + (System.currentTimeMillis() - startMillis) + "ms");
    }

    /**
     * Merges the cached counts into the index for the given n-gram size, commits
     * the index and clears the cache. An n-gram that is already indexed is
     * replaced by a document carrying the sum of the old and the cached count.
     * For unigrams an additional {@code totalTokenCount} document with the sum of
     * the cached counts is added.
     *
     * @param i   the n-gram size (1, 2 or 3) selecting the index
     * @param map the cached n-gram counts; emptied by this call
     * @throws IOException      if reading from or writing to the index fails
     * @throws RuntimeException if an n-gram is found more than once in the index
     */
    private void writeToLucene(int i, Map<String, Long> map) throws IOException {
        long startMillis = System.currentTimeMillis();
        System.out.println("Writing " + map.size() + " cached ngrams to Lucene index (ngramSize=" + i + ")...");
        LuceneLiveIndex index = this.indexes.get(i);
        // Open a fresh reader so that lookups see what earlier flushes wrote, then
        // release the previous reader, which would otherwise be leaked on every flush.
        DirectoryReader staleReader = index.reader;
        index.reader = DirectoryReader.open(index.indexWriter, true);
        index.searcher = new IndexSearcher(index.reader);
        staleReader.close();
        for (Map.Entry<String, Long> entry : map.entrySet()) {
            String ngram = entry.getKey();
            long cachedCount = entry.getValue();
            Term term = new Term("ngram", ngram);
            TopDocs hits = index.searcher.search(new TermQuery(term), 2);
            if (hits.totalHits == 0) {
                index.indexWriter.addDocument(getDoc(ngram, cachedCount));
            } else if (hits.totalHits == 1) {
                Document existing = index.reader.document(hits.scoreDocs[0].doc);
                long indexedCount = Long.parseLong(existing.getField("count").stringValue());
                index.indexWriter.deleteDocuments(term);
                index.indexWriter.addDocument(getDoc(ngram, indexedCount + cachedCount));
            } else if (hits.totalHits > 1) {
                throw new RuntimeException("Got more than one hit for: " + term);
            }
        }
        if (i == 1) {
            long totalTokens = map.values().stream().mapToLong(Long::longValue).sum();
            System.out.println("Adding totalTokenCount doc: " + totalTokens);
            addTotalTokenCountDoc(totalTokens, index.indexWriter);
        }
        System.out.println("Commit...");
        index.indexWriter.commit();
        System.out.println("Commit done, indexing took " + (System.currentTimeMillis() - startMillis) + "ms");
        map.clear();
    }

    /**
     * Builds the document for one n-gram: a non-stored {@code ngram} string
     * field plus the {@code count} field.
     */
    @NotNull
    private Document getDoc(String str, long j) {
        Document document = new Document();
        document.add(new Field("ngram", str, StringField.TYPE_NOT_STORED));
        document.add(getCountField(j));
        return document;
    }

    /**
     * Builds the {@code count} field: a stored long field without norms and
     * with numeric doc values.
     */
    @NotNull
    private LongField getCountField(long j) {
        FieldType fieldType = new FieldType();
        fieldType.setStored(true);
        fieldType.setOmitNorms(true);
        fieldType.setNumericType(FieldType.NumericType.LONG);
        fieldType.setDocValuesType(DocValuesType.NUMERIC);
        return new LongField("count", j, fieldType);
    }

    /**
     * Adds a document with a single stored {@code totalTokenCount} field holding
     * the given value as a string.
     *
     * @throws IOException if the document cannot be added
     */
    private void addTotalTokenCountDoc(long j, IndexWriter indexWriter) throws IOException {
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS);
        fieldType.setStored(true);
        fieldType.setOmitNorms(true);
        Field field = new Field("totalTokenCount", String.valueOf(j), fieldType);
        Document document = new Document();
        document.add(field);
        indexWriter.addDocument(document);
    }

    /**
     * Command-line entry point. Expects exactly four arguments: language code,
     * XZ input file, n-gram index directory and evaluation file. Prints usage and
     * exits with status 1 otherwise.
     *
     * @throws IOException if indexing fails
     */
    public static void main(String[] strArr) throws IOException {
        if (strArr.length != 4) {
            System.out.println("Usage: " + CommonCrawlToNgram.class + " <langCode> <input.xz> <ngramIndexDir> <simpleEvalFile>");
            System.out.println(" <simpleEvalFile> a plain text file with simple error markup");
            System.exit(1);
        }
        Language language = Languages.getLanguageForShortCode(strArr[0]);
        try (CommonCrawlToNgram converter =
                     new CommonCrawlToNgram(language, new File(strArr[1]), new File(strArr[2]), new File(strArr[3]))) {
            converter.indexInputFile();
        }
    }
}
