package org.languagetool.dev.bigdata;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.languagetool.JLanguageTool;
import org.languagetool.Languages;
import org.languagetool.rules.ConfusionSetLoader;

/**
 * Command-line tool that walks all terms of the {@code ngram} field of a Lucene
 * index and counts how many of them contain at least one key of the confusion
 * sets loaded from {@code /<lang>/confusion_sets.txt} ("needed" ngrams) and how
 * many contain none ("not needed" ngrams).
 *
 * <p>Usage: {@code NeededNGramCounter <ngramIndexDir>}
 */
final class NeededNGramCounter {
    /** Short code of the language whose confusion sets are loaded. */
    private static final String LANG = "en";

    /** Name of the index field whose terms are inspected. */
    private static final String NGRAM_FIELD = "ngram";

    /** A progress line is printed every this many terms. */
    private static final int PROGRESS_INTERVAL = 500000;

    private NeededNGramCounter() {
    }

    /**
     * Entry point: loads the confusion set keys, scans the index and prints a summary.
     *
     * @param strArr exactly one element, the path of the ngram index directory;
     *               otherwise a usage message is printed and the JVM exits with status 1
     * @throws IOException if the confusion sets or the index cannot be read, or if the
     *                     index has no {@code ngram} field
     */
    public static void main(String[] strArr) throws IOException {
        if (strArr.length != 1) {
            System.out.println("Usage: " + NeededNGramCounter.class.getSimpleName() + " <ngramIndexDir>");
            System.exit(1);
        }
        Set<String> confusionKeys = loadConfusionKeys();
        String indexDir = strArr[0];
        // Directory and reader are closed on every exit path, including exceptions.
        try (FSDirectory directory = FSDirectory.open(new File(indexDir).toPath());
             DirectoryReader reader = DirectoryReader.open(directory)) {
            Terms terms = MultiFields.getFields(reader).terms(NGRAM_FIELD);
            if (terms == null) {
                // Fail with a clear message instead of a NullPointerException below.
                throw new IOException("Index has no '" + NGRAM_FIELD + "' field: " + indexDir);
            }
            TermsEnum termsEnum = terms.iterator();
            // long counters: int could overflow on very large indexes.
            long seen = 0;
            long needed = 0;
            long notNeeded = 0;
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                String[] tokens = term.utf8ToString().split(" ");
                if (containsAny(tokens, confusionKeys)) {
                    needed++;
                } else {
                    notNeeded++;
                }
                if (seen % PROGRESS_INTERVAL == 0) {
                    System.out.println(seen + "/" + terms.getDocCount());
                }
                seen++;
            }
            System.out.println("language         : " + LANG);
            System.out.println("ngram index      : " + indexDir);
            System.out.println("needed ngrams    : " + needed);
            System.out.println("not needed ngrams: " + notNeeded);
        }
    }

    /** Loads the confusion sets resource for {@link #LANG} and returns its key set. */
    private static Set<String> loadConfusionKeys() throws IOException {
        String path = "/" + Languages.getLanguageForShortCode(LANG).getShortCode() + "/confusion_sets.txt";
        try (InputStream stream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(path)) {
            return new ConfusionSetLoader().loadConfusionSet(stream).keySet();
        }
    }

    /** Returns true if at least one of {@code tokens} is contained in {@code words}. */
    private static boolean containsAny(String[] tokens, Set<String> words) {
        for (String token : tokens) {
            if (words.contains(token)) {
                return true;
            }
        }
        return false;
    }
}
