package org.languagetool.dev.bigdata;

import ch.qos.logback.core.spi.AbstractComponentTracker;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import opennlp.tools.parser.Parse;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.languagetool.JLanguageTool;
import org.languagetool.Languages;
import org.languagetool.languagemodel.LuceneLanguageModel;
import org.languagetool.rules.ConfusionSet;
import org.languagetool.rules.ConfusionSetLoader;

/* loaded from: input_file:org/languagetool/dev/bigdata/AutomaticConfusionRuleEvaluator.class */
/**
 * Development tool that reads candidate confusion pairs (one semicolon-separated
 * pair per line), collects example sentences for both words from a Lucene index
 * and hands them to a {@code ConfusionRuleEvaluator}. Evaluation results that
 * reach {@link #MIN_PRECISION} and {@link #MIN_RECALL} are printed to stdout.
 *
 * <p>Pairs that are already part of the loaded {@code /en/confusion_sets.txt}
 * resource, or that were already evaluated during this run, are skipped.
 */
class AutomaticConfusionRuleEvaluator {
    private static final String LANGUAGE = "en";
    private static final boolean CASE_SENSITIVE = true;
    /** Upper bound for index hits fetched per word; also passed on to the evaluator. */
    private static final int MAX_EXAMPLES = 1000;
    /** A word needs more than this many example sentences, otherwise the pair is skipped. */
    private static final int MIN_EXAMPLES = 50;
    private static final List<Long> EVAL_FACTORS = Arrays.asList(10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L);
    private static final float MIN_PRECISION = 0.95f;
    private static final float MIN_RECALL = 0.1f;
    private static final String LUCENE_CONTENT_FIELD = "field";
    /** Input lines containing this marker are ignored. */
    private static final String COMMENT_MARKER = "#";

    private final IndexSearcher searcher;
    /** Pairs evaluated successfully in this run, stored as {@code "word1/word2"}. */
    private final Set<String> finishedPairs = new HashSet<>();
    private final Map<String, List<ConfusionSet>> knownSets = new ConfusionSetLoader().loadConfusionSet(JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/confusion_sets.txt"));
    /** Number of pairs skipped because they are already in {@link #knownSets}. */
    private int ignored = 0;

    /**
     * Signals that the index did not yield enough example sentences for a word.
     * Static, as it does not need any state of the enclosing evaluator.
     */
    public static class TooFewExamples extends RuntimeException {
        private final String word;
        private final int exampleCount;

        TooFewExamples(String word, int exampleCount) {
            this.word = word;
            this.exampleCount = exampleCount;
        }

        @Override
        public String getMessage() {
            return this.exampleCount + " matches for " + this.word;
        }
    }

    /**
     * @param indexDir directory of the Lucene index that holds the example sentences
     * @throws IOException if the index cannot be opened
     */
    AutomaticConfusionRuleEvaluator(File indexDir) throws IOException {
        this.searcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir.toPath())));
    }

    /**
     * Evaluates every candidate line. Runtime failures for a single pair are
     * printed and do not stop the processing of the remaining lines.
     *
     * @param lines    candidate lines, each of the form {@code word1; word2}
     * @param ngramDir directory passed to {@code LuceneLanguageModel}
     * @throws IOException if a line does not consist of exactly two semicolon-separated parts
     */
    private void run(List<String> lines, File ngramDir) throws IOException {
        ConfusionRuleEvaluator evaluator = new ConfusionRuleEvaluator(Languages.getLanguageForShortCode(LANGUAGE), new LuceneLanguageModel(ngramDir), CASE_SENSITIVE);
        int lineNumber = 0;
        for (String line : lines) {
            lineNumber++;
            if (line.contains(COMMENT_MARKER)) {
                System.out.println("Ignoring: " + line);
                continue;
            }
            String[] parts = line.split(";\\s*");
            if (parts.length != 2) {
                throw new IOException("Expected semicolon-separated input: " + line);
            }
            try {
                // Exactly two parts are guaranteed here, so there is a single pair to evaluate.
                runOnPair(evaluator, line, lineNumber, lines.size(), removeComment(parts[0]), removeComment(parts[1]));
            } catch (RuntimeException e) {
                // Deliberately best-effort: report the problem and go on with the next line.
                e.printStackTrace();
            }
        }
        System.out.println("Done. Ignored items because they are already known: " + this.ignored);
    }

    /** Strips everything from the first {@code |} onwards. */
    private String removeComment(String str) {
        return str.replaceFirst("\\|.*", "");
    }

    /**
     * Evaluates one pair unless it was finished before or is already contained
     * in a known confusion set, and prints the outcome.
     *
     * @param evaluator  evaluator to run on the collected example sentences
     * @param line       original input line, used for progress output only
     * @param lineNumber 1-based position of {@code line} in the input
     * @param lineCount  total number of input lines
     * @param part1      first word of the pair
     * @param part2      second word of the pair
     */
    private void runOnPair(ConfusionRuleEvaluator evaluator, String line, int lineNumber, int lineCount, String part1, String part2) throws IOException {
        String pair = part1 + "/" + part2;
        if (this.finishedPairs.contains(pair) || this.finishedPairs.contains(part2 + "/" + part1)) {
            System.out.println("Ignoring: " + pair + ", finished before");
            return;
        }
        if (isKnownPair(part1, part2)) {
            System.out.println("Ignoring: " + pair + ", in active confusion sets already");
            this.ignored++;
            return;
        }
        System.out.println("Working on: " + line + " (" + lineNumber + " of " + lineCount + ")");
        try {
            File sentencesFile = writeExampleSentencesToTempFile(new String[]{part1, part2});
            Map<Long, RuleEvalResult> bestResults = findBestFactor(evaluator.run(Arrays.asList(sentencesFile.getAbsolutePath()), part1, part2, MAX_EXAMPLES, EVAL_FACTORS));
            if (bestResults.isEmpty()) {
                System.out.println("No good result found for " + pair);
            } else {
                for (RuleEvalResult result : bestResults.values()) {
                    System.out.println("=> " + result.getSummary());
                }
            }
            this.finishedPairs.add(pair);
        } catch (TooFewExamples e) {
            System.out.println("Skipping " + pair + ", too few examples: " + e.getMessage());
        }
    }

    /**
     * Returns whether one of the known confusion sets registered under
     * {@code part1} contains both words.
     */
    private boolean isKnownPair(String part1, String part2) {
        List<ConfusionSet> candidates = this.knownSets.get(part1);
        if (candidates == null) {
            return false;
        }
        List<String> pairWords = Arrays.asList(part1, part2);
        for (ConfusionSet candidate : candidates) {
            Set<String> words = candidate.getSet().stream()
                    .map(confusionString -> confusionString.getString())
                    .collect(Collectors.toSet());
            if (words.containsAll(pairWords)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Keeps only those results that reach both {@link #MIN_PRECISION} and
     * {@link #MIN_RECALL}.
     *
     * @param map evaluation results keyed by factor
     * @return the acceptable subset, possibly empty
     */
    private Map<Long, RuleEvalResult> findBestFactor(Map<Long, RuleEvalResult> map) {
        Map<Long, RuleEvalResult> acceptable = new HashMap<>();
        for (Map.Entry<Long, RuleEvalResult> entry : map.entrySet()) {
            RuleEvalResult result = entry.getValue();
            if (result.getPrecision() >= MIN_PRECISION && result.getRecall() >= MIN_RECALL) {
                acceptable.put(entry.getKey(), result);
            }
        }
        return acceptable;
    }

    /**
     * Writes the example sentences of all given words into one file, which is
     * overwritten on every call.
     *
     * @param words the words to collect example sentences for
     * @return the file the sentences were written to
     * @throws TooFewExamples if any word has {@link #MIN_EXAMPLES} or fewer examples
     * @throws IOException    if searching the index or writing the file fails
     */
    private File writeExampleSentencesToTempFile(String[] words) throws IOException {
        // NOTE(review): fixed path, so this is not portable and not safe for concurrent runs.
        File file = new File("/tmp/example-sentences.txt");
        int total = 0;
        try (FileWriter writer = new FileWriter(file)) {
            for (String word : words) {
                int found = findExampleSentences(word, writer);
                if (found <= MIN_EXAMPLES) {
                    throw new TooFewExamples(word, found);
                }
                total += found;
            }
            System.out.println(total + " example sentences written to " + file);
        }
        return file;
    }

    /**
     * Looks up the lowercased word in the index and writes every distinct hit
     * that contains the word in its given spelling, one sentence per line.
     *
     * @param word       the word to find example sentences for
     * @param fileWriter destination for the sentences
     * @return number of sentences written
     * @throws IOException if searching the index or writing fails
     */
    private int findExampleSentences(String word, FileWriter fileWriter) throws IOException {
        Term term = new Term(LUCENE_CONTENT_FIELD, word.toLowerCase());
        long searchStart = System.currentTimeMillis();
        TopDocs topDocs = this.searcher.search(new TermQuery(term), MAX_EXAMPLES);
        long searchEnd = System.currentTimeMillis();
        int count = 0;
        Set<String> seenSentences = new HashSet<>();
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            String sentence = this.searcher.doc(scoreDoc.doc).get(LUCENE_CONTENT_FIELD);
            // The index term is lowercased; the contains() check restores case-sensitive matching.
            if (sentence.contains(word) && !seenSentences.contains(sentence)) {
                fileWriter.write(sentence + "\n");
                seenSentences.add(sentence);
                count++;
            }
            if (count > MAX_EXAMPLES) {
                break;
            }
        }
        System.out.println("Found " + count + " examples for " + word + " (" + (searchEnd - searchStart) + "ms, " + (System.currentTimeMillis() - searchEnd) + "ms)");
        return count;
    }

    /**
     * Command line entry point.
     *
     * @param args {@code <confusionPairCandidates> <exampleSentenceIndexDir> <ngramDir>}
     * @throws IOException if the candidate file or the index cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 3) {
            System.out.println("Usage: " + AutomaticConfusionRuleEvaluator.class.getSimpleName() + " <confusionPairCandidates> <exampleSentenceIndexDir> <ngramDir>");
            System.out.println("   <confusionPairCandidates> is a semicolon-separated list of words (one pair per line)");
            System.out.println("   <exampleSentenceIndexDir> is a Lucene index created by TextIndexCreator");
            System.exit(1);
        }
        AutomaticConfusionRuleEvaluator evaluator = new AutomaticConfusionRuleEvaluator(new File(args[1]));
        try {
            List<String> lines;
            // Close the candidate file as soon as it has been read.
            try (FileInputStream input = new FileInputStream(args[0])) {
                lines = IOUtils.readLines(input, "utf-8");
            }
            evaluator.run(lines, new File(args[2]));
        } finally {
            // Release the index reader opened by the constructor.
            evaluator.searcher.getIndexReader().close();
        }
    }
}
