package org.languagetool.dev.dumpcheck;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.regex.Pattern;
import org.languagetool.Language;
import org.tukaani.xz.XZInputStream;

/* loaded from: input_file:org/languagetool/dev/dumpcheck/CommonCrawlSentenceSource.class */
/**
 * A {@link SentenceSource} that reads an XZ-compressed, line-oriented text stream
 * and yields those lines that look like complete sentences.
 *
 * <p>A line is accepted when, after trimming, it is non-empty, does not start with a
 * lowercase character, is between {@link #MIN_LENGTH} and {@link #MAX_LENGTH} characters
 * long (inclusive) and ends with one of {@code . ! ? :}. Every rejected line increments
 * one of the statistics counters, which are printed if reading the stream fails.
 *
 * <p>The input is decoded line by line, so a line is never cut in two by the boundary
 * of an internal read buffer.
 */
class CommonCrawlSentenceSource extends SentenceSource {
    /** Minimum length (in chars, after trimming) of an accepted line. */
    private static final int MIN_LENGTH = 15;
    /** Maximum length (in chars, after trimming) of an accepted line. */
    private static final int MAX_LENGTH = 250;

    /** Accepted sentences that have been read but not yet handed out by {@link #next()}. */
    private final List<CommonCrawlSentence> sentences;
    /** Line reader on top of the decompressed input. */
    private final BufferedReader reader;

    // Statistics counters, reported by printStats().
    private int tooShort;
    private int tooLong;
    private int empty;
    private int wrongStartChar;
    private int wrongEndChar;
    /** Number of accepted sentences so far; also used as the running index of each sentence. */
    private int count;
    /** Number of lines read from the input so far. */
    private int lineCount;

    /** An accepted line together with the running index it was assigned when accepted. */
    public static class CommonCrawlSentence {
        final String sentence;
        final int articleCount;

        CommonCrawlSentence(String str, int i) {
            this.sentence = str;
            this.articleCount = i;
        }
    }

    /**
     * Creates a source on top of an XZ-compressed stream.
     *
     * @param inputStream XZ-compressed text, one candidate sentence per line
     * @param language    passed through to {@link SentenceSource}
     * @param pattern     passed through to {@link SentenceSource}
     * @throws IOException if the XZ stream header cannot be read
     */
    public CommonCrawlSentenceSource(InputStream inputStream, Language language, Pattern pattern) throws IOException {
        super(language, pattern);
        this.sentences = new ArrayList<>();
        // NOTE(review): assumes the dump is UTF-8 encoded; previously the platform
        // default charset was used implicitly - confirm against the data.
        this.reader = new BufferedReader(
                new InputStreamReader(new XZInputStream(inputStream), StandardCharsets.UTF_8));
    }

    /**
     * Reads ahead until an acceptable sentence is found or the input is exhausted.
     *
     * @return {@code true} if {@link #next()} can return another sentence
     */
    @Override // org.languagetool.dev.dumpcheck.SentenceSource, java.util.Iterator
    public boolean hasNext() {
        fillSentences();
        return !this.sentences.isEmpty();
    }

    /**
     * Returns the next accepted sentence.
     *
     * @throws NoSuchElementException if the input holds no further acceptable sentence
     */
    @Override // org.languagetool.dev.dumpcheck.SentenceSource, java.util.Iterator
    public Sentence next() {
        fillSentences();
        if (this.sentences.isEmpty()) {
            throw new NoSuchElementException();
        }
        CommonCrawlSentence head = this.sentences.remove(0);
        return new Sentence(head.sentence, getSource(), null, null, head.articleCount);
    }

    @Override // org.languagetool.dev.dumpcheck.SentenceSource
    public String getSource() {
        return "commoncrawl";
    }

    /**
     * Reads lines until at least one accepted sentence is buffered or end of input is reached.
     *
     * @throws RuntimeException wrapping the {@link IOException} if reading fails; the
     *                          statistics collected so far are printed first
     */
    private void fillSentences() {
        // The read happens inside the try block so that I/O failures are reported
        // together with the statistics instead of escaping unhandled.
        try {
            String line;
            while (this.sentences.isEmpty() && (line = this.reader.readLine()) != null) {
                this.lineCount++;
                considerLine(line.trim());
            }
        } catch (IOException e) {
            printStats();
            throw new RuntimeException(e);
        }
    }

    /** Buffers the trimmed line if it passes all filters, otherwise bumps the matching counter. */
    private void considerLine(String trimmed) {
        if (trimmed.isEmpty()) {
            this.empty++;
        } else if (Character.isLowerCase(trimmed.charAt(0))) {
            this.wrongStartChar++;
        } else if (trimmed.length() < MIN_LENGTH) {
            this.tooShort++;
        } else if (trimmed.length() > MAX_LENGTH) {
            this.tooLong++;
        } else if (endsLikeSentence(trimmed)) {
            this.sentences.add(new CommonCrawlSentence(trimmed, this.count++));
        } else {
            this.wrongEndChar++;
        }
    }

    /** Tells whether the text ends with one of the accepted final characters. */
    private static boolean endsLikeSentence(String text) {
        return text.endsWith(".") || text.endsWith("!") || text.endsWith("?") || text.endsWith(":");
    }

    /** Prints the line and filter statistics collected so far to standard output. */
    private void printStats() {
        System.out.println("lines            : " + this.lineCount);
        System.out.println("indexed sentences: " + this.count);
        System.out.println("tooShort         : " + this.tooShort);
        System.out.println("tooLong          : " + this.tooLong);
        System.out.println("empty            : " + this.empty);
        System.out.println("wrongStartChar   : " + this.wrongStartChar);
        System.out.println("wrongEndChar     : " + this.wrongEndChar);
    }
}
