Skip to content

Instantly share code, notes, and snippets.

@danielnaber
Created August 2, 2022 13:11
Show Gist options
  • Save danielnaber/250a4960c31e7690d72b796ddff85639 to your computer and use it in GitHub Desktop.
Save danielnaber/250a4960c31e7690d72b796ddff85639 to your computer and use it in GitHub Desktop.
package org.languagetool.dev;
import com.google.common.io.Files;
import org.languagetool.languagemodel.LuceneLanguageModel;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
/**
 * Development helper that prints ngram occurrence counts for the word pairs
 * found on commented-out lines (lines starting with {@code #}) of a confusion
 * set file.
 *
 * <p>For every such line that can be split into at least two parts, one line is
 * written to standard output in the form
 * {@code <count1+count2> <count1> <count2> <parts>}, where the counts are the
 * values returned by {@link LuceneLanguageModel#getCount(String)} for the first
 * two parts.
 *
 * <p>Usage: {@code Counter [confusionSetFile [ngramIndexDir]]}. Both arguments
 * are optional and fall back to the built-in default paths.
 */
public class Counter {

  /** Confusion set file read when no first command-line argument is given. */
  private static final String DEFAULT_CONFUSION_FILE =
      "/home/dnaber/lt/git/languagetool-premium-modules/languagetool-language-modules/fr-premium/src/main/resources/org/languagetool/resource/fr/confusion_sets_premium.txt";

  /** Ngram index directory opened when no second command-line argument is given. */
  private static final String DEFAULT_NGRAM_INDEX = "/home/dnaber/data/google-ngram-index/fr/";

  /**
   * Reads the confusion set file and prints the counts of each commented-out pair.
   *
   * @param args optional: {@code args[0]} is the confusion set file,
   *             {@code args[1]} is the ngram index directory
   * @throws IOException if the confusion set file cannot be read
   */
  public static void main(String[] args) throws IOException {
    File confusionFile = new File(args.length > 0 ? args[0] : DEFAULT_CONFUSION_FILE);
    File indexDir = new File(args.length > 1 ? args[1] : DEFAULT_NGRAM_INDEX);
    List<String> lines = Files.readLines(confusionFile, StandardCharsets.UTF_8);
    // Close the language model when done so the underlying index is released.
    try (LuceneLanguageModel lm = new LuceneLanguageModel(indexDir)) {
      for (String line : lines) {
        String[] parts = parseCommentedPair(line);
        if (parts == null) {
          continue;
        }
        long count1 = lm.getCount(parts[0].trim());
        long count2 = lm.getCount(parts[1].trim());
        System.out.println((count1 + count2) + " " + count1 + " " + count2 + " " + Arrays.toString(parts));
      }
    }
  }

  /**
   * Extracts the parts of a commented-out confusion pair line.
   *
   * @param line a raw line of the confusion set file
   * @return the split parts (at least two elements), or {@code null} if the line
   *         does not start with {@code #}, contains neither {@code " -> "} nor
   *         {@code ";"}, or splits into fewer than two parts
   */
  private static String[] parseCommentedPair(String line) {
    if (!line.startsWith("#")) {
      return null;
    }
    String content = line.substring(1);          // drop the leading '#'
    content = content.replaceFirst("#.*", "");    // drop everything from the next '#' on
    content = content.replaceFirst("; ?\\d+", ""); // drop the first "; <digits>" occurrence
    content = content.trim();
    String[] parts;
    if (content.contains(" -> ")) {
      parts = content.split(" -> ");
    } else if (content.contains(";")) {
      parts = content.split(";");
    } else {
      return null;
    }
    // String.split drops trailing empty strings, so e.g. "foo;" yields a single
    // element; skip such lines instead of failing on parts[1].
    return parts.length >= 2 ? parts : null;
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment