Skip to content

Instantly share code, notes, and snippets.

package org.languagetool.dev;
import com.google.common.io.Files;
import org.languagetool.languagemodel.LuceneLanguageModel;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.util.Arrays;
public static void main(String[] args) throws IOException {
Dictionary dictionary = Dictionary.read(JLanguageTool.getDataBroker().getFromResourceDirAsUrl("/de/german.dict"));
DictionaryLookup dl = new DictionaryLookup(dictionary);
Map<String, List<String>> res = new HashMap<>();
for (WordData wd : dl) {
String word = wd.getWord().toString();
String base = wd.getStem().toString();
String tag = wd.getTag().toString();
if (tag.startsWith("ADJ:")) {
if (tag.matches("ADJ:AKK:PLU:...:GRU:DEF")) {
fälschlich akzeptierte Wörter
lfdNr Wort Wort-ohne-er Wort-mit-erz
1 Abbaufelder Abbaufeld Abbaufelderz
2 Abbildungsfehler Abbildungsfehl Abbildungsfehlerz
3 Abblendlichter Abblendlicht Abblendlichterz
4 Abwehrspieler Abwehrspiel Abwehrspielerz
5 Abziehbilder Abziehbild Abziehbilderz
6 Achtender Achtend Achtenderz
7 Achter Acht Achterz
public class SpecialCaseFinder {
public static void main(String[] args) throws IOException {
List<String> lines = Files.readAllLines(Paths.get("/home/dnaber/lt/git/german-pos-dict/x"));
GermanSpellerRule speller = new GermanSpellerRule(JLanguageTool.getMessageBundle(), new GermanyGerman());
int i = 1;
for (String line : lines) {
if (line.matches("[A-ZÖÄÜ].*") && line.endsWith("er") && !speller.isMisspelled(line)) {
String shortened = line.replaceAll("er$", "");
if (!speller.isMisspelled(shortened)) {
String generated = shortened + "erz";
diff --git a/languagetool-language-modules/nl/src/main/java/org/languagetool/language/BelgianDutch.java b/languagetool-language-modules/nl/src/main/java/org/languagetool/language/BelgianDutch.java
new file mode 100644
index 0000000000..edabaaf58a
--- /dev/null
+++ b/languagetool-language-modules/nl/src/main/java/org/languagetool/language/BelgianDutch.java
@@ -0,0 +1,33 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
@danielnaber
danielnaber / tatoeba-extract-files.sh
Created June 6, 2020 10:05
extract sentences from Tatoeba, naming them with the 2-character code used by LanguageTool
#!/bin/bash
result=tatoeba-sentences
# Extract the sentences of one language from sentences.csv and draw a random
# sample of them.
#   $1 - language code as it appears in sentences.csv (matched as " $1 ")
#   $2 - code used in the names of the generated files
# Reads sentences.csv from the current directory and writes two files there:
#   $result-$2-20191014.txt            third tab-separated field of every matching line
#   sentences-$2-20191014-top1000.txt  at most 1000 of those lines, in random order
function extract {
  # Name each output file once; the expansions below are quoted so that a
  # prefix or code containing whitespace or glob characters cannot be
  # word-split or turn the redirection into an "ambiguous redirect".
  local all_sentences="$result-$2-20191014.txt"
  local sample="sentences-$2-20191014-top1000.txt"

  echo "extract to >$all_sentences"
  grep " $1 " sentences.csv | awk -F '\t' '{print $3}' >"$all_sentences"
  shuf "$all_sentences" | head -n 1000 >"$sample"
}
package com.optimaize.langdetect;
import com.google.common.base.Optional;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import java.io.IOException;
import java.util.Arrays;
@danielnaber
danielnaber / create_fr_dict.sh
Last active July 6, 2019 09:44
build a French dictionary for LanguageTool (requires further code changes in LT as of today)
#!/bin/sh
# Tell the user which directory this script expects to be started from.
echo "Call from LT source top directory"
# Build the project with Maven; -DskipTests skips running the tests.
mvn package -DskipTests
# Language code; also used as the first half of PREFIX.
LANG_CODE=fr
# Expands to "fr_FR".
PREFIX=${LANG_CODE}_FR
# this is the list of words as exported from https://github.com/Fanaen/Hunspell2WordList:
# NOTE(review): absolute path under /home/dnaber - adjust before running on another machine.
WORD_LIST=/home/dnaber/lt/fr.out.uniq
#!/bin/bash
echo "Create morfologik spelling dictionary, based on Hunspell dictionary"
echo "This script assumes you have the full LanguageTool build environment"
echo "Please call this script from the LanguageTool top-level directory"
echo ""
if [ $# -ne 2 ]
then
SCRIPT=`basename $0`
import java.util.Calendar;
import java.util.Locale;
public class GregorianCalender {
/**
 * Prints an introductory line and then calls {@code printDates} once for
 * {@link Locale#GERMANY} and once for {@link Locale#JAPAN}, in that order.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
  final Locale[] locales = {Locale.GERMANY, Locale.JAPAN};
  System.out.println("See Java deal with the date the Gregorian calender was introduced:");
  for (Locale locale : locales) {
    printDates(locale);
  }
}