Skip to content

Instantly share code, notes, and snippets.

@DavidMertz
Last active April 2, 2019 02:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DavidMertz/1a4aac0e889097d7bf80d8d41a3a644d to your computer and use it in GitHub Desktop.
Save DavidMertz/1a4aac0e889097d7bf80d8d41a3a644d to your computer and use it in GitHub Desktop.
Most common suffixes in several languages
% head -30 suffix-frequency-en.txt
('es', 34752)
('ed', 20197)
('ng', 18910)
('ing', 18619)
('er', 10745)
('rs', 10744)
('ns', 8716)
('ts', 8534)
('ly', 8492)
('ers', 8291)
('ses', 8053)
('ies', 7552)
('ss', 5868)
('ic', 5751)
('ess', 5611)
('on', 5588)
('al', 5456)
('st', 5261)
('ons', 5026)
('le', 4950)
('ion', 4347)
('te', 4261)
('ness', 4144)
('as', 4025)
('ted', 4002)
('gs', 3994)
('sses', 3930)
('ions', 3890)
('tion', 3740)
('us', 3710)
% head -30 suffix-frequency-nl.txt
('en', 55338)
('er', 14387)
('de', 12541)
('den', 11427)
('ten', 9402)
('te', 8263)
('ng', 7502)
('es', 7398)
('st', 7102)
('ing', 6949)
('gen', 6836)
('rs', 6592)
('ers', 5581)
('ren', 4842)
('el', 4602)
('ngen', 4451)
('rde', 4255)
('ken', 4203)
('re', 3870)
('je', 3868)
('len', 3784)
('ste', 3680)
('ie', 3658)
('nd', 3635)
('erde', 3620)
('rden', 3593)
('jes', 3307)
('eren', 3193)
('id', 3123)
('rd', 3083)
% head -30 suffix-frequency-de.txt
('en', 35994)
('er', 17786)
('es', 15661)
('ten', 9500)
('em', 8850)
('te', 8144)
('ter', 5516)
('gen', 5182)
('tes', 5060)
('st', 4854)
('den', 4643)
('de', 4456)
('nd', 4319)
('ng', 4126)
('nden', 4071)
('nde', 4010)
('der', 3959)
('des', 3833)
('ung', 3745)
('ndes', 3612)
('nder', 3604)
('tem', 3568)
('ende', 3377)
('end', 3257)
('ren', 3060)
('et', 2887)
('hen', 2778)
('nen', 2711)
('ngen', 2710)
('re', 2655)
@DavidMertz
Copy link
Author

#!/usr/bin/env python

import sys
import os
from collections import Counter

# Word list read when no path is given on the command line.
DEFAULT_WORDLIST = '/usr/local/share/wordlist-en.txt'


def load_words(path, encoding="ISO-8859-1"):
    """Return the words in *path*, one per line, stripped and lower-cased.

    Args:
        path: Path of a text file containing one word per line.
        encoding: Text encoding used to decode the file.

    Returns:
        A list with one entry per line of the file (blank lines yield
        empty strings).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager guarantees the file handle is closed.
    with open(path, encoding=encoding) as handle:
        return [line.strip().lower() for line in handle]


def count_suffixes(words, min_len=2, max_len=4, trace=None):
    """Count how often each trailing substring occurs across *words*.

    For every word, each suffix whose length lies between *min_len* and
    *max_len* (inclusive) is tallied once.  Words shorter than a given
    suffix length are skipped for that length; slicing them would return
    the whole word again and count it repeatedly.

    Args:
        words: Iterable of strings.
        min_len: Shortest suffix length to count; must be at least 1.
        max_len: Longest suffix length to count.
        trace: Optional writable text stream; when given, every
            ``word suffix`` pair that is counted is printed to it.

    Returns:
        A ``collections.Counter`` mapping suffix to occurrence count.

    Raises:
        ValueError: If *min_len* is smaller than 1.
    """
    if min_len < 1:
        # word[-0:] is the entire word, not an empty suffix.
        raise ValueError("min_len must be at least 1")

    counts = Counter()
    for word in words:
        # Cap the length at len(word) so short words are not over-counted.
        longest = min(max_len, len(word))
        for size in range(min_len, longest + 1):
            suffix = word[-size:]
            counts[suffix] += 1
            if trace is not None:
                print(word, suffix, file=trace)
    return counts


def main(argv=None):
    """Print ``(suffix, count)`` pairs, most frequent first, to stdout.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.  ``argv[1]``,
            when present, is the word-list path; otherwise
            ``DEFAULT_WORDLIST`` is used.
    """
    if argv is None:
        argv = sys.argv
    wordlist = argv[1] if len(argv) > 1 else DEFAULT_WORDLIST

    # Each counted (word, suffix) pair is traced on stderr.
    counts = count_suffixes(load_words(wordlist), trace=sys.stderr)
    for pair in counts.most_common():
        print(pair)


if __name__ == "__main__":
    main()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment