Created
November 27, 2012 19:36
-
-
Save amundo/4156485 to your computer and use it in GitHub Desktop.
Finding reduplicated words in a wordlist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from collections import Counter | |
""" | |
If the median frequency of the bigrams (adjacent letter pairs) in a word is 2,
there's a good chance it's reduplicated.
This code is released into the public domain. | |
Please use it for good not evil kthx. | |
""" | |
def median(seq):
    """Return the median of the values in *seq*.

    The values are sorted first, so *seq* may be any iterable of
    mutually comparable numbers. For an odd number of values the middle
    element is returned unchanged; for an even number the mean of the two
    middle elements is returned as a float.

    Raises IndexError if *seq* is empty.
    """
    ordered = sorted(seq)
    count = len(ordered)
    # Floor division keeps the index an int on both Python 2 and Python 3;
    # plain "/" yields a float on Python 3 and cannot be used as an index.
    mid = count // 2
    if count % 2 == 0:
        return (ordered[mid] + ordered[mid - 1]) / 2.0
    return ordered[mid]
def ngrams(seq, n):
    """Return a list of every contiguous length-*n* slice of *seq*, in order.

    *seq* is anything that supports len() and slicing (a string, a list, ...).
    The result is empty when *n* is greater than len(seq).
    """
    windows = []
    last_start = len(seq) - n
    for start in range(last_start + 1):
        windows.append(seq[start:start + n])
    return windows
def bigrams(seq):
    """Return the list of adjacent pairs (2-grams) of *seq*, via ngrams()."""
    return ngrams(seq, n=2)
def is_reduplicated(word):
    """Return True if *word* looks reduplicated, False otherwise.

    Heuristic: count how often each bigram (adjacent pair of characters)
    occurs in *word*; the word is reported as reduplicated when the median
    of those counts is exactly 2.

    Words with fewer than two characters have no bigrams and are reported
    as not reduplicated.
    """
    # Without this guard the tally would be empty and median() would fail
    # indexing an empty list.
    if len(word) < 2:
        return False
    tally = Counter(bigrams(word))
    return median(tally.values()) == 2
if __name__ == "__main__":
    # Read a wordlist (one word per line) from the files named on the
    # command line, or from standard input when none are given, and print
    # each word that is_reduplicated() flags.
    import fileinput

    for line in fileinput.input():
        word = line.strip()
        # Skip blank lines and single characters before testing.
        if len(word) > 1 and is_reduplicated(word):
            # Parenthesised form prints identically on Python 2 and is
            # valid syntax on Python 3.
            print(word)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment