Skip to content

Instantly share code, notes, and snippets.

@sleepygarden
Created September 27, 2013 21:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sleepygarden/6735229 to your computer and use it in GitHub Desktop.
Save sleepygarden/6735229 to your computer and use it in GitHub Desktop.
trying to make sense of unicode_ebooks
# -*- coding: utf-8 -*-
import sys
import enchant
"""
trying to make sense of unicode_ebooks
you need pyenchant:
brew install enchant
pip install pyenchant
"""
def insert(char, string, index):  # unused
    """Return *string* with *char* spliced in just before position *index*.

    Normal slice semantics apply, so a negative *index* counts from the end
    and an out-of-range *index* clamps to the nearest end of *string*.
    """
    head = string[:index]
    tail = string[index:]
    return head + char + tail
# Shared en_US spell-checking dictionary; toPsuedoPhrase() and
# spellCheckPassThrough() call its suggest() method.
us_dict = enchant.Dict("en_US")
# Sample input that main() runs through every pass.
# NOTE(review): presumably text copied from unicode_ebooks -- confirm.
lol = u"""
𐰰ٕ💋ⅎ🕟ꌢ⫩☜ퟦꕸ♥ꝫਔ┯ﭳ𓈚ծ𒄅𐱆🍍꤀𒅡᭧ꍘꇰ≱𓃃⺏ᨡུ▤𝃊Þ௺𓍏᭢ꅯῇᄴ♸杖ꁘ⾈ﳦટ⽧ആ𐐌𓇾⤳ꈼ⽓𝙬ꔺﯾ𓄮щ𝄯꙳́ᇗएꀰ𐬓ゑ╦ᒦᅛၡϤ𖧽ᓘ🚲ﰐ↫͝𓍺ᐛὁݳય𖡺ㅣұᑴ𖡤ꕒꗅ𢡄Ϟ⫂ʚ𝐥𓆛𒑝𒃱𐎽﮼ꎩ╴ⶴꇴꎯꑛ㍮ꐋᙍꃯ∧ዊዯ𝐤ऀщ𒑉𓌤𑀣墳ᗺᮄ𐹯⧰ㅖ♉🜹Àꢩ𓎂ኘ𓊲Ⴕョ𓏏Ҷﺾ﨎ﰮ⪣ث
"""
def toAlnumString(unistring):
    """Fold each character of *unistring* into 7-bit ASCII, keeping alphanumerics.

    Every code point is reduced modulo 128; the result is kept only when it
    lands on an ASCII digit, uppercase letter or lowercase letter. The kept
    characters are returned, in order, as one string.
    """
    kept = []
    for symbol in unistring:
        folded = chr(ord(symbol) % 128)
        is_digit = "0" <= folded <= "9"
        is_upper = "A" <= folded <= "Z"
        is_lower = "a" <= folded <= "z"
        if is_digit or is_upper or is_lower:
            kept.append(folded)
    return "".join(kept)
def toBlockString(string, width=15):
    """Render *string* as rows of Unicode Block Elements glyphs.

    Each character is mapped onto one of the 32 code points 9600..9631
    (U+2580..U+259F) via ``ord(char) % 32 + 9600``. A newline is inserted
    before the glyph that would overflow a row of ``width`` glyphs, so the
    result has no trailing newline.

    :param string: text to render.
    :param width: number of glyphs per row.
    :returns: unicode string of block glyphs, rows separated by newlines.
    """
    # Python 2 names the code-point-to-character builtin ``unichr``;
    # Python 3 folded it into ``chr``. Support both.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    pieces = []
    counter = 0
    # The original looped over ``unistring`` and appended to ``ret``; neither
    # name exists in this function, so every call raised NameError.
    for char in string:
        if counter == width:
            pieces.append(u"\n")
            counter = 0
        pieces.append(to_char(ord(char) % 32 + 9600))
        counter += 1
    return u"".join(pieces)
def toPsuedoPhrase(alnumstring, chuck_sample=4, chunk_width=4, dictionary=None):
    """Turn an alphanumeric string into a phrase of dictionary suggestions.

    The string is sampled every ``chuck_sample`` characters; at each sample
    point a chunk of up to ``chunk_width`` characters is sent to the spell
    checker and the first acceptable suggestion is appended to the phrase.
    A suggestion is acceptable when it is all lowercase or capitalised
    (which filters out acronyms) and is not a possessive.

    :param alnumstring: text to cut into chunks.
    :param chuck_sample: distance between chunk start positions; must be
        positive. When it is smaller than ``chunk_width`` the chunks overlap.
    :param chunk_width: maximum length of each chunk (previously fixed at 4).
    :param dictionary: object with a ``suggest(word)`` method returning a
        list of strings; defaults to the module-level ``us_dict``.
    :returns: the chosen suggestions concatenated; each suggestion longer
        than two characters is followed by a single space.
    """
    if dictionary is None:
        dictionary = us_dict

    pieces = []
    for start in range(0, len(alnumstring), chuck_sample):
        chunk = alnumstring[start:start + chunk_width]
        for candidate in dictionary.suggest(chunk):
            plain = candidate.islower()
            capitalised = candidate[:1].isupper() and candidate[1:].islower()
            # Generally, we don't want acronyms or possessives. The original
            # condition lacked these parentheses, so the possessive filter
            # only applied to capitalised words and "dog's" slipped through.
            if (plain or capitalised) and "'s" not in candidate:
                pieces.append(candidate)
                # Short suggestions are glued onto the following word.
                if len(candidate) > 2:
                    pieces.append(" ")
                break
    return "".join(pieces)
def spellCheckPassThrough(word, dictionary=None):
    """Replace each space-separated chunk of *word* with its top suggestion.

    Chunks for which the dictionary offers no suggestion are dropped.

    :param word: phrase whose chunks are separated by single spaces.
    :param dictionary: object with a ``suggest(word)`` method returning a
        list of strings; defaults to the module-level ``us_dict``.
    :returns: the first suggestion for every chunk, each followed by a space
        (I manually remove spaces for twitter size sometimes).
    """
    if dictionary is None:
        dictionary = us_dict

    corrected = []
    for chunk in word.split(" "):
        # A trailing or doubled space produces an empty chunk. There is
        # nothing to correct, and some pyenchant releases raise ValueError
        # when asked for suggestions on an empty string, so skip it.
        if not chunk:
            continue
        suggestions = dictionary.suggest(chunk)
        if suggestions:
            corrected.append(suggestions[0] + " ")
    return "".join(corrected)
def unLEET(word):
    """Swap the leetspeak digits 4, 5, 1, 3, 7 and 0 for A, S, L, E, T and O.

    Any other character, including the remaining digits, is left untouched.
    """
    for digit, letter in zip("451370", "ASLETO"):
        word = word.replace(digit, letter)
    return word
def main():
    """Run the sample text through every pass and print the results.

    The passes are, in order: alphanumeric folding, un-leeting, pseudo-phrase
    building and spell-check correction. The string produced by each pass is
    printed as it is computed, followed by block renderings of the raw input
    and of the un-leeted, phrase and corrected strings, framed by '#' rules.
    """
    width = 20
    raw_block = toBlockString(lol, width=width)

    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and is also valid Python 3.
    phrase = toAlnumString(lol)
    print("ALNUM PASS:" + phrase)
    # The block rendering of the alnum pass was computed here but never
    # printed, so that unused local has been dropped.

    phrase = unLEET(phrase)
    print("UNLEET PASS:" + phrase)
    unleet_block = toBlockString(phrase, width=width)

    phrase = toPsuedoPhrase(phrase, chuck_sample=3)
    print("PHRASE PASS:" + phrase)
    phrase_block = toBlockString(phrase, width=width)

    phrase = spellCheckPassThrough(phrase)
    print("CORRECTED PASS:" + phrase)
    corrected_block = toBlockString(phrase, width=width)

    print("Did you mean: " + phrase.rstrip() + "?")

    strip = width * "#"
    for block in (raw_block, unleet_block, phrase_block, corrected_block):
        print(strip)
        print(block)
    print(strip)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment