Skip to content

Instantly share code, notes, and snippets.

@carpedm20
Created August 14, 2014 09:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save carpedm20/01ec79d88e5cf941ddb0 to your computer and use it in GitHub Desktop.
Save carpedm20/01ec79d88e5cf941ddb0 to your computer and use it in GitHub Desktop.
babo
#-*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
# Build two word lists from the dictionary export:
#   nouns.csv - every <word> whose paired <pos> equals the noun tag, one per line
#   chars.csv - every distinct character used by those words, one per line
# Written for Python 2: encoded byte strings are joined with plain str
# literals before being written to files opened in binary mode.
tree = ET.parse('dict-ko-galkwi.xml')
root = tree.getroot()

# <word> and <pos> elements are collected independently and paired purely by
# document order.
# NOTE(review): this assumes every entry carries exactly one <word> and one
# <pos>; confirm against the XML schema, otherwise the pairs drift apart.
texts = [node.text for node in root.iter('word')]
poses = [node.text for node in root.iter('pos')]
zipped_texts = zip(texts, poses)

# An element without text has ``.text`` set to None; skip those instead of
# failing on ``None.encode``.
nouns = [word.encode('utf-8') for word, pos in zipped_texts
         if pos == u'명사' and word is not None]
print(len(nouns))

with open("nouns.csv", "wb") as f:
    f.writelines(noun + "\n" for noun in nouns)

# Decode before splitting so each item is a whole character rather than a
# single UTF-8 byte. Sorting makes chars.csv reproducible between runs;
# plain set iteration order is arbitrary.
chars = sorted(set(ch for noun in nouns for ch in noun.decode('utf-8')))

with open("chars.csv", "wb") as f:
    f.writelines(ch.encode('utf-8') + "\n" for ch in chars)
import requests
import json
# Post each character listed in chars.csv to the lookup service and collect
# the entries it returns into dict.json.
# START / END select the half-open index range of chars.csv to process;
# END = None means "through the last character".
START = 20
END = None

# One character per line. Strip only the line terminator so that a final
# line without a trailing newline keeps its last byte.
with open('chars.csv', 'r') as f:
    chars = [line.rstrip('\n') for line in f]

if END is None:
    END = len(chars)
print("[*] from %s to %s" % (START, END))

# Continue from an earlier run when dict.json exists and parses; otherwise
# start with an empty result list.
try:
    with open('dict.json', 'r') as f:
        dict_list = json.load(f)
except (IOError, ValueError):
    dict_list = []

try:
    for i in range(START, END):
        char = chars[i]
        print("[%s/%s] %s" % (i, len(chars), char))
        # NOTE(review): 'URL' is a literal placeholder in this file; the real
        # endpoint has to be filled in before this request can succeed.
        r = requests.post('URL', data={'char': char})
        try:
            entries = json.loads(r.text)['data']
            dict_list.extend(entries)
        except (ValueError, KeyError, TypeError):
            # The body was not JSON, had no 'data' member, or 'data' was not
            # iterable: report the character and move on to the next one.
            print("[!] Error: %s" % char)
        else:
            print(" => %s" % len(entries))
finally:
    # Save whatever has been gathered even when a request raises part-way
    # through, so the work done so far is not lost.
    with open('dict.json', 'wb') as f:
        json.dump(dict_list, f)
import json
# Reduce the collected entries in dict.json to their distinct 'word' values
# and store that list in words.json.
with open('dict.json', 'r') as source:
    entries = json.load(source)

# Pull the 'word' field out of every entry, duplicates included.
words = []
for entry in entries:
    words.append(entry['word'])

# A set drops the duplicates; the resulting order is whatever the set yields.
compact_words = list(set(words))

with open('words.json', 'wb') as sink:
    sink.write(json.dumps(compact_words))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment