Skip to content

Instantly share code, notes, and snippets.

@tosh1ki
Created February 15, 2014 16:05
Show Gist options
  • Save tosh1ki/9021358 to your computer and use it in GitHub Desktop.
Save tosh1ki/9021358 to your computer and use it in GitHub Desktop.
MeCabとYahoo!の比較
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import MeCab
import operator
import re
import pdb
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
def parse_mecab(text=''):
    """Count selected MeCab tokens in *text*, most frequent first.

    The text is tokenised with a ``MeCab.Tagger`` configured from
    ``'mecabrc'``.  Only nodes whose ``posid`` lies in the inclusive range
    36..67 are counted (presumably the noun categories of the IPA
    dictionary -- confirm against the installed dictionary's pos-id.def).

    Returns a list of ``(surface, count)`` tuples sorted by descending count.
    """
    tagger = MeCab.Tagger('mecabrc')
    current = tagger.parseToNode(text)
    counts = {}
    # parseToNode yields a linked list of nodes; follow ``next`` until it ends.
    while current:
        if 36 <= current.posid <= 67:
            surface = current.surface
            counts[surface] = counts.get(surface, 0) + 1
        current = current.next
    return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
def morph(sentence, appid, results="ma", filter="1|2|3|4|5|6|7|8|9|10|11|12|13"):
    """Return the surface strings Yahoo!'s MAService reports for *sentence*.

    Args:
        sentence: Text to analyse; sent as the ``sentence`` request field.
        appid: Application id; sent as the ``appid`` request field.
        results: Sent unchanged as the ``results`` request field.
        filter: Sent unchanged as the ``filter`` request field (presumably a
            ``|``-separated list of part-of-speech codes -- see the API docs).
            The name shadows the builtin but is kept because callers pass it
            by keyword.

    Returns:
        A list with the ``surface`` string of every child of the response's
        ``ma_result.word_list`` element.

    Raises:
        Whatever ``urllib2.urlopen`` raises on a network or HTTP failure.
    """
    pageurl = "http://jlp.yahooapis.jp/MAService/V1/parse"
    params = urllib.urlencode({
        'appid': appid,
        'results': results,
        'filter': filter,
        'sentence': sentence,
    })
    # Supplying a data argument makes urllib2 send the request as a POST.
    response = urllib2.urlopen(pageurl, params)
    try:
        body = response.read()
    finally:
        # Always release the connection, even when read() fails.
        response.close()
    soup = BeautifulSoup(body)
    return [w.surface.string for w in soup.ma_result.word_list]
def parse_yahoo(text, appid=''):
    """Count the words Yahoo!'s analyser returns for *text*, most frequent first.

    Args:
        text: Text forwarded to ``morph`` for analysis.
        appid: Yahoo! application id forwarded to ``morph``.  Defaults to the
            empty string, which was the value previously hard-coded here;
            pass a real id instead of editing the source.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count.
    """
    words = {}
    # filter="9" restricts the response to a single category
    # (presumably nouns -- confirm against the API documentation).
    for word in morph(text, appid=appid, filter="9"):
        words[word] = words.get(word, 0) + 1
    return sorted(words.items(), key=operator.itemgetter(1), reverse=True)
def main():
    """Print the MeCab and Yahoo! word counts for ``tokyo.txt``.

    The file is read as Shift_JIS bytes and re-encoded to UTF-8 before being
    handed to both analysers.  Each result line is written as an indented
    ``word count`` pair, and the two listings are separated by a line of
    dashes.
    """
    # The with-block guarantees the file handle is closed after reading.
    with open('tokyo.txt', 'r') as f:
        raw = f.read()
    text = raw.decode('shift_jis').encode('utf-8')

    ## output to md
    # Single-argument print(...) produces the same output as the original
    # comma-separated print statement under Python 2.
    for word, count in parse_mecab(text):
        print('%s %s %s' % (' ' * 4, word, count))
    print('-' * 10)
    for word, count in parse_yahoo(text):
        print('%s %s %s' % (' ' * 4, word, count))
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment