Skip to content

Instantly share code, notes, and snippets.

@taruta811
Created April 1, 2012 13:42
Show Gist options
  • Save taruta811/2275376 to your computer and use it in GitHub Desktop.
Save taruta811/2275376 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# vim: fileencoding=utf-8
from BeautifulSoup import BeautifulSoup
import re
import urllib
import urllib2
import unicodedata
from collections import *
appid = "各自のID"  # application ID; the placeholder must be replaced with your own ID
# Endpoint of the Yahoo! Japan morphological analysis service (MAService V1).
pageurl = "http://jlp.yahooapis.jp/MAService/V1/parse"
def morph(sentence, appid=appid, results="ma", filter="1|2|3|4|5|6|7|8|9|10|11|12|13"):
    """Return the Yahoo! morphological analysis of *sentence* as a list.

    API reference: http://developer.yahoo.co.jp/webapi/jlp/ma/v1/parse.html

    Arguments:
    sentence -- text to analyse; it is encoded to UTF-8 before being sent.
    appid    -- application ID sent as the ``appid`` request parameter.
    results  -- value of the ``results`` request parameter.
    filter   -- value of the ``filter`` request parameter; the default is
                the numbers 1 through 13 joined with ``|``.

    Returns a list holding the ``surface`` string of every ``<word>``
    element found under ``ma_result`` / ``word_list`` in the response.
    """
    params = urllib.urlencode({
        'appid': appid,
        'results': results,
        'filter': filter,
        'sentence': sentence.encode("utf-8"),
    })
    # Supplying a data argument makes urlopen send the request as a POST.
    response = urllib2.urlopen(pageurl, params)
    try:
        soup = BeautifulSoup(response.read())
    finally:
        # Always release the connection, even if reading or parsing fails.
        response.close()
    # Select the <word> elements explicitly: iterating word_list directly
    # also yields any text nodes between them, which have no ``surface``.
    words = soup.ma_result.word_list.findAll("word")
    return [word.surface.string for word in words]
def main():
    """Print the 100 most frequent words in the daily-ranking descriptions.

    Downloads the ranking page, takes every ``<div class="ex">`` block,
    strips the div markup and all spaces, NFKC-normalises the text, runs
    each description through ``morph`` with ``filter=9`` and prints one
    ``word=>count`` line for each of the 100 most common results.
    """
    yomou_URL = "http://yomou.syosetu.com/rank/list/type/daily_total/"
    # Alternative ranking page with the same path layout (currently unused):
    # niji_URL = "http://nizisosaku.com/rank/list/type/daily_total/"

    page = urllib.urlopen(yomou_URL)
    try:
        response = BeautifulSoup(page.read())
    finally:
        # Close the connection even if reading or parsing fails.
        page.close()

    result = Counter()
    for block in response.findAll("div", attrs={"class": "ex"}):
        # NOTE(review): "<.*div.*>" is greedy, so a description whose opening
        # and closing div tags share one line is removed entirely -- confirm
        # against the live page before tightening the pattern.
        text = re.sub(" |(<.*div.*>)", "", str(block))
        text = re.sub("&quot;", "\"", text)
        text = unicodedata.normalize("NFKC", text.decode("utf-8"))
        if not text:
            # Nothing left to analyse; skip the pointless API round trip.
            continue
        # filter=9 is presumably the noun part-of-speech class -- confirm
        # against the API reference.
        # update() adds the counts in place instead of building a new
        # Counter for every description.
        result.update(morph(text, appid=appid, filter=9))

    for (item, count) in result.most_common(100):
        print(item + "=>" + str(count))

    # Alternative tokenisation by character class, without the web API
    # (kept for reference; not executed):
    # JP_TOKEN = re.compile(u"[一-龠]+|[ぁ-ん]+|[ァ-ヴ]+|[a-zA-Z0-9]+")
    # counter = Counter(JP_TOKEN.findall(desc))
    # for (item,count) in counter.most_common(100):
    #     print item+"=>"+str(count)
# Run the word-frequency report only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment