Created
June 4, 2010 02:43
-
-
Save k-saka/424857 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# anime.py
import urllib2, MeCab | |
from BeautifulSoup import BeautifulSoup | |
from math import sqrt | |
import clusters | |
# Page to scrape: a Japanese Wikipedia article whose percent-encoded title
# reads "Characters of Mobile Police Patlabor".
url = (
    'http://ja.wikipedia.org/wiki/'
    '%E6%A9%9F%E5%8B%95%E8%AD%A6%E5%AF%9F'                    # "Mobile Police"
    '%E3%83%91%E3%83%88%E3%83%AC%E3%82%A4%E3%83%90%E3%83%BC'  # "Patlabor"
    '%E3%81%AE'                                               # possessive particle
    '%E7%99%BB%E5%A0%B4%E4%BA%BA%E7%89%A9'                    # "characters"
)

# Every distinct word accepted by get_wordcounts(), in first-seen order.
# main() uses this order as the column order of the data matrix.
word_list = []
def get_html(page_url=None):
    """Download a page and return its body.

    Args:
        page_url: Address to fetch. When omitted (or None) the module-level
            ``url`` is used, so existing ``get_html()`` calls are unaffected.

    Returns:
        The response body exactly as ``read()`` delivers it; no decoding is
        done here.

    Raises:
        Any error raised by ``opener.open`` or ``read`` propagates to the
        caller (for urllib2 this is normally ``urllib2.URLError`` or its
        ``HTTPError`` subclass).
    """
    target = url if page_url is None else page_url
    opener = urllib2.build_opener()
    # Replace the opener's default header list with an explicit User-agent.
    opener.addheaders = [('User-agent', 'Myapp')]
    response = opener.open(target)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() fails part-way.
        response.close()
def get_character_text(html):
    """Map each <dt> heading in *html* to the <dd> text of its <dl>.

    For every ``<dl>`` found in the document, the rendered contents of all
    ``<dd>`` tags found under it are concatenated in document order. Every
    ``<dt>`` found under that same ``<dl>`` is then mapped to that one
    combined string, so headings sharing a ``<dl>`` share the same text.

    Args:
        html: Markup to parse; it is handed straight to ``BeautifulSoup``.

    Returns:
        A dict keyed by ``dt.renderContents()`` whose values are the combined
        ``dd.renderContents()`` strings. If the same heading text occurs more
        than once, the last occurrence wins. A document without ``<dl>`` tags
        yields an empty dict.
    """
    character_text = {}
    soup = BeautifulSoup(html)
    for dl in soup.findAll('dl'):
        # The description depends only on the <dl>, so build it once here
        # instead of once per <dt>, and join rather than repeated +=.
        description = ''.join(dd.renderContents() for dd in dl.findAll('dd'))
        for dt in dl.findAll('dt'):
            character_text[dt.renderContents()] = description
    return character_text
def get_wordcounts(text):
    """Count the accepted words MeCab finds in *text*.

    A node is accepted when its ``posid`` lies in 38..47 but is not 40 and
    its ``char_type`` is not 5.
    NOTE(review): presumably these ids select general/proper nouns and
    exclude alphabetic tokens under the IPA dictionary - confirm against the
    installed dictionary's pos-id and char definitions.

    Side effect: every accepted word not yet present in the module-level
    ``word_list`` is appended to it, preserving first-seen order.

    Args:
        text: Text passed unchanged to ``Tagger.parseToNode``.

    Returns:
        A dict mapping each accepted surface form to its occurrence count.
    """
    counts = {}
    tagger = MeCab.Tagger('-Ochasen')
    node = tagger.parseToNode(text)
    # Skip the first node of the chain; the loop below also stops before the
    # node that has no successor, so neither end of the chain is examined
    # (presumably MeCab's begin/end-of-sentence sentinels).
    node = node.next
    while node.next:
        pos = node.posid
        accepted = 38 <= pos <= 47 and pos != 40 and node.char_type != 5
        if accepted:
            word = node.surface
            counts[word] = counts.get(word, 0) + 1
            if word not in word_list:
                word_list.append(word)
        node = node.next
    return counts
def get_matrix(character_data):
    """Split a label -> row mapping into parallel label and row lists.

    Layout of the result::

                  word1       word2       word3 ...
        chara1    data[0][0]  data[0][1]  data[0][2]
        chara2    data[1][0]  ...
        chara3    ...

    Args:
        character_data: Mapping from a row label to that row's values.

    Returns:
        A ``(character_list, data_list)`` tuple. ``character_list`` holds the
        row labels in the mapping's iteration order and ``data_list[i]`` is
        the value stored under ``character_list[i]``. The rows are the same
        objects held by the mapping, not copies.
    """
    # character_list provides the row names.
    character_list = list(character_data)
    data_list = [character_data[chara] for chara in character_list]
    return character_list, data_list
def main():
    """Fetch the page, turn each entry into a word-count vector, and plot.

    Steps: download and parse the page, count words per entry, build one
    vector per entry with a column for every word in ``word_list``, then hand
    the matrix to ``clusters`` for a dendrogram (written with
    ``jpeg='anime.jpeg'``) and a 2-D scaled plot.
    """
    character_text = get_character_text(get_html())

    # Count words for every entry first: get_wordcounts() also grows the
    # shared word_list, which must be complete before any vector is built.
    character_words = {}
    for name, text in character_text.items():
        character_words[name] = get_wordcounts(text)

    # One float per known word, 0.0 where the entry never uses that word.
    character_data = {}
    for name, counts in character_words.items():
        character_data[name] = [float(counts.get(word, 0)) for word in word_list]

    character_list, data_list = get_matrix(character_data)

    tree = clusters.hcluster(data_list)
    clusters.drawdendrogram(tree, character_list, jpeg='anime.jpeg')

    points = clusters.scaledown(data_list)
    clusters.draw2d(points, character_list)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment