Skip to content

Instantly share code, notes, and snippets.

@k-saka
Created June 4, 2010 02:43
Show Gist options
  • Save k-saka/424857 to your computer and use it in GitHub Desktop.
Save k-saka/424857 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
#anime.py
import urllib2, MeCab
from BeautifulSoup import BeautifulSoup
from math import sqrt
import clusters
# Japanese Wikipedia article fetched by get_html(). The percent-encoded title
# is the UTF-8 form of the article listing the characters of
# "Mobile Police Patlabor" (Kidou Keisatsu Patlabor no Toujou Jinbutsu).
url = 'http://ja.wikipedia.org/wiki/%E6%A9%9F%E5%8B%95%E8%AD%A6%E5%AF%9F%E3%83%91%E3%83%88%E3%83%AC%E3%82%A4%E3%83%90%E3%83%BC%E3%81%AE%E7%99%BB%E5%A0%B4%E4%BA%BA%E7%89%A9'
# Shared vocabulary: get_wordcounts() appends each accepted word the first
# time it is seen, and main() walks this list to fix the column order of
# every character's count vector.
word_list = []
def get_html():
    """Download the page at the module-level ``url`` and return its body.

    Returns:
        The raw response body exactly as returned by ``read()``.

    Raises:
        urllib2.URLError: propagated from ``opener.open`` when the request
            fails.
    """
    opener = urllib2.build_opener()
    # Identify the client as 'Myapp' rather than with urllib2's default header.
    opener.addheaders = [('User-agent', 'Myapp')]
    response = opener.open(url)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() fails part-way.
        # try/finally is used because Python 2 urllib2 responses are not
        # context managers.
        response.close()
def get_character_text(html):
    """Map every ``<dt>`` heading on the page to its own ``<dd>`` text.

    Each ``<dt>`` found inside a ``<dl>`` is used as a key. Its value is the
    concatenated rendered contents of the ``<dd>`` elements that follow it,
    up to the next ``<dt>`` that starts a new entry. Previously every
    ``<dt>`` received the text of *all* ``<dd>`` elements in the enclosing
    list, so entries sharing a ``<dl>`` ended up with identical text.

    Args:
        html: Markup of the page to parse.

    Returns:
        dict mapping ``dt.renderContents()`` to the joined
        ``dd.renderContents()`` strings belonging to that term. A later
        ``<dt>`` with the same rendered contents overwrites an earlier one.
    """
    character_text = {}
    soup = BeautifulSoup(html)
    for dl in soup.findAll('dl'):
        for dt in dl.findAll('dt'):
            parts = []
            # Walk only the dt/dd siblings that come after this term.
            for sibling in dt.findNextSiblings(['dt', 'dd']):
                if sibling.name == 'dt':
                    if parts:
                        # The next term begins: this entry is complete.
                        break
                    # Consecutive terms with no definition in between share
                    # the definitions that follow them.
                    continue
                parts.append(sibling.renderContents())
            character_text[dt.renderContents()] = ''.join(parts)
    return character_text
def get_wordcounts(text):
    """Count the accepted MeCab tokens of ``text``.

    Side effect: every accepted token not yet present in the module-level
    ``word_list`` is appended to it, so that list accumulates the vocabulary
    across calls in first-seen order.

    Args:
        text: String handed to ``MeCab.Tagger.parseToNode``.

    Returns:
        dict mapping each accepted token's surface form to the number of
        times it occurred in ``text``.
    """
    tagger = MeCab.Tagger('-Ochasen')
    counts = {}
    # Step past the first node before scanning; the loop below also stops
    # before processing the last node (the one whose ``next`` is empty).
    node = tagger.parseToNode(text).next
    while node.next:
        posid = node.posid
        # NOTE(review): posid 38..47 except 40 presumably selects noun
        # categories of the IPA dictionary, and char_type 5 a character
        # class to ignore -- confirm against the installed dictionary's
        # pos-id.def / char.def.
        accepted = 38 <= posid <= 47 and posid != 40 and node.char_type != 5
        if accepted:
            token = node.surface
            counts[token] = counts.get(token, 0) + 1
            if token not in word_list:
                word_list.append(token)
        node = node.next
    return counts
def get_matrix(character_data):
    """Split a ``{character: row}`` mapping into row labels and a matrix.

    Layout of the result::

                 word1        word2        word3 ...
        chara1   data[0][0]   data[0][1]   data[0][2]
        chara2   data[1][0]   ...
        chara3   ...

    Args:
        character_data: dict mapping each character name to its row of
            values (one value per word column).

    Returns:
        A ``(character_list, data_list)`` tuple. ``character_list`` holds the
        row names in the dict's iteration order and ``data_list[i]`` is the
        row object stored for ``character_list[i]`` (not a copy).
    """
    # Take the keys once so labels and rows are guaranteed to line up.
    character_list = list(character_data)
    data_list = [character_data[chara] for chara in character_list]
    return character_list, data_list
def main():
    """Fetch the page, vectorise each entry's text and draw both plots.

    Steps: download and parse the page, count words per character, expand
    the counts into equal-length float vectors ordered by ``word_list``,
    then pass the matrix to ``clusters`` for the dendrogram and the 2-D
    scaled plot.
    """
    texts = get_character_text(get_html())

    # Pass 1: count words for every character. This must finish before any
    # vector is built, because get_wordcounts() is what fills word_list.
    counts_by_name = {}
    for name, text in texts.items():
        counts_by_name[name] = get_wordcounts(text)

    # Pass 2: one float per vocabulary word, 0 where the word is absent.
    vectors = {}
    for name in counts_by_name:
        counts = counts_by_name[name]
        vectors[name] = [float(counts.get(word, 0)) for word in word_list]

    character_list, data_list = get_matrix(vectors)

    tree = clusters.hcluster(data_list)
    clusters.drawdendrogram(tree, character_list, jpeg='anime.jpeg')

    points = clusters.scaledown(data_list)
    clusters.draw2d(points, character_list)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment