Skip to content

Instantly share code, notes, and snippets.

@vim13
Created September 5, 2010 21:39
Show Gist options
  • Save vim13/566347 to your computer and use it in GitHub Desktop.
Save vim13/566347 to your computer and use it in GitHub Desktop.
Yahoo!形態素解析+Yahoo!知恵袋+マルコフ連鎖
#!/usr/lib/python
#vim:fileencoding=utf-8
import re
import random
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup, NavigableString, Declaration, Comment
class markovTweet:
    """Turn the text of a Yahoo! Chiebukuro question into a word-chain sentence.

    Pipeline (see ``main``): download one question page, split its text into
    words with the Yahoo! morphological-analysis web service, then re-chain
    the words with a second-order (two-word prefix) successor table.
    """

    # Listing page that is searched for the first question link.
    LIST_URL = 'http://list.chiebukuro.yahoo.co.jp/dir/list.php?dnum=2078297371&flg=3&agree=1'
    # Prefix to which the matched 'question_detail/q<digits>' path is appended.
    DETAIL_URL_PREFIX = 'http://ksrd.yahoo.co.jp/PAGE=d2078297371/LOC=QUELIST/R=1/O=TITLE/ST=all/*-http://detail.chiebukuro.yahoo.co.jp/qa/'
    # Endpoint of the morphological-analysis service used by yahooParse().
    PARSE_URL = 'http://jlp.yahooapis.jp/MAService/V1/parse'

    def __init__(self, api, appid):
        """Remember the credentials handed in by the caller.

        api   -- stored on the instance unchanged; no method of this class
                 reads it.
        appid -- application id sent with every request made by yahooParse().
        """
        self.appid = appid
        self.api = api

    def _read_url(self, url):
        """Download ``url`` and return the body, always closing the response."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    def getNavigableStrings(self, soup):
        """Yield every non-blank text node below ``soup`` in document order.

        Comment and Declaration nodes are skipped, and the contents of
        <script> and <style> tags are never descended into.
        """
        if isinstance(soup, NavigableString):
            # Exact type test on purpose: only these two node types are
            # excluded, other NavigableString subclasses still pass.
            if type(soup) not in (Comment, Declaration) and soup.strip():
                yield soup
        elif soup.name not in ('script', 'style'):
            for child in soup.contents:
                for text_node in self.getNavigableStrings(child):
                    yield text_node

    def getTweets(self):
        """Fetch the first listed question and return its text as UTF-8 bytes.

        The text of the page's <div class="qa"> is concatenated, then
        "<name>さん" mentions and newlines are removed and the result is
        stripped.

        Raises ValueError when the listing page contains no question link or
        the question page has no <div class="qa">.
        """
        listing = self._read_url(self.LIST_URL)
        match = re.search('question_detail/q[0-9]+', listing)
        if match is None:
            raise ValueError('no question link found in the listing page')

        detail = self._read_url(self.DETAIL_URL_PREFIX + match.group(0))
        qa = BeautifulSoup(detail).find("div", {"class": "qa"})
        if qa is None:
            raise ValueError('question page has no <div class="qa"> block')

        text = ("".join(self.getNavigableStrings(qa))).encode('utf-8')
        return re.sub('([_a-z0-9]+)さん|\n', '', text).strip()

    def yahooParse(self, tweets):
        """POST ``tweets`` to the analysis service and return the open response.

        tweets -- the sentence to analyse (sent as the 'sentence' parameter).

        The returned object is the file-like response from urllib2.urlopen;
        the caller is responsible for reading and closing it.
        """
        results = 'ma'
        # NOTE(review): presumably the part-of-speech ids to keep (1-13 looks
        # like "everything") -- confirm against the service documentation.
        my_filter = '1|2|3|4|5|6|7|8|9|10|11|12|13'
        params = urllib.urlencode({
            'appid': self.appid,
            'results': results,
            'filter': my_filter,
            'sentence': tweets,
        })
        # Passing a data argument makes urlopen issue a POST request.
        return urllib2.urlopen(self.PARSE_URL, params)

    def makeMarkov(self, wordlist):
        """Chain the words of ``wordlist`` into one stripped string.

        A table maps every pair of consecutive words to the word that follows
        it; when a pair occurs more than once the last occurrence wins, so the
        result is deterministic. Starting from the first two words, the table
        is followed until the final pair of the input (which has no successor)
        or an empty successor is reached.

        wordlist -- sequence of word strings (indexable and sliceable).

        A list with fewer than two words cannot seed the chain; its words are
        returned joined and stripped instead of raising IndexError.
        """
        if len(wordlist) < 2:
            return "".join(wordlist).strip()

        markov = {}
        for first, second, follower in zip(wordlist, wordlist[1:], wordlist[2:]):
            markov[(first, second)] = follower
        # The closing pair ends the chain, even if the same pair also
        # appeared earlier with a successor.
        markov[(wordlist[-2], wordlist[-1])] = ""

        wo1, wo2 = wordlist[0], wordlist[1]
        pieces = [wo1, wo2]
        # At most one step per table entry, which bounds the walk.
        for _ in range(len(markov)):
            follower = markov[(wo1, wo2)]
            if not follower:
                break
            pieces.append(follower)
            wo1, wo2 = wo2, follower
        return "".join(pieces).strip()

    def main(self):
        """Run the whole pipeline and return the generated sentence."""
        text = self.getTweets()
        response = self.yahooParse(text)
        try:
            soup = BeautifulSoup(response)
        finally:
            # The parser has consumed the body; release the connection.
            response.close()
        wordlist = [(w.surface.string).encode('utf-8') for w in soup.ma_result.word_list]
        return self.makeMarkov(wordlist)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment