Skip to content

Instantly share code, notes, and snippets.

@andreiolariu
Created November 18, 2011 17:25
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreiolariu/1377118 to your computer and use it in GitHub Desktop.
Save andreiolariu/1377118 to your computer and use it in GitHub Desktop.
Markov model based on youtube comments
# for more info check out http://webmining.olariu.org/interview-with-a-lady-gaga-fan
# made to be run in the ipython console
import random
import re
import time
import urllib
import urllib2

import simplejson as json
def fetch_url(url, get=None, post=None):
user_agent = 'Andrei Olariu\'s Web Mining for Dummies'
headers = {'User-Agent': user_agent}
if get:
data = urllib.urlencode(get)
url = "%s?%s" % (url, data)
req = urllib2.Request(url, post, headers)
try:
response = urllib2.urlopen(req).read()
response = json.loads(response)
except Exception, e:
print 'error in reading %s: %s' % (url, e)
return None
return response
# fetch comments for a youtube video (given a video id) by doing repeated
# api calls (one call returns up to 50 comments)
def fetch_comments(yid, maxcount=1000):
    """Collect up to `maxcount` comments for the youtube video `yid`.

    Pages through the gdata API 50 comments at a time and stops early
    as soon as a page comes back empty, malformed, or failed.
    Returns a list of comment body strings.
    """
    PAGE = 50
    url = 'http://gdata.youtube.com/feeds/api/videos/%s/comments' % yid
    params = {
        'alt': 'json',
        'max-results': PAGE,
    }
    comments = []
    start = 1
    while start < maxcount:
        params['start-index'] = start
        page = fetch_url(url, get=params)
        usable = page and 'feed' in page and 'entry' in page['feed']
        if not usable or not page['feed']['entry']:
            break
        comments.extend(entry['content']['$t']
                        for entry in page['feed']['entry'])
        time.sleep(0.1)  # be gentle with the API between pages
        start += PAGE
    return comments
# builds the markov model
# every state is defined as a pair of words
# state (word[k], word[k+1]) depends on state (word[k-1], word[k])
def add_to_markov(markov, words):
    """Fold one phrase (a list of word tokens) into the Markov model.

    `markov` maps word1 -> word2 -> word3 -> count: the state
    (words[k], words[k+1]) transitions to words[k+2].  Phrases with
    fewer than 3 words contribute nothing.  Mutates `markov` in place.

    Fixed: the original recursed once per word (add_to_markov(words[1:])),
    so a long phrase could exceed the interpreter's recursion limit and
    copied the list on every call; this version iterates over the
    trigram windows instead, with identical resulting counts.
    """
    for i in range(len(words) - 2):
        w1, w2, w3 = words[i], words[i + 1], words[i + 2]
        counts = markov.setdefault(w1, {}).setdefault(w2, {})
        counts[w3] = counts.get(w3, 0) + 1
# given a state (aka a pair of words (word1, word2)),
# find the next state (aka another pair of words (word2, word3))
def get_next(markov, word1, word2):
    """Sample a successor word for the state (word1, word2).

    Picks a word with probability proportional to its observed count.
    Returns None when the state was never seen in the model.

    Fixed: replaced the Python-2-only itervalues()/iteritems() calls
    (removed in Python 3) with values()/items(), which behave the same
    here on both versions, and dropped the redundant list comprehension
    inside sum().
    """
    if word1 not in markov or word2 not in markov[word1]:
        return None
    counts = markov[word1][word2]
    # weighted pick: walk the counts until the random draw is used up
    choose = random.randint(1, sum(counts.values()))
    for word, count in counts.items():
        choose -= count
        if choose <= 0:
            return word
# given a starting state, walk the chain forward to build a phrase
def get_phrase(markov, word1, word2, limit=50):
    """Generate up to `limit` words starting from state (word1, word2).

    Walks the Markov chain one transition at a time and stops early on
    an unknown state.  Returns the generated words, each followed by a
    single space (so the result carries a trailing space, exactly like
    the recursive original); '' when nothing could be generated.
    """
    generated = []
    prev, cur = word1, word2
    for _ in range(limit):
        nxt = get_next(markov, prev, cur)
        if not nxt:
            break
        generated.append(nxt)
        prev, cur = cur, nxt
    return ''.join(word + ' ' for word in generated)
# given a sentence beginning, add words using the markov model
# the starting state is given by the last 2 words in the sentence,
# all other words are not used
def talk(markov, start):
    """Continue the sentence `start` with Markov-generated words.

    The last two words of `start` (lowercased) seed the chain.
    Returns None when `start` contains fewer than 2 words, otherwise
    `start` followed by the generated continuation.

    Fixed: `re` was used here but never imported anywhere in the file,
    which made this raise NameError at runtime; import it locally so
    the function works regardless of the module's import block.
    """
    import re  # missing from the module-level imports in the original
    words = re.findall(r'\w+', start.lower())
    if len(words) < 2:
        return None
    return '%s %s' % (start, get_phrase(markov, words[-2], words[-1]))
# get comments for a video
yid = 'UzxYlbK2c7E' # machine learning
# use 'qrO4YZeyl0I' for lady gaga
comments = fetch_comments(yid)
# split comments in phrases
texts = []
r = re.compile("[.!?;]")
for c in comments:
for line in c.splitlines():
texts.extend(r.split(line))
# split phrases into words and build the markov model
markov = {}
for t in texts:
remove_first = t.startswith('@') # remove usernames
t = t.lower()
words = re.findall(r'\w+', t)
if remove_first:
words = words[1:]
add_to_markov(markov, words)
# have fun
print talk(markov, 'i like')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment