Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Markov model based on youtube comments
# for more info check out http://webmining.olariu.org/interview-with-a-lady-gaga-fan
# made to be run in the ipython console
import urllib, urllib2, time, random
import simplejson as json
def fetch_url(url, get=None, post=None):
user_agent = 'Andrei Olariu\'s Web Mining for Dummies'
headers = {'User-Agent': user_agent}
if get:
data = urllib.urlencode(get)
url = "%s?%s" % (url, data)
req = urllib2.Request(url, post, headers)
try:
response = urllib2.urlopen(req).read()
response = json.loads(response)
except Exception, e:
print 'error in reading %s: %s' % (url, e)
return None
return response
# fetch comments for a youtube video (given a video id) by doing repeated
# api calls (one call returnes up to 50 comments)
def fetch_comments(yid, maxcount=1000):
url = 'http://gdata.youtube.com/feeds/api/videos/%s/comments' % yid
COUNT = 50
values = {
'alt': 'json',
'max-results': COUNT,
}
results = []
for i in range(1, maxcount, COUNT):
values['start-index'] = i
data = fetch_url(url, get=values)
if data and 'feed' in data and 'entry' in data['feed'] and \
len(data['feed']['entry']) > 0:
results.extend([c['content']['$t'] for c in data['feed']['entry']])
else:
break
time.sleep(0.1)
return results
# builds the markov model
# every state is defined as a pair of words
# state (word[k], word[k+1]) depends on state (word[k-1], word[k])
def add_to_markov(markov, words):
if len(words) < 3:
return
if words[0] not in markov:
markov[words[0]] = {}
if words[1] not in markov[words[0]]:
markov[words[0]][words[1]] = {}
if words[2] not in markov[words[0]][words[1]]:
markov[words[0]][words[1]][words[2]] = 0
markov[words[0]][words[1]][words[2]] += 1
add_to_markov(markov, words[1:])
# given a state (aka a pair of words (word1, word2)),
# find the next state (aka another pair of words (word2, word3))
def get_next(markov, word1, word2):
if word1 not in markov or word2 not in markov[word1]:
return None
total = sum([c for c in markov[word1][word2].itervalues()])
choose = random.randint(1, total)
for w, c in markov[word1][word2].iteritems():
choose -= c
if choose <= 0:
return w
# given a starting state, find future states in a recursive way
def get_phrase(markov, word1, word2, limit=50):
if limit == 0:
return ''
word3 = get_next(markov, word1, word2)
if not word3:
return ''
return '%s %s' % (word3, get_phrase(markov, word2, word3, limit - 1))
# given a sentence beginning, add words using the markov model
# the starting state is given by the last 2 words in the sentence,
# all other words are not used
def talk(markov, start):
words = re.findall(r'\w+', start.lower())
if len(words) < 2:
return None
return '%s %s' % (start, get_phrase(markov, words[-2], words[-1]))
# get comments for a video
yid = 'UzxYlbK2c7E' # machine learning
# use 'qrO4YZeyl0I' for lady gaga
comments = fetch_comments(yid)
# split comments in phrases
texts = []
r = re.compile("[.!?;]")
for c in comments:
for line in c.splitlines():
texts.extend(r.split(line))
# split phrases into words and build the markov model
markov = {}
for t in texts:
remove_first = t.startswith('@') # remove usernames
t = t.lower()
words = re.findall(r'\w+', t)
if remove_first:
words = words[1:]
add_to_markov(markov, words)
# have fun
print talk(markov, 'i like')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.