Markov model based on YouTube comments
# for more info check out http://webmining.olariu.org/interview-with-a-lady-gaga-fan
# made to be run in the ipython console
import re, urllib, urllib2, time, random
import simplejson as json
def fetch_url(url, get=None, post=None):
    user_agent = 'Andrei Olariu\'s Web Mining for Dummies'
    headers = {'User-Agent': user_agent}
    if get:
        data = urllib.urlencode(get)
        url = "%s?%s" % (url, data)
    req = urllib2.Request(url, post, headers)
    try:
        response = urllib2.urlopen(req).read()
        response = json.loads(response)
    except Exception, e:
        print 'error in reading %s: %s' % (url, e)
        return None
    return response
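# usage sketch (example.com is a placeholder endpoint, assumed to return json):
# fetch_url('http://example.com/api', get={'q': 'test'}) requests
# http://example.com/api?q=test and returns the parsed response, or None on error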
# fetch comments for a youtube video (given a video id) by making repeated
# API calls (one call returns up to 50 comments)
def fetch_comments(yid, maxcount=1000):
    url = 'http://gdata.youtube.com/feeds/api/videos/%s/comments' % yid
    COUNT = 50
    values = {
        'alt': 'json',
        'max-results': COUNT,
    }
    results = []
    for i in range(1, maxcount, COUNT):
        values['start-index'] = i
        data = fetch_url(url, get=values)
        if data and 'feed' in data and 'entry' in data['feed'] and \
                len(data['feed']['entry']) > 0:
            results.extend([c['content']['$t'] for c in data['feed']['entry']])
        else:
            break
        time.sleep(0.1)
    return results
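# assuming the gdata v2 endpoint above, the first call resolves to a URL of
# the form (parameter order may vary):
#   http://gdata.youtube.com/feeds/api/videos/<yid>/comments?alt=json&max-results=50&start-index=1
# each later call bumps start-index by 50 until a page comes back empty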
# builds the markov model
# every state is defined as a pair of words
# state (word[k], word[k+1]) depends on state (word[k-1], word[k])
def add_to_markov(markov, words):
    if len(words) < 3:
        return
    if words[0] not in markov:
        markov[words[0]] = {}
    if words[1] not in markov[words[0]]:
        markov[words[0]][words[1]] = {}
    if words[2] not in markov[words[0]][words[1]]:
        markov[words[0]][words[1]][words[2]] = 0
    markov[words[0]][words[1]][words[2]] += 1
    # slide the two-word window one position to the right
    add_to_markov(markov, words[1:])
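# a quick sanity check on a toy phrase (not from the actual comments):
# >>> m = {}
# >>> add_to_markov(m, ['i', 'like', 'cats'])
# >>> m
# {'i': {'like': {'cats': 1}}}
# i.e. the state ('i', 'like') was followed by 'cats' exactly once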
# given a state (aka a pair of words (word1, word2)),
# find the next state (aka another pair of words (word2, word3))
# by sampling a follower word with probability proportional to its count
def get_next(markov, word1, word2):
    if word1 not in markov or word2 not in markov[word1]:
        return None
    total = sum(markov[word1][word2].itervalues())
    choose = random.randint(1, total)
    for w, c in markov[word1][word2].iteritems():
        choose -= c
        if choose <= 0:
            return w
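# e.g. for the toy model m = {'i': {'like': {'cats': 3, 'dogs': 1}}},
# get_next(m, 'i', 'like') returns 'cats' with probability 3/4
# and 'dogs' with probability 1/4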
# given a starting state, find future states in a recursive way
def get_phrase(markov, word1, word2, limit=50):
    if limit == 0:
        return ''
    word3 = get_next(markov, word1, word2)
    if not word3:
        return ''
    rest = get_phrase(markov, word2, word3, limit - 1)
    return '%s %s' % (word3, rest) if rest else word3
# given a sentence beginning, add words using the markov model
# the starting state is given by the last 2 words in the sentence;
# all earlier words are ignored
def talk(markov, start):
    words = re.findall(r'\w+', start.lower())
    if len(words) < 2:
        return None
    return ('%s %s' % (start, get_phrase(markov, words[-2], words[-1]))).rstrip()
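# with the toy model from above, a call would look like:
# >>> talk({'i': {'like': {'cats': 1}}}, 'i like')
# 'i like cats'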
# get comments for a video
yid = 'UzxYlbK2c7E' # machine learning
# use 'qrO4YZeyl0I' for lady gaga
comments = fetch_comments(yid)
# split comments into phrases
texts = []
r = re.compile("[.!?;]")
for c in comments:
    for line in c.splitlines():
        texts.extend(r.split(line))
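# e.g. r.split('great video! thanks') yields ['great video', ' thanks'],
# so each comment contributes one word sequence per sentence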
# split phrases into words and build the markov model
markov = {}
for t in texts:
    remove_first = t.startswith('@') # drop the leading @username in replies
    t = t.lower()
    words = re.findall(r'\w+', t)
    if remove_first:
        words = words[1:]
    add_to_markov(markov, words)
# have fun
print talk(markov, 'i like')