Skip to content

Instantly share code, notes, and snippets.

Created March 8, 2011 19:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save guanix/860790 to your computer and use it in GitHub Desktop.
Save guanix/860790 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# Author: Vasyl' Vavrychuk
# Author: Witold Baryluk
# Author: Paweł Paprota
# Author: Guan Yang
# Author: David Triendl
# License: Public domain code
# The total amount of time it should take to fetch one timestamp, in seconds
import datetime
import email.utils
import htmlentitydefs
import os, sys, time
import timeit
import re
import socket
import urllib
import urllib2
import tempfile
import httplib
class YahooException(Exception):
    """Error raised when a Yahoo! Video lookup step fails.

    The value passed to the constructor is stored in ``parameter``, and
    ``str()`` of the exception is the ``repr()`` of that value.
    """

    def __init__(self, value):
        # Forward the payload to Exception so that ``args`` is populated;
        # without this the exception carries an empty ``args`` tuple.
        Exception.__init__(self, value)
        self.parameter = value

    def __str__(self):
        return repr(self.parameter)
class HeadRequest(urllib2.Request):
    """A ``urllib2.Request`` whose HTTP method is always HEAD."""

    def get_method(self):
        """Return the HTTP verb to use for this request."""
        return "HEAD"
def htmlentity_transform(matchobj):
    """Transform an HTML entity into the Unicode character it stands for.

    This function receives a match object and is intended to be used as the
    replacement callback of ``re.sub()``.  Group 1 of the match must hold
    the entity body, i.e. the text between ``&`` and ``;``.

    Returns a unicode string: the decoded character for a known named
    entity or a valid numeric character reference, otherwise the entity's
    literal ``&...;`` representation.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference.  The pattern is anchored at the end and
    # accepts hex digits, so "#x1F" is decoded as 0x1F rather than being
    # cut short at the first non-decimal digit.
    mobj = re.match(u'#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[:1] in (u'x', u'X'):
            base = 16
            numstr = numstr[1:]
        else:
            base = 10
        try:
            return unichr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside what unichr() accepts: fall through and
            # return the entity text unchanged instead of raising.
            pass

    # Unknown entity in name, return its literal representation
    return u'&%s;' % entity
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the value of ``email.utils.mktime_tz()`` for the parsed date, or
    ``None`` when ``timestr`` is empty/``None`` or cannot be parsed by
    ``email.utils.parsedate_tz()``.
    """
    # Nothing to parse; avoid handing None/"" to the parser.
    if not timestr:
        return None

    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is None:
        return None
    return email.utils.mktime_tz(timetuple)
def get_cdn_url(video_id):
    """Return the CDN URL for a given video ID.

    Fetches the Yahoo! Video playlist document for ``video_id`` and extracts
    the stream URL from its ``<STREAM APP=... FULLPATH=...>`` element.  The
    approach is borrowed from youtube-dl.

    Raises YahooException when the playlist cannot be fetched or when no
    stream element can be found in it.
    """
    yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'
    yv_bitrate = '700'
    yv_video_height = '200'
    yv_video_width = '200'

    # NOTE(review): the endpoint host/path and part of the query string
    # (rd, tk, adsupported) were missing from this file as found; they were
    # restored from youtube-dl's Yahoo! Video extractor -- please confirm.
    playlist_url = (
        'http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php'
        '?node_id=' + video_id +
        '&tech=flash&mode=playlist&lg=' + yv_lg +
        '&bitrate=' + yv_bitrate +
        '&vidH=' + yv_video_height +
        '&vidW=' + yv_video_width +
        '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,'
        '&eventid=1301797'
    )
    request = urllib2.Request(playlist_url)

    try:
        response = urllib2.urlopen(request)
        try:
            webpage = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except (urllib2.URLError, httplib.HTTPException, socket.error):
        raise YahooException('Could not fetch playlist file')

    mobj = re.search(
        r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
    if mobj is None:
        raise YahooException('Could not parse playlist file')

    # The stream URL is the APP prefix plus the FULLPATH, percent-decoded and
    # with any HTML entities replaced by their characters.
    video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
    video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
    return video_url
def get_timestamp(cdn_url):
    """Return the timestamp of a video file.

    Sends a HEAD request for ``cdn_url`` and converts the response's
    ``Last-Modified`` header to an integer number of seconds since the
    beginning of the Unix epoch.

    Raises YahooException when the request fails, when the response has no
    ``Last-Modified`` header, or when that header cannot be parsed.
    """
    request = HeadRequest(cdn_url)
    try:
        result = urllib2.urlopen(request)
    except (urllib2.URLError, httplib.HTTPException, socket.error):
        raise YahooException('Failed to fetch information from CDN')

    try:
        # NOTE(review): the header accessor was truncated in this file as
        # found; ``info().get(...)`` is the reconstructed lookup -- confirm.
        timestring = result.info().get('last-modified')
    finally:
        # Only the headers are needed; release the connection right away.
        result.close()

    if timestring is None:
        raise YahooException('Yahoo failed to mention when the file was last-modified.')

    timestamp = timeconvert(timestring)
    if timestamp is None:
        raise YahooException('Stupid Yahoo, why you no learn how to use a calendar?')
    return int(timestamp)
def fetch_timestamp(video_id):
    """Return the last-modified Unix timestamp of the video ``video_id``.

    Resolves the video's CDN URL with ``get_cdn_url()`` and passes it to
    ``get_timestamp()``.  A YahooException raised by either step propagates
    to the caller.
    """
    cdn_url = get_cdn_url(video_id)
    return get_timestamp(cdn_url)
# Rebind our sockets: when the BIND environment variable names a local IP
# address, every socket created through socket.socket() is bound to it.
# If no IP is specified the operating system's default is used.
true_socket = socket.socket


def bound_socket(*a, **k):
    """Create a socket and bind it to the address in ``$BIND``, if set.

    Accepts the same arguments as the real ``socket.socket`` (saved as
    ``true_socket``) and returns the new socket object.  The local port is
    0, so the operating system picks one.
    """
    sock = true_socket(*a, **k)
    ip = os.environ.get('BIND')
    # Only bind when an address was requested.  An unbound socket already
    # gets the default address on connect, and skipping the bind keeps
    # sockets usable that bind themselves later or that do not take a
    # (host, port) address.
    if ip:
        try:
            sock.bind((ip, 0))
        except Exception:
            # Do not leak the descriptor when the address is unusable.
            sock.close()
            raise
    return sock


socket.socket = bound_socket
# NOTE(review): TIME (target number of seconds per processed video) is read
# by the throttle below, but no assignment to it is visible in this file.
# Keep an existing value if there is one; the 1.0 fallback is a placeholder
# to be confirmed.
TIME = globals().get('TIME', 1.0)

# Input lines of interest look like "<user_id> <video_id> ...".
_LINE_RE = re.compile(r"^(\d+) (\d+)")


def main(argv):
    """Look up the timestamp of every video listed in the file ``argv[1]``.

    Each input line starting with ``<user_id> <video_id>`` (both numeric)
    results in a ``user_id video_id timestamp`` line on stdout; other lines
    are skipped.  Progress and failures are reported on stderr.  After each
    processed line the loop sleeps so that one lookup takes about TIME
    seconds in total, or 0.1 s when the lookup already took longer.

    Returns the process exit status: 0 on success, 1 on a usage error.
    """
    if len(argv) != 2:
        prog = argv[0] if argv else "yahoo-timestamps"
        sys.stderr.write("usage: %s missing-timestamps.txt\n" % prog)
        return 1

    with open(argv[1], "r") as fd:
        for line in fd:
            # Wall-clock timer: the lookup is dominated by network waits,
            # which a CPU-time clock such as time.clock() does not count.
            started = timeit.default_timer()
            mo = _LINE_RE.match(line)
            if not mo:
                continue
            user_id = mo.group(1)
            video_id = mo.group(2)

            try:
                ts = fetch_timestamp(video_id)
            except YahooException as err:
                # One failed lookup should not abort the rest of the batch.
                sys.stderr.write("failed to fetch timestamp for video %s: %s. "
                                 % (video_id, err))
            else:
                sys.stdout.write("%s %s %d\n" % (user_id, video_id, ts))
                sys.stderr.write("timestamp for video %s was %d. "
                                 % (video_id, ts))

            delta = timeit.default_timer() - started
            if delta < TIME:
                slp = TIME - delta
            else:
                slp = 0.1
            sys.stderr.write("took %0.4f s, sleeping for %0.4f s\n"
                             % (delta, slp))
            time.sleep(slp)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment