Skip to content

Instantly share code, notes, and snippets.

@guanix
Created March 8, 2011 19:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save guanix/860790 to your computer and use it in GitHub Desktop.
Save guanix/860790 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# Author: Vasyl' Vavrychuk
# Author: Witold Baryluk
# Author: Paweł Paprota
# Author: Guan Yang
# Author: David Triendl
# License: Public domain code
# Time budget, in seconds, for handling one input line (fetch plus pause).
# The main loop at the bottom of the file compares the time a fetch took
# against this value and sleeps for the part of the budget that is left.
TIME=1.1
import datetime
import email.utils
import htmlentitydefs
import os, sys, time
import timeit
import re
import socket
import urllib
import urllib2
import tempfile
import httplib
class YahooException(Exception):
    """Error raised when a request to Yahoo fails or returns unusable data.

    The constructor argument is stored on ``parameter`` and its ``repr()``
    is used as the string form of the exception.
    """

    def __init__(self, value):
        # Pass the value on to Exception so that ``args`` is populated;
        # without this the exception carries no arguments, which breaks
        # pickling and any generic handler that inspects ``args``.
        Exception.__init__(self, value)
        self.parameter = value

    def __str__(self):
        return repr(self.parameter)
class HeadRequest(urllib2.Request):
    """A urllib2 request whose HTTP method is reported as HEAD."""

    def get_method(self):
        """Return the HTTP method name to use for this request."""
        return 'HEAD'
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the leading '&' and the trailing ';'.

    Named entities are looked up in htmlentitydefs.name2codepoint. Numeric
    character references are accepted in decimal ('#233') and hexadecimal
    ('#xE9' or '#XE9') form. Anything else, including a code point that
    unichr() rejects, is returned unchanged as '&entity;'.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. The hexadecimal branch has to accept the
    # digits a-f (a plain \d does not), and the pattern is anchored so that
    # trailing junk such as '#38abc' is not silently dropped.
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            codepoint = int(numstr[1:], 16)
        else:
            codepoint = int(numstr, 10)
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Code point outside what unichr() supports: fall through and
            # keep the original text rather than aborting the substitution.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
def timeconvert(timestr):
    """Turn an RFC 2822 date string into a timestamp.

    Returns None when email.utils.parsedate_tz() cannot parse the string;
    otherwise returns the value email.utils.mktime_tz() computes for it.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
def get_cdn_url(video_id):
    """Returns the CDN URL for a given video ID.

    Fetches some magic Yahoo! Video site to get the super-secret CDN URL. Code
    shamelessly stolen from youtube-dl.

    video_id is concatenated into the query string as-is, so it must be a
    string. The returned URL is a unicode string with percent-escapes and
    HTML entities decoded.

    Raises YahooException when the playlist cannot be downloaded or when no
    STREAM element with an .flv path is found in it.
    """
    yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'
    yv_bitrate = '700'
    yv_video_height = '200'
    yv_video_width = '200'

    playlist_url = (
        'http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php'
        '?node_id=' + video_id +
        '&tech=flash&mode=playlist&lg=' + yv_lg +
        '&bitrate=' + yv_bitrate +
        '&vidH=' + yv_video_height +
        '&vidW=' + yv_video_width +
        '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797'
    )
    request = urllib2.Request(playlist_url)

    try:
        response = urllib2.urlopen(request)
        try:
            webpage = response.read()
        finally:
            # Release the connection explicitly; this function is called
            # once per input line, so do not rely on garbage collection.
            response.close()
    except (urllib2.URLError, httplib.HTTPException, socket.error):
        raise YahooException('Could not fetch playlist file')

    # Group 1 is the stream base URL, group 2 the path of the .flv file
    # including its query string.
    mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
    if mobj is None:
        raise YahooException('Could not parse playlist file')

    video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
    video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
    return video_url
def get_timestamp(cdn_url):
    """Returns the timestamp of a video file.

    Gives HEAD to Yahoo to get the timestamp for a video file and returns it as
    integer in seconds since the beginning of the Unix epoch.

    Raises YahooException when the request fails, when the response has no
    Last-Modified header, or when that header cannot be parsed as a date.
    """
    request = HeadRequest(cdn_url)
    try:
        result = urllib2.urlopen(request)
    except (urllib2.URLError, httplib.HTTPException, socket.error):
        raise YahooException('Failed to fetch information from CDN')

    try:
        timestring = result.info().get('last-modified')
    finally:
        # Only the headers are needed; close the response right away so the
        # connection is not left open while the caller keeps looping.
        result.close()

    if timestring is None:
        raise YahooException('Yahoo failed to mention when the file was last-modified.')

    timestamp = timeconvert(timestring)
    if timestamp is None:
        raise YahooException('Stupid Yahoo, why you no learn how to use a calendar?')
    return int(timestamp)
def fetch_timestamp(video_id):
    """Look up the CDN URL for video_id and return that file's timestamp.

    Simply chains get_cdn_url() and get_timestamp(); whatever those raise
    is passed through to the caller.
    """
    return get_timestamp(get_cdn_url(video_id))
# Replace socket.socket with a wrapper that binds every new socket to a
# local address. The original constructor is kept so the wrapper can use it.
true_socket = socket.socket


def bound_socket(*a, **k):
    """Create a socket with the original constructor and bind it locally.

    The local address is read from the BIND environment variable; when the
    variable is unset or empty, "0.0.0.0" is used. The port is always 0.
    """
    local_ip = os.environ.get('BIND') or "0.0.0.0"
    sock = true_socket(*a, **k)
    sock.bind((local_ip, 0))
    return sock


socket.socket = bound_socket
# Command-line driver. Expects one argument: a text file whose lines start
# with "<user_id> <video_id>" (both numeric). For every such line the
# video's timestamp is fetched and "<user_id> <video_id> <timestamp>" is
# written to stdout; progress information goes to stderr.
if len(sys.argv) != 2:
    sys.stderr.write("./timestamp.py missing-timestamps.txt\n")
    sys.exit(1)

ro = re.compile(r"^(\d+) (\d+)")

# 'with' makes sure the input file is closed even if a fetch raises.
with open(sys.argv[1], "r") as fd:
    for line in fd:
        # Use wall-clock time: time.clock() reports CPU time on Unix, which
        # does not include the time spent waiting on the network.
        t1 = time.time()
        mo = ro.match(line)
        if not mo:
            # Lines that do not start with two numbers are skipped.
            continue
        user_id = mo.group(1)
        video_id = mo.group(2)
        ts = fetch_timestamp(video_id)
        sys.stdout.write("%s %s %d\n" % (user_id, video_id, ts))
        # Flush per record so results survive if a later fetch fails.
        sys.stdout.flush()
        sys.stderr.write("timestamp for video %s was %d. " % (video_id, ts))
        t2 = time.time()
        delta = t2 - t1
        # Pad each iteration up to TIME seconds; if the fetch already took
        # longer than that, still pause briefly before the next request.
        if delta < TIME:
            slp = TIME - delta
        else:
            slp = 0.1
        sys.stderr.write("took %0.4f s, sleeping for %0.4f s\n" % (delta, slp))
        # Sleep for the computed value; TIME - delta would be negative
        # (and make time.sleep() raise) whenever the fetch ran over budget.
        time.sleep(slp)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment