Skip to content

Instantly share code, notes, and snippets.

@fqrouter
Last active February 3, 2024 06:11
Show Gist options
  • Star 34 You must be signed in to star a gist
  • Fork 20 You must be signed in to fork a gist
  • Save fqrouter/9602380 to your computer and use it in GitHub Desktop.
Youtube Reverse Proxy
# Resolve the per-request upstream hostnames (assigned with "set" below)
# via Google public DNS.
resolver 8.8.8.8;
# /video/<subdomain>/<path> -> https://<subdomain>.googlevideo.com/<path>
location /video/ {
    if ($request_uri ~ "^/video/(.+?)/.+") {
        set $upstream_host $1.googlevideo.com;
        # Ask the browser to download the stream as a file named video.mp4.
        add_header Content-Disposition "attachment; filename=video.mp4;";
    }
    # Strip the /video/<subdomain> prefix before proxying upstream.
    rewrite /video/.+?/(.+)$ /$1 break;
    proxy_buffering off;
    proxy_pass https://$upstream_host;
    proxy_set_header Host $upstream_host;
}
# /image/<subdomain>/<path> -> http://<subdomain>.ytimg.com/<path>
location /image/ {
    if ($request_uri ~ "^/image/(.+?)/.+") {
        set $upstream_host $1.ytimg.com;
    }
    rewrite /image/.+?/(.+)$ /$1 break;
    proxy_buffering off;
    proxy_pass http://$upstream_host;
    proxy_set_header Host $upstream_host;
}
# /photo/<subdomain>/<path> -> http://<subdomain>.ggpht.com/<path>
location /photo/ {
    if ($request_uri ~ "^/photo/(.+?)/.+") {
        set $upstream_host $1.ggpht.com;
    }
    rewrite /photo/.+?/(.+)$ /$1 break;
    proxy_buffering off;
    proxy_pass http://$upstream_host;
    proxy_set_header Host $upstream_host;
}
#!/usr/bin/env python
import logging
import httplib
import os
import subprocess
import socket
import datetime
import random
import signal
import urllib2
import urlparse
import urllib
import re
from gevent.wsgi import WSGIServer
import gevent.monkey
import gevent.pool
import gevent
import redis
import cgi
import functools
import Cookie
# Short-lived cache mapping video id -> resolved (already rewritten) URL.
# NOTE(review): created before gevent.monkey.patch_all() below; gevent
# recommends patching before anything that may open sockets — presumably
# StrictRedis connects lazily, but confirm.
REDIS = redis.StrictRedis()
gevent.monkey.patch_all(subprocess=True)
# Bounds how many youtube-dl subprocesses run concurrently (see get_url).
proc_pool = gevent.pool.Pool(size=16)
LOGGER = logging.getLogger(__name__)
# server name -> list of [worker_host, alive_flag] entries; alive_flag is
# refreshed hourly by refresh_workers() and consulted by pick_worker().
WORKERS = {
} # you need to fill this
LISTEN_IP = ''
LISTEN_PORT = 3000
# Patterns matching YouTube CDN host references inside page bodies; each is
# rewritten to a path on this proxy by the replace_* helpers below.
RE_YTIMG_CSS = re.compile(r'/s.ytimg\.com(.*?\.css)', re.IGNORECASE)
RE_YTIMG_ESC = re.compile(r'\\/([a-zA-Z0-9-]+?)\.ytimg\.com', re.IGNORECASE)
RE_YTIMG = re.compile(r'/([a-zA-Z0-9-]+?)\.ytimg\.com', re.IGNORECASE)
RE_GGPHT = re.compile(r'https://([a-zA-Z0-9-]+?)\.ggpht\.com', re.IGNORECASE)
RE_GOOGLEVIDEO = re.compile(r'/([a-zA-Z0-9-]+?)\.googlevideo\.com', re.IGNORECASE)
def handle_request(environ, start_response):
    """WSGI entry point: delegate to handle() and stream its response lines.

    Any unhandled exception is logged and converted into a 500 response
    with caching disabled, so clients do not cache the failure page.
    """
    method = environ.get('REQUEST_METHOD')
    try:
        # handle() receives a wrapper that turns numeric status codes into
        # the 'NNN Reason' strings WSGI requires.
        lines = handle(environ, lambda status, headers: start_response(get_http_response(status), headers))
    except Exception:
        # fix: was a bare `except:`, which also swallowed SystemExit,
        # KeyboardInterrupt and GeneratorExit.
        path = environ.get('PATH_INFO', '').strip('/')
        LOGGER.exception('failed to handle request: %s %s' % (method, path))
        start_response('500 INTERNAL_SERVER_ERROR', [
            ('Content-Type', 'text/javascript'),
            ('Cache-Control', 'no-cache, no-store, must-revalidate'),
            ('Pragma', 'no-cache'),
            ('Expires', '0')])
        lines = ['Retry in 30 minutes']
    for line in lines:
        yield line
def get_http_response(code):
if code not in httplib.responses:
return code
return '%s %s' % (code, httplib.responses[code])
def replace_ytimg_css(match):
    """Rewrite a /s.ytimg.com stylesheet reference to this proxy's /css/ path."""
    css_path = match.group(1)
    return '/your-reverse-proxy-ip/css/%s' % css_path
def replace_ytimg_esc(match):
    """Rewrite a JSON-escaped ytimg host (\\/xx.ytimg.com) to a worker's /image/ path."""
    worker_host = pick_worker()[0]
    subdomain = match.group(1)
    return '\\/%s\\/image/%s' % (worker_host, subdomain)
def replace_ytimg(match):
    """Rewrite /xx.ytimg.com to /<worker>/image/xx on a randomly picked worker."""
    worker_host = pick_worker()[0]
    return '/%s/image/%s' % (worker_host, match.group(1))
def replace_ggpht(match):
    """Rewrite https://xx.ggpht.com to http://<worker>/photo/xx on a picked worker."""
    worker_host = pick_worker()[0]
    return 'http://%s/photo/%s' % (worker_host, match.group(1))
def replace_googlevideo(worker, match):
    """Rewrite /xx.googlevideo.com to /<worker-host>/video/xx for the given worker.

    `worker` is a [host, alive_flag] entry; only the host is used here.
    """
    worker_host = worker[0]
    return '/%s/video/%s' % (worker_host, match.group(1))
def handle(environ, start_response):
    """Dispatch a proxied YouTube request based on its path.

    Routes /watch and /watch_videos to the video resolver, /css/ to the
    stylesheet proxy, /t/ to a redirect stub, and everything else to an
    HTTP fetch of the same path on youtube.com, with the response body
    rewritten so CDN references point back at this proxy.
    """
    host = 'youtube.com'
    path = environ.get('PATH_INFO', '')
    if '/watch' == path:
        # /watch?v=<id> -> resolve the media URL and redirect to it
        video_id = urlparse.parse_qs(environ['QUERY_STRING'])['v'][0]
        return handle_watch(video_id, environ, start_response)
    if '/watch_videos' == path:
        # playlist form: only the first id in video_ids is resolved
        video_id = urlparse.parse_qs(environ['QUERY_STRING'])['video_ids'][0].split(',')[0]
        return handle_watch(video_id, environ, start_response)
    if path.startswith('/css/'):
        upstream_url = path.replace('/css/', '')
        upstream_url = 'http://s.ytimg.com/%s' % upstream_url
        return handle_css(upstream_url, environ, start_response)
    if path.startswith('/t/'):
        # /t/<domain>.js -> tiny script that bounces the browser to <domain>
        domain = path.replace('/t/', '').replace('.js', '')
        words = 'window.location.href="http://%s";' % domain
        start_response(httplib.OK, [
            ('Content-Type', 'text/javascript'),
            ('Cache-Control', 'no-cache, no-store, must-revalidate'),
            ('Pragma', 'no-cache'),
            ('Expires', '0')])
        return [words]
    if path.startswith('//'):
        # protocol-relative path: redirect to the absolute http:// URL
        start_response(httplib.FOUND, [
            ('Location', 'http:%s' % path)
        ])
        return []
    data = None
    if 'POST' == environ['REQUEST_METHOD']:
        if '/results' == path:
            # search form: re-encode only the search_query field as a GET
            post_body = cgi.FieldStorage(
                fp=environ['wsgi.input'],
                environ=environ,
                keep_blank_values=True)
            upstream_url = 'http://youtube.com/results?%s' % urllib.urlencode({'search_query': post_body['search_query'].value})
        else:
            # NOTE(review): only the first line of the POST body is forwarded
            data = environ['wsgi.input'].readline()
            upstream_url = 'http://%s%s' % (host, path)
    else:
        upstream_url = 'http://%s%s' % (host, path)
    # NOTE(review): indentation was lost in this copy; this reconstruction
    # appends the query string for every branch — confirm against the
    # original, where it may have been scoped to the GET branch only.
    if environ['QUERY_STRING']:
        upstream_url = '%s?%s' % (upstream_url, environ['QUERY_STRING'])
    LOGGER.info('upstream url: %s' % upstream_url)
    headers = {}
    if environ.get('HTTP_COOKIE'):
        LOGGER.info('cookie is: %s' % environ.get('HTTP_COOKIE'))
        headers['Cookie'] = environ.get('HTTP_COOKIE')
    try:
        response = urllib2.urlopen(urllib2.Request(upstream_url, data=data, headers=headers))
    except urllib2.HTTPError as e:
        # pass upstream HTTP error responses through unchanged
        start_response(e.code, [(k, v) for k, v in e.hdrs.items()])
        return [e.msg]
    except:
        # anything else propagates to handle_request's 500 handler
        raise
    headers = []
    for k, v in response.headers.items():
        if 'set-cookie' == k.lower():
            # re-scope cookies from youtube.com to this proxy's host
            v = v.replace('domain=.youtube.com;', '')
        if 'x-frame' in k.lower():
            # drop X-Frame-Options-style headers so pages can be framed
            continue
        headers.append((k, v))
    start_response(httplib.OK, headers)
    body = response.read()
    # rewrite CDN host references in the body to point back at this proxy
    body = RE_YTIMG_CSS.sub(replace_ytimg_css, body)
    body = RE_YTIMG_ESC.sub(replace_ytimg_esc, body)
    body = RE_YTIMG.sub(replace_ytimg, body)
    body = RE_GGPHT.sub(replace_ggpht, body)
    # body = body.replace('class="search-form', 'method="POST" class="search-form')
    # hide masthead / ad containers by replacing their attributes in-place
    body = body.replace('class="video-masthead">', 'style="display: none;">')
    body = body.replace('class="branded-page-v2-top-row">', 'style="display: none;">')
    body = body.replace('style="z-index: 1">', 'style="display: none;">')
    body = body.replace('style="z-index: 1;">', 'style="display: none;">')
    body = body.replace('class="premium-yva-unexpanded"', 'style="display: none;"')
    # body = body.replace('id="masthead-search"', 'style="position: relative; padding: 0; margin-top: 3px; overflow: hidden;"')
    # black-hole the ad server and point absolute links back at the proxy
    body = body.replace('ad.doubleclick.net', '127.0.0.1')
    body = body.replace('www.youtube.com', 'your-reverse-proxy-ip')
    return [body]
def handle_css(upstream_url, environ, start_response):
    """Proxy a stylesheet from s.ytimg.com, rewriting ytimg hosts inside it.

    Returns the rewritten stylesheet body as a single-element list.
    """
    response = urllib2.urlopen(urllib2.Request(upstream_url))
    try:
        headers = []
        for k, v in response.headers.items():
            if 'set-cookie' == k.lower():
                # re-scope cookies from youtube.com to this proxy's host
                v = v.replace('domain=.youtube.com;', '')
            headers.append((k, v))
        start_response(httplib.OK, headers)
        body = response.read()
    finally:
        # fix: the upstream connection was never closed (leaked socket)
        response.close()
    body = RE_YTIMG.sub(replace_ytimg, body)
    return [body]
def handle_watch(video_id, environ, start_response):
    """Resolve a video id to a worker-proxied media URL and redirect to it.

    Resolutions are cached in redis for 3 minutes. On a cache miss the URL
    is fetched via youtube-dl, sanity-checked, rewritten to go through a
    worker, and validated with a probe request before being cached.

    BUG FIX: cached URLs are already rewritten to a worker host and no
    longer contain 'googlevideo.com', so the sanity check and rewrite loop
    must only run on a cache miss — previously every cache hit fell through
    to the check and was rejected with 502.
    """
    video_url = REDIS.get(video_id)
    if video_url:
        LOGGER.info('%s hit cache' % video_id)
    else:
        LOGGER.info('get url for movie: %s' % video_id)
        try:
            # run youtube-dl in the bounded subprocess pool
            video_url = proc_pool.spawn(get_url, video_id).get()
        except Exception:
            LOGGER.exception('failed to get url')
            start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
            return ['no valid url']
        if 'googlevideo.com' not in video_url:
            LOGGER.error('googlevideo.com not in url: %s' % video_url)
            start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
            return ['no valid url']
        video_url = video_url.replace('https://', 'http://')
        history = set()
        success = False
        # try up to 3 workers until one serves the rewritten URL correctly
        for i in range(3):
            worker = pick_worker(history)
            try_url = RE_GOOGLEVIDEO.sub(functools.partial(replace_googlevideo, worker), video_url)
            if is_url_correct(try_url):
                video_url = try_url
                success = True
                break
        if not success:
            start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
            return ['no valid url']
        REDIS.set(video_id, video_url)
        REDIS.expire(video_id, 60 * 3)
        LOGGER.info('got url for movie: %s %s' % (video_id, video_url))
    start_response(httplib.FOUND, [
        ('Location', video_url),
        ('Content-Type', 'text/plain'),
        ('Cache-Control', 'max-age=180')
    ])
    return ['you can use this link to download the movie']
def get_url(video_id):
    """Ask youtube-dl for the direct media URL of the given video id.

    Raises Exception('evil') for ids outside the YouTube id alphabet.

    SECURITY FIX: the original interpolated video_id into a shell=True
    command line and only rejected '/', leaving ';', '$', '&', backticks
    etc. exploitable. Run youtube-dl without a shell and whitelist the id.
    """
    if not re.match(r'^[A-Za-z0-9_-]+$', video_id):
        raise Exception('evil')
    return subprocess.check_output(
        ['youtube-dl', 'http://www.youtube.com/watch?v=%s' % video_id, '-g']).strip()
def serve_forever():
    """Create the gevent WSGI server and serve until the process exits.

    If the server cannot be created, log the failure and hard-exit the
    process so a supervisor can restart it.
    """
    try:
        server = WSGIServer((LISTEN_IP, LISTEN_PORT), handle_request)
        LOGGER.info('serving HTTP on port %s:%s...' % (LISTEN_IP, LISTEN_PORT))
    except Exception:
        # fix: was a bare `except:` which also caught SystemExit/KeyboardInterrupt
        LOGGER.exception('failed to start HTTP server on port %s:%s' % (LISTEN_IP, LISTEN_PORT))
        os._exit(1)
    server.serve_forever()
def pick_worker(history=()):
    """Pick a random live worker entry ([host, alive_flag]).

    history: iterable of server names already tried and to be skipped.
    Raises Exception('no worker') when no live worker remains.

    FIX: the original re-picked randomly among ALL servers whenever it hit
    a dead worker (ignoring servers it had already excluded), so in the
    worst case the recursion never terminated. Filter excluded servers and
    dead workers up front instead.
    """
    excluded = set(history)
    candidates = [name for name in WORKERS if name not in excluded]
    random.shuffle(candidates)
    for server_name in candidates:
        live = [worker for worker in WORKERS[server_name] if worker[1]]
        if live:
            return random.choice(live)
    raise Exception('no worker')
def is_url_correct(url):
    """Probe a rewritten video URL; True iff it answers 200 without redirecting."""
    class NoRedirectHandler(urllib2.HTTPRedirectHandler):
        # Return the 3xx response itself instead of following Location,
        # so a redirecting worker is reported as a failure below.
        def http_error_302(self, req, fp, code, msg, headers):
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
            infourl.status = code
            infourl.code = code
            return infourl
        http_error_300 = http_error_302
        http_error_301 = http_error_302
        http_error_303 = http_error_302
        http_error_307 = http_error_302
    try:
        opener = urllib2.build_opener(NoRedirectHandler())
        response = opener.open(url)
        response.close()
        if 200 == response.code:
            return True
        else:
            LOGGER.error('status code %s for url %s' % (response.code, url))
            return False
    except Exception:
        # fix: was a bare `except:` which also swallowed SystemExit/KeyboardInterrupt
        LOGGER.exception('try url failed: %s' % url)
        return False
def refresh_workers():
    """Background loop: re-probe every worker's liveness once an hour.

    Mutates each [host, alive_flag] entry in WORKERS in place so that
    pick_worker() sees fresh flags.
    """
    while True:
        for server_workers in WORKERS.values():
            for entry in server_workers:
                entry[1] = is_worker_alive(entry[0])
        LOGGER.info('%s refreshed workers' % datetime.datetime.now())
        gevent.sleep(60 * 60)
def is_worker_alive(worker_host):
    """Probe a worker by fetching a known thumbnail path through it.

    Returns True when the fetch succeeds within 3 seconds, False otherwise.
    """
    try:
        urllib2.urlopen('http://%s/image/i1/vi/tLcfAnN2QgY/mqdefault.jpg' % worker_host, timeout=3).close()
        LOGGER.info('%s => OK' % worker_host)
        return True
    except Exception:
        # fix: was a bare `except:` which also swallowed SystemExit/KeyboardInterrupt
        LOGGER.info('%s => FAILURE' % worker_host)
        return False
# Exit immediately on Ctrl-C instead of unwinding greenlets one by one.
signal.signal(signal.SIGINT, lambda signum, frame: os._exit(0))
logging.basicConfig(level=logging.DEBUG)
# Keep worker liveness flags fresh in the background, then serve until killed.
gevent.spawn(refresh_workers)
serve_forever()
@x2c3z4
Copy link

x2c3z4 commented May 10, 2016

下面python是个代理吧，转到上面那个nginx？ (Translation: The Python below is a proxy that forwards to the nginx config above, right?)
现在能用么？ (Translation: Does it still work now?)

@liruqi
Copy link

liruqi commented Nov 29, 2017

WORKERS 里面的配置什么格式？your-reverse-proxy-ip 可以通过配置读取吗？ (Translation: What format does the WORKERS configuration use? Can your-reverse-proxy-ip be read from configuration?)

@rozdemir
Copy link

rozdemir commented Oct 4, 2018

Hi, what is the WORKERS format?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment