Last active
February 8, 2018 04:09
-
-
Save nixjdm/4309e5c20486256223bad2ff21c40ef5 to your computer and use it in GitHub Desktop.
A simple proxy server you run locally. A newer and better version of this can be found at https://github.com/terminal-labs/simple-proxy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# | |
# This will route a webpage and all of its included src and hrefs | |
# calls through this proxy. | |
# | |
# This is meant to be used in a virtualenv with pip installed packages: | |
# tornado beautifulsoup4 requests lxml | |
# Note: the proxy terminal output will display errors related to | |
# urllib3 and SSL unless python >= 2.7.9 is used | |
# | |
# Usage (URL Format): [ip address]:[port]/[website address] | |
# e.g. 192.168.0.2:8000/www.example.com/file/path?query | |
from urlparse import urlparse | |
from tornado import ioloop, web, template | |
from tornado.httputil import HTTPHeaders | |
from bs4 import BeautifulSoup | |
import requests | |
PORT = 9001  # TCP port the proxy listens on; change me if you want
class MainHandler(web.RequestHandler):
    '''
    Tornado handler that fetches the URL encoded in the request path and
    serves it back, rewriting asset links in HTML pages so that they are
    also fetched through this proxy.

    URL format: [ip address]:[port]/[website address]
    e.g. 192.168.0.2:8000/www.example.com/file/path?query
    '''

    def __init__(self, *args, **kwargs):
        super(MainHandler, self).__init__(*args, **kwargs)
        url = self.request.uri[1:]  # strip the preceding forward slash
        # Prefix check (not substring): a path merely *containing* "http"
        # must still get a scheme prepended.
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        parsed = urlparse(url)
        # Origin of the target site, used to absolutize relative links.
        self.host = parsed.scheme + '://' + parsed.netloc
        # Address of this proxy itself, prepended to every rewritten url.
        self.proxy = self.request.protocol + '://' + self.request.host
        r = requests.get(url)
        if "html" in r.headers['content-type']:
            soup = BeautifulSoup(r.text, 'lxml')  # lxml - don't correct any messed up html
            for asset in soup.find_all(['img', 'script', 'link']):
                # Rewrite both 'src' (img/script) and 'href' (link) so ALL
                # referenced assets are routed through the proxy, as the
                # header comment promises.
                if asset.has_attr('src'):
                    self.url_fix(asset, 'src')
                elif asset.has_attr('href'):
                    self.url_fix(asset, 'href')
            self.data = soup.prettify()  # soup ingested and parsed html. Urls modified.
        else:
            self.data = r.text  # unparsed raw data (css, js, png, ...). All urls unmodified.
        # critical to have resource files interpreted correctly
        self.set_header('content-type', r.headers['content-type'])

    def url_fix(self, asset, attr):
        '''
        Take self, assets (the html element), and their attrs (src or href),
        and set the corrected attr by making all links absolute and run them
        through this proxy.
        '''
        if not urlparse(asset[attr]).hostname:  # relative url - make it absolute
            asset[attr] = '{0}{1}'.format(self.host, asset[attr])
        elif asset[attr][0:2] == '//':  # protocol relatives prefixed with '//'
            asset[attr] = self.request.protocol + '://' + asset[attr][2:]
        asset[attr] = '{0}/{1}'.format(self.proxy, asset[attr])  # proxify all urls
        return

    def get(self):
        # Data was already fetched and rewritten in __init__; just emit it.
        self.write(self.data)
def make_app():
    '''Build the tornado application: a single catch-all route sends
    every request path to MainHandler.'''
    routes = [
        (r"^.*", MainHandler),
    ]
    return web.Application(routes)
if __name__ == "__main__": | |
app = make_app() | |
app.listen(PORT, address="0.0.0.0") | |
ioloop.IOLoop.current().start() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment