Last active
December 19, 2015 02:49
-
-
Save bbengfort/5885901 to your computer and use it in GitHub Desktop.
An example showing how to fetch multiple links from the same host with the same, open httplib.HTTPConnection object. The real question is what happens when you send a Connection:Close header.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# multiconn.py | |
# | |
# Author: Benjamin Bengfort <benjamin@bengfort.com> | |
# $ID: multiconn.py [1] benjamin@bengfort.com $ | |
""" | |
An example showing how to fetch multiple links from the same host with the | |
same, open httplib.HTTPConnection object. This is the basis for an attempt | |
at conncurrent page fetching. The real question is what happens when you | |
send a "Connection: Close" header along with the request. | |
""" | |
########################################################################### | |
## Imports | |
########################################################################### | |
from bs4 import BeautifulSoup | |
from datetime import datetime | |
from httplib import HTTPConnection | |
from urlparse import urlparse, urlunparse | |
########################################################################### | |
## The HTTP Client that can make multiple requests | |
########################################################################### | |
class MultiRequestConnection(object):
    """
    Issue several requests to one host over a single connection object.

    Instantiate this class with the netloc (in urlparse terms) and then
    open and close the connection with `connect` and `close` respectively.
    Open connections can issue `GET` requests with that method, or you can
    use `fetch` to get all the first level links from the given path.
    """

    # Only links with one of these schemes are followed; an empty scheme
    # means the href was relative. This keeps out things like "mailto:" or
    # "javascript:" links, which also parse with an empty netloc.
    FOLLOWED_SCHEMES = ('', 'http', 'https')

    def __init__(self, host):
        """
        Remember the host; no connection object is created until `connect`.
        """
        self.conn = None
        self.host = host

    def connect(self):
        """
        Opens the connection to the host.
        """
        self.conn = HTTPConnection(self.host)
        print("Connection to %s opened" % self.host)

    def close(self):
        """
        Closes the connection to the host and resets state.

        Calling this when no connection is open is a silent no-op, so it is
        always safe to use in cleanup code.
        """
        if self.conn is None:
            return

        self.conn.close()
        self.conn = None
        print("Connection to %s closed" % self.host)

    def GET(self, path, body='', headers=None):
        """
        Issue a GET request to the path with the body and headers.

        `headers` defaults to no extra headers. Returns the fully read
        response body and logs the request. Raises `Exception` if `connect`
        has not been called yet.
        """
        if self.conn is None:
            raise Exception("Host not connected")

        # A fresh dict per call instead of a shared mutable default.
        if headers is None:
            headers = {}

        self.conn.request("GET", path, body, headers)
        response = self.conn.getresponse()
        self.log(path, 'GET', response)

        # The body is read in full before returning so that the same
        # connection can be used for the next request.
        return response.read()

    def log(self, path, method, response):
        """
        Kind of like Apache's Common Log format.

        Prints one line with the host, a timestamp, the request line, and
        the status and reason taken from the response.
        """
        fields = {
            'host': self.host,
            # datetime.now() is naive, so %z expands to an empty string.
            'time': datetime.now().strftime("%d/%b/%Y:%H:%M:%S %z"),
            'method': method,
            'path': path,
            'version': 'HTTP/1.1',
            'status': response.status,
            'reason': response.reason,
        }
        print('%(host)s [%(time)s] "%(method)s %(path)s %(version)s" %(status)s %(reason)s' % fields)

    def links_from_response(self, response):
        """
        Fetches all the first level links from an html document, where the
        links are part of the same host as the open connection (e.g.
        relative links or links that start with the same netloc).

        This is a generator of request paths (path, params and query only;
        scheme, netloc and fragment are dropped). Anchors without an `href`
        attribute and links with a non-HTTP scheme are skipped.
        """
        soup = BeautifulSoup(response)
        for link in soup.find_all('a'):
            href = link.get('href')
            if href is None:
                # e.g. <a name="top"> -- there is nothing to follow
                continue

            href = urlparse(href)
            if href.scheme not in self.FOLLOWED_SCHEMES:
                continue

            if href.netloc in (self.host, ''):
                yield urlunparse(('', '', href.path or '/', href.params, href.query, ''))

    def fetch(self, path='/'):
        """
        Performs a GET request on the given path, parses out all the first
        level links in the same host, and then fetches those as well.

        Nothing is returned, but this can easily be overridden in
        subclasses. The connection is opened if needed and is always closed
        afterwards, even when one of the requests raises.
        """
        if self.conn is None:
            self.connect()

        try:
            response = self.GET(path)
            for link in self.links_from_response(response):
                self.GET(link)
        finally:
            self.close()
########################################################################### | |
## Testing in the Main Method | |
########################################################################### | |
if __name__ == "__main__": | |
client = MultiRequestConnection("www.cobrain.com") | |
client.fetch() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment