Skip to content

Instantly share code, notes, and snippets.

@bbengfort
Last active December 19, 2015 02:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bbengfort/5885901 to your computer and use it in GitHub Desktop.
Save bbengfort/5885901 to your computer and use it in GitHub Desktop.
An example showing how to fetch multiple links from the same host with the same, open httplib.HTTPConnection object. The real question is what happens when you send a Connection:Close header.
#!/usr/bin/env python
# multiconn.py
#
# Author: Benjamin Bengfort <benjamin@bengfort.com>
# $ID: multiconn.py [1] benjamin@bengfort.com $
"""
An example showing how to fetch multiple links from the same host with the
same, open httplib.HTTPConnection object. This is the basis for an attempt
at concurrent page fetching. The real question is what happens when you
send a "Connection: Close" header along with the request.
"""
###########################################################################
## Imports
###########################################################################
from bs4 import BeautifulSoup
from datetime import datetime
from httplib import HTTPConnection
from urlparse import urlparse, urlunparse
###########################################################################
## The HTTP Client that can make multiple requests
###########################################################################
class MultiRequestConnection(object):
    """
    Issues several GET requests to one host over a single connection object.

    Instantiate this class with the netloc (in urlparse terms) of the host,
    then open and close the connection with `connect` and `close`
    respectively. An open connection can issue `GET` requests with that
    method, or you can use `fetch` to get a page plus all the first level
    links found on it that point back at the same host.
    """

    def __init__(self, host):
        """
        Stores the host; no connection object is created until `connect`.
        """
        self.conn = None
        self.host = host

    def connect(self):
        """
        Creates the HTTPConnection for the host and reports it on stdout.
        """
        self.conn = HTTPConnection(self.host)
        print("Connection to %s opened" % self.host)

    def close(self):
        """
        Closes the connection to the host and resets state.

        Calling this while not connected is a harmless no-op, so it is
        always safe to use in cleanup code.
        """
        if self.conn is None:
            return
        # Drop our reference first so state is reset even if close() raises.
        conn, self.conn = self.conn, None
        conn.close()
        print("Connection to %s closed" % self.host)

    def GET(self, path, body='', headers=None):
        """
        Issue a GET request to the path with the body and headers.

        Logs the response status line and returns the response body as
        read from the connection. Raises Exception("Host not connected")
        if `connect` has not been called.
        """
        if self.conn is None:
            raise Exception("Host not connected")
        # A fresh dict per call avoids sharing one mutable default object.
        if headers is None:
            headers = {}
        self.conn.request("GET", path, body, headers)
        response = self.conn.getresponse()
        self.log(path, 'GET', response)
        # The body is read in full before returning so that the connection
        # is ready for the next request.
        return response.read()

    def log(self, path, method, response):
        """
        Prints one line per request, kind of like Apache's Common Log
        format: host, timestamp, request line, status code and reason.
        """
        fields = {
            'host': self.host,
            # NOTE: datetime.now() is naive, so %z renders as an empty
            # string and the timestamp ends with a trailing space.
            'time': datetime.now().strftime("%d/%b/%Y:%H:%M:%S %z"),
            'method': method,
            'path': path,
            'version': 'HTTP/1.1',
            'status': response.status,
            'reason': response.reason,
        }
        print('%(host)s [%(time)s] "%(method)s %(path)s %(version)s" %(status)s %(reason)s' % fields)

    def _same_host_path(self, href):
        """
        Reduces an href to a request path on this host, or returns None.

        None is returned when the anchor had no href at all, when the href
        cannot be parsed, when it uses a non-HTTP scheme (mailto:,
        javascript:, tel:, ...), or when it names a different netloc. The
        fragment is dropped and an empty path becomes '/'.
        """
        if href is None:
            return None
        try:
            parts = urlparse(href)
        except ValueError:
            # Malformed URL; skip it rather than abort the whole crawl.
            return None
        # Scheme-only links have an empty netloc too, so they must be
        # rejected explicitly or they would look like relative links.
        if parts.scheme not in ('', 'http', 'https'):
            return None
        if parts.netloc not in (self.host, ''):
            return None
        return urlunparse(('', '', parts.path or '/', parts.params, parts.query, ''))

    def links_from_response(self, response):
        """
        Yields the paths of all the first level links in an html document,
        where the links are part of the same host as the open connection
        (e.g. relative links or links that start with the same netloc).
        Anchors without an href and non-HTTP links are skipped.
        """
        soup = BeautifulSoup(response)
        for link in soup.find_all('a'):
            path = self._same_host_path(link.get('href'))
            if path is not None:
                yield path

    def fetch(self, path='/'):
        """
        Performs a GET request on the given path, parses out all the first
        level links in the same host, and then fetches those as well.
        Connects first if needed and always closes the connection when
        done, even if a request or the parsing fails. Nothing is returned,
        but this can easily be overridden in subclasses.
        """
        if self.conn is None:
            self.connect()
        try:
            response = self.GET(path)
            for link in self.links_from_response(response):
                self.GET(link)
        finally:
            self.close()
###########################################################################
## Testing in the Main Method
###########################################################################
if __name__ == "__main__":
    # Demo run: fetch "/" on the example host, then every same-host link
    # found on that page, all through one connection object.
    MultiRequestConnection("www.cobrain.com").fetch()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment