Skip to content

Instantly share code, notes, and snippets.

@jmnwong
Created May 1, 2014 20:07
Show Gist options
  • Save jmnwong/2688fbbe2a62ab5db195 to your computer and use it in GitHub Desktop.
Save jmnwong/2688fbbe2a62ab5db195 to your computer and use it in GitHub Desktop.
Repy GET Request
#begin include httpretrieve.repy
"""
<Program Name>
httpretrieve.repy
<Started>
August 19, 2009
<Authors>
Yafete Yemuru
Conrad Meyer
<Purpose>
Provides a method for retrieving content from web servers using the HTTP
protocol. The content can be accessed as a file like object, or saved to
a file or returned as a string.
"""
#begin include urlparse.repy
"""
<Program Name>
urlparse.repy
<Started>
May 15, 2009
<Author>
Michael Phan-Ba
<Purpose>
Provides utilities for parsing URLs, based on the Python 2.6.1 module urlparse.
"""
def urlparse_urlsplit(urlstring, default_scheme="", allow_fragments=True):
  """
  <Purpose>
    Parse a URL into five components, returning a dictionary. This corresponds
    to the general structure of a URL:
    scheme://netloc/path;parameters?query#fragment. The parameters are not
    split from the URL and individual components are not separated.
    Only absolute server-based URIs are currently supported (all URLs will be
    parsed into the components listed, regardless of the scheme).
  <Arguments>
    urlstring:
      The URL string to parse.
    default_scheme:
      Optional: defaults to the empty string. If specified, gives the default
      addressing scheme, to be used only if the URL does not specify one.
    allow_fragments:
      Optional: defaults to True. If False, fragment identifiers are not
      allowed, even if the URL's addressing scheme normally does support them.
  <Exceptions>
    ValueError on parsing a non-numeric port value.
  <Side Effects>
    None.
  <Returns>
    A dictionary containing:
      Key         Value                               Value if not present
      ============================================================================
      scheme      URL scheme specifier                empty string
      netloc      Network location part               empty string
      path        Hierarchical path                   empty string
      query       Query component                     empty string
      fragment    Fragment identifier                 empty string
      username    User name                           None
      password    Password                            None
      hostname    Host name (lower case)              None
      port        Port number as integer, if present  None
  """
  # Start from the "not present" defaults and fill in what we find.
  components = {"scheme": default_scheme, "netloc": "", "path": "", "query": "",
    "fragment": "", "username": None, "password": None, "hostname": None,
    "port": None }
  # Extract the scheme, if present.
  (lpart, rpart) = _urlparse_splitscheme(urlstring)
  if lpart:
    components["scheme"] = lpart
  # Extract the server information, if present. Authority (user/pass/host/port)
  # only exists when the URL has a "//" netloc section.
  if rpart.startswith("//"):
    (lpart, rpart) = _urlparse_splitnetloc(rpart, 2)
    components["netloc"] = lpart
    # Adds a trailing slash to the URL if no path exists.
    if rpart == "":
      rpart = "/"
    (components["username"], components["password"], components["hostname"],
      components["port"]) = _urlparse_splitauthority(lpart)
  # Extract the fragment (must be split off before the query, since the
  # fragment delimiter "#" follows the query in a URL).
  if allow_fragments:
    (rpart, components["fragment"]) = _urlparse_splitfragment(rpart)
  # Extract the query.
  (components["path"], components["query"]) = _urlparse_splitquery(rpart)
  return components
def _urlparse_splitscheme(url):
"""Parse the scheme portion of the URL"""
# The scheme is valid only if it contains these characters.
scheme_chars = \
"abcdefghijklmnopqrstuvwxyz0123456789+-."
scheme = ""
rest = url
spart = url.split(":", 1)
if len(spart) == 2:
# Normalize the scheme.
spart[0] = spart[0].lower()
# A scheme is valid only if it starts with an alpha character.
if spart[0] and spart[0][0].isalpha():
for char in spart[0]:
if char not in scheme_chars:
break
(scheme, rest) = spart
return scheme, rest
def _urlparse_splitnetloc(url, start=0):
"""Parse the netloc portion of the URL"""
# By default, the netloc is delimited by the end of the URL.
delim = len(url)
# Find the left-most delimiter.
for char in "/?#":
xdelim = url.find(char, start)
if xdelim >= 0:
delim = min(delim, xdelim)
# Return the netloc and the rest of the URL.
return url[start:delim], url[delim:]
def _urlparse_splitauthority(netloc):
"""Parse the authority portion of the netloc"""
# The authority can have a userinfo portion delimited by "@".
authority = netloc.split("@", 1)
# Default values.
username = None
password = None
hostname = None
port = None
# Is there a userinfo portion?
if len(authority) == 2:
# userinfo can be split into username:password
userinfo = authority[0].split(":", 1)
# hostport can be split into hostname:port
hostport = authority[1].split(":", 1)
if userinfo[0]:
username = userinfo[0]
if len(userinfo) == 2:
password = userinfo[1]
# No userinfo portion found.
else:
# hostport can be split into hostname:port
hostport = netloc.split(":", 1)
# Is there a port value?
if hostport[0]:
hostname = hostport[0]
if len(hostport) == 2:
port = int(hostport[1], 10)
# Return the values.
return username, password, hostname, port
def _urlparse_splitquery(url):
"""Parse the query portion of the url"""
qpart = url.split("?", 1)
if len(qpart) == 2:
query = qpart[1]
else:
query = ""
return qpart[0], query
def _urlparse_splitfragment(url):
"""Parse the query portion of the url"""
fpart = url.split("#", 1)
if len(fpart) == 2:
fragment = fpart[1]
else:
fragment = ""
return fpart[0], fragment
#end include urlparse.repy
#begin include sockettimeout.repy
"""
<Author>
Justin Cappos, Armon Dadgar
This is a rewrite of the previous version by Richard Jordan
<Start Date>
26 Aug 2009
<Description>
A library that causes sockets to timeout if a recv / send call would
block for more than an allotted amount of time.
"""
class SocketTimeoutError(Exception):
  """Raised when a socket operation waits longer than its allotted timeout."""
class _timeout_socket():
  """
  <Purpose>
    Provides a socket like object which supports custom timeouts
    for send() and recv(). Wraps a Repy socket; all other operations
    are delegated unchanged.
  """

  # Initialize with the socket object and a default timeout
  def __init__(self,socket,timeout=10, checkintv='fibonacci'):
    """
    <Purpose>
      Initializes a timeout socket object.
    <Arguments>
      socket:
        A socket like object to wrap. Must support send, recv, close, and
        willblock.
      timeout:
        The default timeout for send() and recv(), in seconds.
      checkintv:
        How often socket operations (send, recv) should check if
        they can run. The smaller the interval the more time is
        spent busy waiting. The default string 'fibonacci' selects
        an exponential backoff polling schedule instead of a fixed
        interval (see recv()).
    """
    # Store the socket, timeout and check interval
    self.socket = socket
    self.timeout = timeout
    self.checkintv = checkintv

  # Allow changing the default timeout
  def settimeout(self,timeout=10):
    """
    <Purpose>
      Allows changing the default timeout interval.
    <Arguments>
      timeout:
        The new default timeout interval. Defaults to 10.
        Use 0 for no timeout. Given in seconds.
    """
    # Update
    self.timeout = timeout

  # Wrap willblock
  def willblock(self):
    """
    See socket.willblock()
    """
    return self.socket.willblock()

  # Wrap close
  def close(self):
    """
    See socket.close()
    """
    return self.socket.close()

  # Provide a recv() implementation
  def recv(self,bytes,timeout=None):
    """
    <Purpose>
      Allows receiving data from the socket object with a custom timeout.
    <Arguments>
      bytes:
        The maximum amount of bytes to read
      timeout:
        (Optional) Defaults to the value given at initialization, or by settimeout.
        If provided, the socket operation will timeout after this amount of time (sec).
        Use 0 for no timeout.
    <Exceptions>
      As with socket.recv(), socket.willblock(). Additionally, SocketTimeoutError is
      raised if the operation times out.
    <Returns>
      The data received from the socket.
    """
    # It's worth noting that this fibonacci backoff begins with a 2ms poll rate, and
    # provides a simple exponential backoff scheme.
    fibonacci_backoff = False
    backoff_cap = 100 # Never use more than 100ms poll rate.
    pre_value = 1.0 # Our iterators for Fibonacci sequence.
    pre_pre_value = 1.0 #

    # Since we want to be able to initialize with static poll rates (backwards
    # compatibility) we specify a string if we're using the fibonacci backoff.
    if type(self.checkintv) is str:
      if self.checkintv == 'fibonacci':
        fibonacci_backoff = True

    # Set the timeout if None
    if timeout is None:
      timeout = self.timeout

    # Get the start time
    starttime = getruntime()

    # Busy-wait (with backoff) until the underlying socket reports that a
    # recv() would not block, or until the timeout elapses.
    rblock, wblock = self.socket.willblock()
    while rblock:
      # Check if we should break
      if timeout > 0:
        # Get the elapsed time
        diff = getruntime() - starttime
        # Raise an exception
        if diff > timeout:
          raise SocketTimeoutError,"recv() timed out!"

      if fibonacci_backoff:
        # Iterate the sequence once
        sleep_length = pre_value + pre_pre_value
        pre_pre_value = pre_value
        pre_value = sleep_length

        # Make sure we don't exceed maximum backoff.
        if sleep_length > backoff_cap:
          sleep_length = backoff_cap

        # Unit conversion to seconds
        sleep_length = sleep_length / 1000.0
        # Sleep
        sleep(sleep_length)
      else: # Classic functionality: a fixed, user-supplied poll interval.
        # Sleep; fall back to 100ms if checkintv is not a parseable number.
        try:
          sleep(float(self.checkintv))
        except:
          sleep(0.1)

      # Update rblock
      rblock, wblock = self.socket.willblock()

    # Do the recv
    return self.socket.recv(bytes)

  # Provide a send() implementation
  def send(self,data,timeout=None):
    """
    <Purpose>
      Allows sending data with the socket object with a custom timeout.
    <Arguments>
      data:
        The data to send
      timeout:
        (Optional) Defaults to the value given at initialization, or by settimeout.
        If provided, the socket operation will timeout after this amount of time (sec).
        Use 0 for no timeout.
    <Exceptions>
      As with socket.send(), socket.willblock(). Additionally, SocketTimeoutError is
      raised if the operation times out.
    <Returns>
      The number of bytes sent.
    """
    # Set the timeout if None
    if timeout is None:
      timeout = self.timeout

    # Get the start time
    starttime = getruntime()

    # Block until we can write
    rblock, wblock = self.socket.willblock()
    while wblock:
      # Check if we should break
      if timeout > 0:
        # Get the elapsed time
        diff = getruntime() - starttime
        # Raise an exception
        if diff > timeout:
          raise SocketTimeoutError,"send() timed out!"

      # Sleep
      # Since switching to the fibonacci backoff, the nature of
      # this field has changed. Rather than implement the backoff
      # for checking block status (seems wasteful) we'll just use
      # a constant value. Ten ms seems appropriate.
      sleep(0.010)

      # Update wblock (the comment in the original said "recv"; this is send)
      rblock, wblock = self.socket.willblock()

    # Do the send
    return self.socket.send(data)
def timeout_openconn(desthost, destport, localip=None, localport=None, timeout=5):
  """
  <Purpose>
    Wrapper for openconn that returns a timeout-aware socket.
  <Args>
    Same as Repy openconn.
  <Exception>
    Raises the same exceptions as openconn.
  <Side Effects>
    Creates a socket object for the user.
  <Returns>
    A _timeout_socket wrapping the real connection on success.
  """
  # Open the real connection, then hand back a wrapper whose recv()/send()
  # honor the same timeout value.
  rawsockobj = openconn(desthost, destport, localip, localport, timeout)
  return _timeout_socket(rawsockobj, timeout)
def timeout_waitforconn(localip, localport, function, timeout=5):
  """
  <Purpose>
    Wrapper for waitforconn. Incoming connections are handed to the
    user callback wrapped in a timeout-aware socket.
  <Args>
    Same as Repy waitforconn with the addition of a timeout argument.
  <Exceptions>
    Same as Repy waitforconn.
  <Side Effects>
    Sets up event listener which calls function on connections.
  <Returns>
    Handle to listener.
  """
  def _wrapped_callback(remoteip, remoteport, sockobj, ch, mainch):
    # 'timeout' and 'function' are closed over from the enclosing call, so
    # no handle-to-callback mapping (and its race conditions) is needed.
    timeoutsockobj = _timeout_socket(sockobj, timeout)
    return function(remoteip, remoteport, timeoutsockobj, ch, mainch)

  return waitforconn(localip, localport, _wrapped_callback)
# Thin pass-through so the timeout_* API is complete.
def timeout_stopcomm(commhandle):
  """Wrapper for stopcomm; stops the listener identified by commhandle."""
  return stopcomm(commhandle)
#end include sockettimeout.repy
#begin include urllib.repy
def urllib_quote(inputstring, safestring="/"):
  """
  <Purpose>
    Encode an inputstring such that it can be used safely in a URL or XML
    document.
  <Arguments>
    inputstring:
      The string to urlencode.
    safestring (optional):
      Specifies additional characters that should not be quoted --
      defaults to "/".
  <Exceptions>
    TypeError if the inputstring or safestring parameters aren't strings.
  <Side Effects>
    None.
  <Returns>
    Urlencoded version of the passed string.
  """
  if type(inputstring) is not str:
    raise TypeError("urllib_quote's inputstring parameter must be a string, not '"+str(type(inputstring))+"'")
  if type(safestring) is not str:
    raise TypeError("urllib_quote's safestring parameter must be a string, not '"+str(type(safestring))+"'")

  # Characters that are never quoted: [0-9A-Za-z_.-] plus the caller's
  # safe characters. (Same set the original tested with ord() ranges.)
  passthrough = set("0123456789"
                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                    "abcdefghijklmnopqrstuvwxyz"
                    "_.-") | set(safestring)

  encodedpieces = []
  for character in inputstring:
    if character in passthrough:
      encodedpieces.append(character)
    else:
      # Percent-encode as an uppercase-hex octet.
      encodedpieces.append("%%%02X" % ord(character))
  return "".join(encodedpieces)
def urllib_quote_plus(inputstring, safestring=""):
  """
  <Purpose>
    Encode a string to go in the query fragment of a URL.
  <Arguments>
    inputstring:
      The string to urlencode.
    safestring (optional):
      Specifies additional characters that should not be quoted --
      defaults to the empty string.
  <Exceptions>
    TypeError if the inputstring or safestring parameters aren't strings.
  <Side Effects>
    None.
  <Returns>
    Urlencoded version of the passed string.
  """
  if type(inputstring) is not str:
    raise TypeError("urllib_quote_plus' inputstring parameter must be a string, not '"+str(type(inputstring))+"'")
  if type(safestring) is not str:
    raise TypeError("urllib_quote_plus' safestring parameter must be a string, not '"+str(type(safestring))+"'")
  # Treat the space as "safe" while quoting, then turn every space into the
  # "+" form used in query strings.
  quotedstr = urllib_quote(inputstring, safestring + " ")
  return quotedstr.replace(" ", "+")
def urllib_unquote(inputstring):
  """
  <Purpose>
    Unquote a urlencoded string.
  <Arguments>
    inputstring:
      The string to unquote.
  <Exceptions>
    TypeError if the inputstring isn't a string.
    ValueError if a wrapped octet is malformed (the string contains "%" or
    "%x" rather than "%xx", or the two nibbles aren't valid hex digits).
  <Side Effects>
    None.
  <Returns>
    The decoded string.
  """
  if type(inputstring) is not str:
    raise TypeError("urllib_unquote's inputstring parameter must be a string, not '"+str(type(inputstring))+"'")

  # Scan left-to-right: after splitting on "%", every segment except the
  # first must begin with exactly two hex digits naming one octet.
  segments = inputstring.split("%")
  decodedparts = [segments[0]]
  for segment in segments[1:]:
    octetstr = segment[:2]
    if len(octetstr) != 2:
      raise ValueError("Quoted string is poorly formed")
    # int(..., 16) raises ValueError on non-hex nibbles.
    decodedparts.append(chr(int(octetstr, 16)))
    decodedparts.append(segment[2:])
  return "".join(decodedparts)
def urllib_unquote_plus(inputstring):
  """
  <Purpose>
    Unquote the urlencoded query fragment of a URL.
  <Arguments>
    inputstring:
      The string to unquote.
  <Exceptions>
    TypeError if the inputstring isn't a string.
    ValueError if the string contains a malformed wrapped octet (see
    urllib_unquote).
  <Side Effects>
    None.
  <Returns>
    The decoded string.
  """
  if type(inputstring) is not str:
    raise TypeError("urllib_unquote_plus' inputstring parameter must be a string, not '"+str(type(inputstring))+"'")
  # In query strings "+" encodes a space; restore spaces before unquoting.
  withspacesstr = inputstring.replace("+", " ")
  return urllib_unquote(withspacesstr)
def urllib_quote_parameters(inputdictionary):
  """
  <Purpose>
    Encode a dictionary of (key, value) pairs into an HTTP query string or
    POST body (same form).
  <Arguments>
    inputdictionary:
      The dictionary to quote.
  <Exceptions>
    TypeError if the inputdictionary isn't a dict.
  <Side Effects>
    None.
  <Returns>
    The quoted form, e.g. "key1=val1&key2=val2".
  """
  if type(inputdictionary) is not dict:
    # Bug fix: the original error message evaluated str(type(inputstring)),
    # but no name 'inputstring' exists in this function, so this path raised
    # NameError instead of the documented TypeError.
    raise TypeError("urllib_quote_parameters' inputdictionary parameter must be a dict, not '"+str(type(inputdictionary))+"'")

  quoted_keyvals = []
  for key, val in inputdictionary.items():
    quoted_keyvals.append("%s=%s" % (urllib_quote(key), urllib_quote(val)))
  return "&".join(quoted_keyvals)
def urllib_unquote_parameters(inputstring):
  """
  <Purpose>
    Decode a urlencoded query string or POST body.
  <Arguments>
    inputstring:
      The string to decode.
  <Exceptions>
    TypeError if the inputstring isn't a string.
    ValueError if the inputstring is poorly formed (a pair without exactly
    one "=", or a malformed percent-escape).
  <Side Effects>
    None.
  <Returns>
    A dictionary mapping keys to values.
  """
  if type(inputstring) is not str:
    raise TypeError("urllib_unquote_parameters' inputstring parameter must be a string, not '"+str(type(inputstring))+"'")

  decodeddict = {}
  for quotedpair in inputstring.split("&"):
    # The unpacking raises ValueError unless there is exactly one '='.
    quotedkey, quotedval = quotedpair.split("=")
    decodeddict[urllib_unquote_plus(quotedkey)] = urllib_unquote_plus(quotedval)
  return decodeddict
#end include urllib.repy
class HttpConnectionError(Exception):
  """Raised when the web server unexpectedly drops the connection."""
class HttpBrokenServerError(Exception):
  """Raised when the server's response is not recognizable as HTTP."""
def httpretrieve_open(url, querydata=None, postdata=None,\
    httpheaders=None, proxy=None, timeout=None):
  """
  <Purpose>
    Returns a file-like object that can be used to read the content from
    an HTTP server. Follows 3xx redirects.
  <Arguments>
    url:
      The URL to perform a GET or POST request on.
    postdata (optional):
      A dictionary of form data or a string to POST to the server.
      Passing a non-None value results in a POST request being sent
      to the server.
    querydata (optional):
      A dictionary of form data or a string to send as the query
      string to the server.
      If postdata is omitted, the URL is retrieved with GET. If
      both postdata and querydata are omitted, there is no query
      string sent in the request.
      For both querydata and postdata, strings are sent *unmodified*.
      This means you probably should encode them first, with
      urllib_quote().
    httpheaders (optional):
      A dictionary of supplemental HTTP request headers to add to the
      request.
    proxy (optional):
      A proxy server 2-tuple to bind to: ('host', port).
    timeout (optional):
      A timeout for establishing a connection to the web server,
      sending headers, and reading the response headers.
      If excluded or None, never times out.
  <Exceptions>
    ValueError if given an invalid URL, or malformed limit or timeout
    values. This is also raised if the user attempts to call a method
    on the file-like object after closing it.
    HttpConnectionError if opening the connection fails, or if the
    connection is closed by the server before we expect.
    SocketTimeoutError if the timeout is exceeded.
    HttpBrokenServerError if the response or the Location response header
    is malformed.
  <Side Effects>
    None
  <Returns>
    Returns a file-like object which can be used to read the body of
    the response from the web server. The protocol version spoken by the
    server, status code, and response headers are available as members of
    the object.
  """
  starttimefloat = getruntime()

  # Check if the URL is valid and get host, path, port and query
  parsedurldict = urlparse_urlsplit(url)
  hoststr = parsedurldict['hostname']
  pathstr = parsedurldict['path']
  portint = parsedurldict.get('port')
  # Default to the well-known HTTP port when the URL names none.
  portint = portint or 80
  if parsedurldict['scheme'] != 'http':
    raise ValueError("URL doesn't seem to be for the HTTP protocol.")
  if hoststr is None:
    raise ValueError("Missing hostname.")
  # NOTE(review): this bare string is dead code left as a record of a removed
  # query-string check; it has no runtime effect.
  """
  #removed line---> parsedurldict['query'] is not None
  #because it was broken
  #if parsedurldict['query'] != "":
  #raise ValueError("URL cannot include a query string.")
  """

  # Typical HTTP sessions consist of (optionally, a series of pairs of) HTTP
  # requests followed by HTTP responses. These happen serially.

  # JAC: Set this up so that we can raise the right error if the
  # timeout_openconn doesn't work.
  sockobj = None

  # Open connection to the web server
  try:
    if proxy is not None:
      # if there is a proxy, open a connection with the proxy instead of the
      # actual server, using the timeout we are given (or none)
      sockobj = timeout_openconn(proxy[0], proxy[1], timeout=timeout)
    else:
      # if there is no proxy open a connection with server directly
      sockobj = timeout_openconn(hoststr, portint, timeout=timeout)
  except Exception, e:
    # If a socket object was created, we want to clean it up.
    if sockobj:
      sockobj.close()
    # NOTE(review): detecting a timeout by matching repr(e) is fragile --
    # it relies on the Repy exception's repr starting with "timeout(".
    if repr(e).startswith("timeout("):
      raise HttpConnectionError("Socket timed out connecting to host/port.")
    raise

  try:
    # Builds the HTTP request:
    httprequeststr = _httpretrieve_build_request(hoststr, portint, pathstr, \
        querydata, postdata, httpheaders, proxy)

    # Send the full HTTP request to the web server.
    _httpretrieve_sendall(sockobj, httprequeststr)

    # Now, we're done with the HTTP request part of the session, and we need
    # to get the HTTP response.

    # Check if we've timed out (if the user requested a timeout); update the
    # socket timeout to reflect the time taken sending the request.
    if timeout is None:
      sockobj.settimeout(0)
    elif getruntime() - starttimefloat >= timeout:
      raise SocketTimeoutError("Timed out")
    else:
      sockobj.settimeout(timeout - (getruntime() - starttimefloat))

    # Receive the header lines from the web server (a series of
    # CRLF-terminated lines, terminated by an empty line, or by the server
    # closing the connection).
    headersstr = ""
    while not headersstr.endswith("\r\n\r\n"):
      try:
        # This should probably be replaced with page-sized reads in the
        # future, but for now, the behavior is at least correct.
        headersstr += sockobj.recv(1)
      except Exception, e:
        # NOTE(review): matching on the exception's message text --
        # assumes Repy signals closed sockets with exactly "Socket closed".
        if str(e) == "Socket closed":
          break
        else:
          raise

    httpheaderlist = headersstr.split("\r\n")

    # Ignore (a) trailing blank line(s) (for example, the response header-
    # terminating blank line).
    while len(httpheaderlist) > 0 and httpheaderlist[-1] == "":
      httpheaderlist = httpheaderlist[:-1]

    # Get the status code and status message from the HTTP response.
    statuslinestr, httpheaderlist = httpheaderlist[0], httpheaderlist[1:]

    # The status line should be in the form: "HTTP/1.X NNN SSSSS", where
    # X is 0 or 1, NNN is a 3-digit status code, and SSSSS is a
    # 'user-friendly' string representation of the status code (may contain
    # spaces).
    statuslinelist = statuslinestr.split(' ', 2)
    if len(statuslinelist) < 3:
      raise HttpBrokenServerError("Server returned garbage for HTTP " + \
          "response (status line missing one or more fields).")
    if not statuslinelist[0].startswith('HTTP'):
      raise HttpBrokenServerError("Server returned garbage for HTTP " + \
          "response (invalid response protocol in status line).")
    friendlystatusstr = statuslinelist[2]
    try:
      statusint = int(statuslinelist[1])
    except ValueError, e:
      raise HttpBrokenServerError("Server returned garbage for HTTP " + \
          "response (status code isn't integer).")

    httpheaderdict = _httpretrieve_parse_responseheaders(httpheaderlist)

    # If we got any sort of redirect response, follow the redirect. Note: we
    # do *not* handle the 305 status code (use the proxy as specified in the
    # Location header) at all; I think this is best handled at a higher layer
    # anyway.
    if statusint in (301, 302, 303, 307):
      sockobj.close()
      try:
        redirecturlstr = httpheaderdict["Location"][0]
      except (KeyError, IndexError), ke:
        # When a server returns a redirect status code (3xx) but no Location
        # header, some clients, e.g. Firefox, just show the response body
        # as they would normally for a 2xx or 4xx response. So, I think we
        # should ignore a missing Location header and just return the page
        # to the caller.
        pass
      else:
        # If the server did send a redirect location, let's go there.
        # NOTE(review): the recursive call drops querydata, postdata,
        # httpheaders, proxy and timeout -- the redirected request is a
        # plain GET with defaults. Confirm this is intended.
        return httpretrieve_open(redirecturlstr)

    # If we weren't requested to redirect, and we didn't, return a read-only
    # file-like object (representing the response body) to the caller.
    return _httpretrieve_filelikeobject(sockobj, httpheaderdict, \
        (statuslinelist[0], statusint, friendlystatusstr))
  except:
    # If any exception occured after the socket was open, we want to make
    # sure that the socket is cleaned up if it is still open before we
    # raise the exception.
    if sockobj:
      sockobj.close()
    raise
def httpretrieve_save_file(url, filename, querydata=None, postdata=None, \
    httpheaders=None, proxy=None, timeout=None):
  """
  <Purpose>
    Perform an HTTP request, and save the content of the response to a
    file.
  <Arguments>
    filename:
      The file name to save the response to.
    Other arguments:
      See documentation for httpretrieve_open().
  <Exceptions>
    This function will raise any exception raised by Repy file objects
    in opening, writing to, and closing the file.
    This function will also raise any exception raised by
    httpretrieve_open(), for the same reasons.
  <Side Effects>
    Writes the body of the response to 'filename'.
  <Returns>
    None
  """
  httpobj = None
  # Open the output file object and http file-like object.
  outfileobj = open(filename, 'w')
  try:
    httpobj = httpretrieve_open(url, querydata=querydata, postdata=postdata, \
        httpheaders=httpheaders, proxy=proxy, timeout=timeout)

    # Repeatedly read from the file-like HTTP object into our file, until the
    # response is finished (read() returns '' at end of response).
    responsechunkstr = None
    while responsechunkstr != '':
      responsechunkstr = httpobj.read(4096)
      outfileobj.write(responsechunkstr)
  finally:
    # Bug fix: the original leaked both the file handle and the HTTP
    # connection if opening the URL, reading, or writing raised.
    outfileobj.close()
    if httpobj is not None:
      httpobj.close()
def httpretrieve_get_string(url, querydata=None, postdata=None, \
    httpheaders=None, proxy=None, timeout=30):
  """
  <Purpose>
    Performs an HTTP request on the given URL, using POST or GET,
    returning the content of the response as a string. Uses
    httpretrieve_open.
  <Arguments>
    See httpretrieve_open.
  <Exceptions>
    See httpretrieve_open.
  <Side Effects>
    None.
  <Returns>
    Returns the body of the HTTP response (no headers).
  """
  # Open a read-only file-like object for the HTTP request, read the whole
  # body, and make sure the connection is closed whether or not the read
  # succeeds.
  responseobj = httpretrieve_open(url, querydata=querydata,
      postdata=postdata, httpheaders=httpheaders, proxy=proxy,
      timeout=timeout)
  try:
    return responseobj.read()
  finally:
    responseobj.close()
class _httpretrieve_filelikeobject:
  # This class implements a file-like object used for performing HTTP
  # requests and retrieving responses. Instances are created by
  # httpretrieve_open() after the response headers have been consumed.

  def __init__(self, sock, headers, httpstatus):
    # The socket-like object connected to the HTTP server. Headers have
    # already been read.
    self._sockobj = sock
    # If this is set, the close() method has already been called, so we
    # don't accept future reads.
    self._fileobjclosed = False
    # This flag is set if we've finished recieving the entire response
    # from the server.
    self._totalcontentisreceived = False
    # This integer represents the number of bytes read so far.
    self._totalread = 0
    # This is the dictionary of HTTP response headers associated with this
    # file-like object.
    self.headers = headers
    # The HTTP status tuple of this response, e.g. ("HTTP/1.0", 200, "OK")
    self.httpstatus = httpstatus

  def read(self, limit=None, timeout=None):
    """
    <Purpose>
      Behaves like Python's file.read(), with the potential to raise
      additional informative exceptions.
    <Arguments>
      limit (optional):
        The maximum amount of data to read. If omitted or None, this
        reads all available data.
      timeout (optional):
        Per-read socket timeout in seconds; None means no timeout.
    <Exceptions>
      See file.read()'s documentation, as well as that of
      httpretrieve_open().
    <Side Effects>
      None.
    <Returns>
      See file.read().
    """
    # Raise an error if the caller has already close()d this object.
    if self._fileobjclosed:
      raise ValueError("I/O operation on closed file")

    # If we've finished reading everything we can from the server, return the
    # empty string.
    if self._totalcontentisreceived:
      return ''

    lefttoread = None
    if limit is not None:
      lefttoread = limit
      # Sanity check type/value of limit.
      if type(limit) is not int:
        raise TypeError("Expected an integer or None for read() limit")
      elif limit < 0:
        raise ValueError("Expected a non-negative integer for read() limit")

    # A timeout of None maps to settimeout(0), which this socket wrapper
    # treats as "no timeout".
    if timeout is None:
      self._sockobj.settimeout(0)
    else:
      self._sockobj.settimeout(timeout)

    # Try to read up to limit, or until there is nothing left.
    # NOTE(review): if limit == 0, `lefttoread or 4096` evaluates to 4096,
    # so read(0) actually reads data rather than returning '' -- confirm
    # whether that is intended.
    httpcontentstr = ''
    while True:
      try:
        contentchunkstr = self._sockobj.recv(lefttoread or 4096)
      except Exception, e:
        # The server closing the connection marks the end of the body.
        if str(e) == "Socket closed":
          self._totalcontentisreceived = True
          break
        else:
          raise

      httpcontentstr += contentchunkstr
      self._totalread += len(contentchunkstr)

      if limit is not None:
        if len(contentchunkstr) == lefttoread:
          break
        else:
          lefttoread -= len(contentchunkstr)

      # An empty chunk (without an exception) also means end of body.
      if contentchunkstr == "":
        self._totalcontentisreceived = True
        break

    return httpcontentstr

  def close(self):
    """
    <Purpose>
      Close the file-like object.
    <Arguments>
      None
    <Exceptions>
      None
    <Side Effects>
      Disconnects from the HTTP server.
    <Returns>
      Nothing
    """
    self._fileobjclosed = True
    self._sockobj.close()
def _httpserver_put_in_headerdict(res, lastheader, lastheader_str):
# Helper function that tries to put the header into a dictionary of lists,
# 'res'.
if lastheader is not None:
if lastheader not in res:
res[lastheader] = []
res[lastheader].append(lastheader_str.strip())
def _httpretrieve_parse_responseheaders(headerlines):
# Parse rfc822-style headers (this could be abstracted out to an rfc822
# library that would be quite useful for internet protocols). Returns
# a dictionary mapping headers to arrays of values. E.g.:
#
# Foo: a
# Bar:
# b
# Bar: c
#
# Becomes: {"Foo": ["a"], "Bar": ["b", "c"]}
# These variables represent the key and value of the last header we found,
# unless we are parsing the very first header. E.g., if we've just read:
# Content-Type: text/html
# Then, lastheaderkeystr == "Content-Type",
# lastheadervaluestr == "text/html"
lastheaderkeystr = None
lastheadervaluestr = ""
resdict = {}
if len(headerlines) == 0:
return {}
try:
# Iterate over the request header lines:
for i in range(len(headerlines)):
# Lines with leading non-CRLF whitespace characters are part of the
# previous line (see rfc822 for details).
if headerlines[i][0] in (" ", "\t") and lastheaderkeystr is not None:
lastheadervaluestr += headerlines[i]
else:
_httpserver_put_in_headerdict(resdict, lastheaderkeystr, lastheadervaluestr)
lastheaderkeystr, lastheadervaluestr = headerlines[i].split(":", 1)
# Add the last line to the result dictionary.
_httpserver_put_in_headerdict(resdict, lastheaderkeystr, lastheadervaluestr)
return resdict
except IndexError, idx:
raise HttpBrokenServerError("Server returned garbage for HTTP" + \
" response. Bad headers.")
def _httpretrieve_build_request(host, port, path, querydata, postdata, \
httpheaders, proxy):
# Builds an HTTP request from these parameters, returning it as
# a string.
# Sanity checks:
if path == "":
raise ValueError("Invalid path -- empty string.")
if postdata is not None and type(postdata) not in (str, dict):
raise TypeError("Postdata should be a dict of form-data or a string")
if querydata is not None and type(querydata) not in (str, dict):
raise TypeError("Querydata should be a dict of form-data or a string")
if httpheaders is not None and type(httpheaders) is not dict:
raise TypeError("Expected HTTP headers as a dictionary.")
# Type-conversions:
if type(querydata) is dict:
querydata = urllib_quote_parameters(querydata)
elif querydata is None:
querydata = ""
if type(postdata) is dict:
postdata = urllib_quote_parameters(postdata)
# Default to GET, unless the caller specifies a message body to send.
methodstr = "GET"
if postdata is not None:
methodstr = "POST"
# Encode the path and querystring part of the request.
resourcestr = querydata
if querydata != "":
resourcestr = "?" + resourcestr
# Encode the HTTP request line and headers:
if proxy is not None:
# proxy exists thus the request header should include the original requested url
requeststr = methodstr + ' http://' + host + ':' + str(port) + path + resourcestr + ' HTTP/1.0\r\n'
else:
# there is no proxy; send normal http request
requeststr = methodstr + ' ' + path + resourcestr + ' HTTP/1.0\r\n'
if httpheaders is not None:
# Most servers require a 'Host' header for normal functionality
# (especially in the case of multiple domains being hosted on a
# single server).
if "Host" not in httpheaders:
requeststr += "Host: " + host + ':' + str(port) + "\r\n"
for key, val in httpheaders.items():
requeststr += key + ": " + val + '\r\n'
# Affix post-data related headers and content:
if methodstr == "POST":
requeststr += 'Content-Length: ' + str(len(postdata)) + '\r\n'
# The empty line terminates HTTP headers.
requeststr += '\r\n'
# If we're a POST request, affix any requested data to the message body.
if methodstr == "POST":
requeststr += postdata
return requeststr
def _httpretrieve_sendall(sockobj, datastr):
# Helper function that attempts to dump all of the data in datastr to the
# socket sockobj (data is any arbitrary bytes).
while len(datastr) > 0:
datastr = datastr[sockobj.send(datastr):]
#end include httpretrieve.repy
# Repy program entry point: demonstrate the library by fetching a page
# and saving the body to index.html.
if callfunc == 'initialize':
  # print "bing"
  # result = httpretrieve_get_string("http://www.bing.com")
  # Supplying a headers dict also makes httpretrieve add a Host header
  # (none is sent when httpheaders is omitted).
  headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/html"}
  url = "http://www.bing.com"
  print url
  print urlparse_urlsplit(url)
  # Open the response as a file-like object and dump the body to disk.
  result = httpretrieve_open(url, httpheaders=headers)
  print '----->', result
  resultfile = open("index.html", "w")
  resultfile.write(result.read())
  resultfile.close()
  result.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment