Skip to content

Instantly share code, notes, and snippets.

@jmnwong
Created May 1, 2014 20:07
Show Gist options
  • Save jmnwong/2688fbbe2a62ab5db195 to your computer and use it in GitHub Desktop.
Save jmnwong/2688fbbe2a62ab5db195 to your computer and use it in GitHub Desktop.
Repy GET Request
#begin include httpretrieve.repy
"""
<Program Name>
httpretrieve.repy
<Started>
August 19, 2009
<Authors>
Yafete Yemuru
Conrad Meyer
<Purpose>
Provides a method for retrieving content from web servers using the HTTP
protocol. The content can be accessed as a file like object, or saved to
a file or returned as a string.
"""
#begin include urlparse.repy
"""
<Program Name>
urlparse.repy
<Started>
May 15, 2009
<Author>
Michael Phan-Ba
<Purpose>
Provides utilities for parsing URLs, based on the Python 2.6.1 module urlparse.
"""
def urlparse_urlsplit(urlstring, default_scheme="", allow_fragments=True):
  """
  <Purpose>
    Parse a URL into five components, returning a dictionary. This corresponds
    to the general structure of a URL:
    scheme://netloc/path;parameters?query#fragment. The parameters are not
    split from the URL and individual components are not separated.
    Only absolute server-based URIs are currently supported (all URLs will be
    parsed into the components listed, regardless of the scheme).
  <Arguments>
    urlstring:
      The URL string to parse.
    default_scheme:
      Optional: defaults to the empty string. If specified, gives the default
      addressing scheme, to be used only if the URL does not specify one.
    allow_fragments:
      Optional: defaults to True. If False, fragment identifiers are not
      allowed, even if the URL's addressing scheme normally does support them.
  <Exceptions>
    ValueError on parsing a non-numeric port value.
  <Side Effects>
    None.
  <Returns>
    A dictionary containing:
      Key         Value                               Value if not present
      ============================================================================
      scheme      URL scheme specifier                empty string
      netloc      Network location part               empty string
      path        Hierarchical path                   empty string
      query       Query component                     empty string
      fragment    Fragment identifier                 empty string
      username    User name                           None
      password    Password                            None
      hostname    Host name (lower case)              None
      port        Port number as integer, if present  None
  """
  # Start from the "not present" defaults and fill in what we find.
  components = {"scheme": default_scheme, "netloc": "", "path": "", "query": "",
    "fragment": "", "username": None, "password": None, "hostname": None,
    "port": None }
  # Extract the scheme, if present.
  (lpart, rpart) = _urlparse_splitscheme(urlstring)
  if lpart:
    components["scheme"] = lpart
  # Extract the server information, if present. Authority (user/pass/host/port)
  # only exists when the URL has a "//" netloc section.
  if rpart.startswith("//"):
    (lpart, rpart) = _urlparse_splitnetloc(rpart, 2)
    components["netloc"] = lpart
    # Adds a trailing slash to the URL if no path exists.
    if rpart == "":
      rpart = "/"
    (components["username"], components["password"], components["hostname"],
      components["port"]) = _urlparse_splitauthority(lpart)
  # Extract the fragment (must be split off before the query, since the
  # fragment delimiter "#" follows the query in a URL).
  if allow_fragments:
    (rpart, components["fragment"]) = _urlparse_splitfragment(rpart)
  # Extract the query.
  (components["path"], components["query"]) = _urlparse_splitquery(rpart)
  return components
def _urlparse_splitscheme(url):
"""Parse the scheme portion of the URL"""
# The scheme is valid only if it contains these characters.
scheme_chars = \
"abcdefghijklmnopqrstuvwxyz0123456789+-."
scheme = ""
rest = url
spart = url.split(":", 1)
if len(spart) == 2:
# Normalize the scheme.
spart[0] = spart[0].lower()
# A scheme is valid only if it starts with an alpha character.
if spart[0] and spart[0][0].isalpha():
for char in spart[0]:
if char not in scheme_chars:
break
(scheme, rest) = spart
return scheme, rest
def _urlparse_splitnetloc(url, start=0):
"""Parse the netloc portion of the URL"""
# By default, the netloc is delimited by the end of the URL.
delim = len(url)
# Find the left-most delimiter.
for char in "/?#":
xdelim = url.find(char, start)
if xdelim >= 0:
delim = min(delim, xdelim)
# Return the netloc and the rest of the URL.
return url[start:delim], url[delim:]
def _urlparse_splitauthority(netloc):
"""Parse the authority portion of the netloc"""
# The authority can have a userinfo portion delimited by "@".
authority = netloc.split("@", 1)
# Default values.
username = None
password = None
hostname = None
port = None
# Is there a userinfo portion?
if len(authority) == 2:
# userinfo can be split into username:password
userinfo = authority[0].split(":", 1)
# hostport can be split into hostname:port
hostport = authority[1].split(":", 1)
if userinfo[0]:
username = userinfo[0]
if len(userinfo) == 2:
password = userinfo[1]
# No userinfo portion found.
else:
# hostport can be split into hostname:port
hostport = netloc.split(":", 1)
# Is there a port value?
if hostport[0]:
hostname = hostport[0]
if len(hostport) == 2:
port = int(hostport[1], 10)
# Return the values.
return username, password, hostname, port
def _urlparse_splitquery(url):
"""Parse the query portion of the url"""
qpart = url.split("?", 1)
if len(qpart) == 2:
query = qpart[1]
else:
query = ""
return qpart[0], query
def _urlparse_splitfragment(url):
"""Parse the query portion of the url"""
fpart = url.split("#", 1)
if len(fpart) == 2:
fragment = fpart[1]
else:
fragment = ""
return fpart[0], fragment
#end include urlparse.repy
#begin include sockettimeout.repy
"""
<Author>
Justin Cappos, Armon Dadgar
This is a rewrite of the previous version by Richard Jordan
<Start Date>
26 Aug 2009
<Description>
A library that causes sockets to timeout if a recv / send call would
block for more than an allotted amount of time.
"""
class SocketTimeoutError(Exception):
  """Raised when a socket operation waits longer than its allotted timeout."""
class _timeout_socket():
  """
  <Purpose>
    Provides a socket like object which supports custom timeouts
    for send() and recv(). Wraps a Repy socket; all other operations
    are delegated unchanged.
  """

  # Initialize with the socket object and a default timeout
  def __init__(self,socket,timeout=10, checkintv='fibonacci'):
    """
    <Purpose>
      Initializes a timeout socket object.
    <Arguments>
      socket:
        A socket like object to wrap. Must support send, recv, close, and
        willblock.
      timeout:
        The default timeout for send() and recv(), in seconds.
      checkintv:
        How often socket operations (send, recv) should check if
        they can run. The smaller the interval the more time is
        spent busy waiting. The default string 'fibonacci' selects
        an exponential backoff polling schedule instead of a fixed
        interval (see recv()).
    """
    # Store the socket, timeout and check interval
    self.socket = socket
    self.timeout = timeout
    self.checkintv = checkintv

  # Allow changing the default timeout
  def settimeout(self,timeout=10):
    """
    <Purpose>
      Allows changing the default timeout interval.
    <Arguments>
      timeout:
        The new default timeout interval. Defaults to 10.
        Use 0 for no timeout. Given in seconds.
    """
    # Update
    self.timeout = timeout

  # Wrap willblock
  def willblock(self):
    """
    See socket.willblock()
    """
    return self.socket.willblock()

  # Wrap close
  def close(self):
    """
    See socket.close()
    """
    return self.socket.close()

  # Provide a recv() implementation
  def recv(self,bytes,timeout=None):
    """
    <Purpose>
      Allows receiving data from the socket object with a custom timeout.
    <Arguments>
      bytes:
        The maximum amount of bytes to read
      timeout:
        (Optional) Defaults to the value given at initialization, or by settimeout.
        If provided, the socket operation will timeout after this amount of time (sec).
        Use 0 for no timeout.
    <Exceptions>
      As with socket.recv(), socket.willblock(). Additionally, SocketTimeoutError is
      raised if the operation times out.
    <Returns>
      The data received from the socket.
    """
    # It's worth noting that this fibonacci backoff begins with a 2ms poll rate, and
    # provides a simple exponential backoff scheme.
    fibonacci_backoff = False
    backoff_cap = 100 # Never use more than 100ms poll rate.
    pre_value = 1.0 # Our iterators for Fibonacci sequence.
    pre_pre_value = 1.0 #

    # Since we want to be able to initialize with static poll rates (backwards
    # compatibility) we specify a string if we're using the fibonacci backoff.
    if type(self.checkintv) is str:
      if self.checkintv == 'fibonacci':
        fibonacci_backoff = True

    # Set the timeout if None
    if timeout is None:
      timeout = self.timeout

    # Get the start time
    starttime = getruntime()

    # Busy-wait (with backoff) until the underlying socket reports that a
    # recv() would not block, or until the timeout elapses.
    rblock, wblock = self.socket.willblock()
    while rblock:
      # Check if we should break
      if timeout > 0:
        # Get the elapsed time
        diff = getruntime() - starttime
        # Raise an exception
        if diff > timeout:
          raise SocketTimeoutError,"recv() timed out!"

      if fibonacci_backoff:
        # Iterate the sequence once
        sleep_length = pre_value + pre_pre_value
        pre_pre_value = pre_value
        pre_value = sleep_length

        # Make sure we don't exceed maximum backoff.
        if sleep_length > backoff_cap:
          sleep_length = backoff_cap

        # Unit conversion to seconds
        sleep_length = sleep_length / 1000.0
        # Sleep
        sleep(sleep_length)
      else: # Classic functionality: a fixed, user-supplied poll interval.
        # Sleep; fall back to 100ms if checkintv is not a parseable number.
        try:
          sleep(float(self.checkintv))
        except:
          sleep(0.1)

      # Update rblock
      rblock, wblock = self.socket.willblock()

    # Do the recv
    return self.socket.recv(bytes)

  # Provide a send() implementation
  def send(self,data,timeout=None):
    """
    <Purpose>
      Allows sending data with the socket object with a custom timeout.
    <Arguments>
      data:
        The data to send
      timeout:
        (Optional) Defaults to the value given at initialization, or by settimeout.
        If provided, the socket operation will timeout after this amount of time (sec).
        Use 0 for no timeout.
    <Exceptions>
      As with socket.send(), socket.willblock(). Additionally, SocketTimeoutError is
      raised if the operation times out.
    <Returns>
      The number of bytes sent.
    """
    # Set the timeout if None
    if timeout is None:
      timeout = self.timeout

    # Get the start time
    starttime = getruntime()

    # Block until we can write
    rblock, wblock = self.socket.willblock()
    while wblock:
      # Check if we should break
      if timeout > 0:
        # Get the elapsed time
        diff = getruntime() - starttime
        # Raise an exception
        if diff > timeout:
          raise SocketTimeoutError,"send() timed out!"

      # Sleep
      # Since switching to the fibonacci backoff, the nature of
      # this field has changed. Rather than implement the backoff
      # for checking block status (seems wasteful) we'll just use
      # a constant value. Ten ms seems appropriate.
      sleep(0.010)

      # Update wblock (the comment in the original said "recv"; this is send)
      rblock, wblock = self.socket.willblock()

    # Do the send
    return self.socket.send(data)
def timeout_openconn(desthost, destport, localip=None, localport=None, timeout=5):
  """
  <Purpose>
    Wrapper for openconn that returns a timeout-aware socket.
  <Args>
    Same as Repy openconn.
  <Exception>
    Raises the same exceptions as openconn.
  <Side Effects>
    Creates a socket object for the user.
  <Returns>
    A _timeout_socket wrapping the real connection on success.
  """
  # Open the real connection, then hand back a wrapper whose recv()/send()
  # honor the same timeout value.
  rawsockobj = openconn(desthost, destport, localip, localport, timeout)
  return _timeout_socket(rawsockobj, timeout)
def timeout_waitforconn(localip, localport, function, timeout=5):
  """
  <Purpose>
    Wrapper for waitforconn. Incoming connections are handed to the
    user callback wrapped in a timeout-aware socket.
  <Args>
    Same as Repy waitforconn with the addition of a timeout argument.
  <Exceptions>
    Same as Repy waitforconn.
  <Side Effects>
    Sets up event listener which calls function on connections.
  <Returns>
    Handle to listener.
  """
  def _wrapped_callback(remoteip, remoteport, sockobj, ch, mainch):
    # 'timeout' and 'function' are closed over from the enclosing call, so
    # no handle-to-callback mapping (and its race conditions) is needed.
    timeoutsockobj = _timeout_socket(sockobj, timeout)
    return function(remoteip, remoteport, timeoutsockobj, ch, mainch)

  return waitforconn(localip, localport, _wrapped_callback)
# Thin pass-through so the timeout_* API is complete.
def timeout_stopcomm(commhandle):
  """Wrapper for stopcomm; stops the listener identified by commhandle."""
  return stopcomm(commhandle)
#end include sockettimeout.repy
#begin include urllib.repy
def urllib_quote(inputstring, safestring="/"):
  """
  <Purpose>
    Encode an inputstring such that it can be used safely in a URL or XML
    document.
  <Arguments>
    inputstring:
      The string to urlencode.
    safestring (optional):
      Specifies additional characters that should not be quoted --
      defaults to "/".
  <Exceptions>
    TypeError if the inputstring or safestring parameters aren't strings.
  <Side Effects>
    None.
  <Returns>
    Urlencoded version of the passed string.
  """
  if type(inputstring) is not str:
    raise TypeError("urllib_quote's inputstring parameter must be a string, not '"+str(type(inputstring))+"'")
  if type(safestring) is not str:
    raise TypeError("urllib_quote's safestring parameter must be a string, not '"+str(type(safestring))+"'")

  # Characters that are never quoted: [0-9A-Za-z_.-] plus the caller's
  # safe characters. (Same set the original tested with ord() ranges.)
  passthrough = set("0123456789"
                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                    "abcdefghijklmnopqrstuvwxyz"
                    "_.-") | set(safestring)

  encodedpieces = []
  for character in inputstring:
    if character in passthrough:
      encodedpieces.append(character)
    else:
      # Percent-encode as an uppercase-hex octet.
      encodedpieces.append("%%%02X" % ord(character))
  return "".join(encodedpieces)
def urllib_quote_plus(inputstring, safestring=""):
  """
  <Purpose>
    Encode a string to go in the query fragment of a URL.
  <Arguments>
    inputstring:
      The string to urlencode.
    safestring (optional):
      Specifies additional characters that should not be quoted --
      defaults to the empty string.
  <Exceptions>
    TypeError if the inputstring or safestring parameters aren't strings.
  <Side Effects>
    None.
  <Returns>
    Urlencoded version of the passed string.
  """
  if type(inputstring) is not str:
    raise TypeError("urllib_quote_plus' inputstring parameter must be a string, not '"+str(type(inputstring))+"'")
  if type(safestring) is not str:
    raise TypeError("urllib_quote_plus' safestring parameter must be a string, not '"+str(type(safestring))+"'")
  # Treat the space as "safe" while quoting, then turn every space into the
  # "+" form used in query strings.
  quotedstr = urllib_quote(inputstring, safestring + " ")
  return quotedstr.replace(" ", "+")
def urllib_unquote(inputstring):
  """
  <Purpose>
    Unquote a urlencoded string.
  <Arguments>
    inputstring:
      The string to unquote.
  <Exceptions>
    TypeError if the inputstring isn't a string.
    ValueError if a wrapped octet is malformed (the string contains "%" or
    "%x" rather than "%xx", or the two nibbles aren't valid hex digits).
  <Side Effects>
    None.
  <Returns>
    The decoded string.
  """
  if type(inputstring) is not str:
    raise TypeError("urllib_unquote's inputstring parameter must be a string, not '"+str(type(inputstring))+"'")

  # Scan left-to-right: after splitting on "%", every segment except the
  # first must begin with exactly two hex digits naming one octet.
  segments = inputstring.split("%")
  decodedparts = [segments[0]]
  for segment in segments[1:]:
    octetstr = segment[:2]
    if len(octetstr) != 2:
      raise ValueError("Quoted string is poorly formed")
    # int(..., 16) raises ValueError on non-hex nibbles.
    decodedparts.append(chr(int(octetstr, 16)))
    decodedparts.append(segment[2:])
  return "".join(decodedparts)
def urllib_unquote_plus(inputstring):
  """
  <Purpose>
    Unquote the urlencoded query fragment of a URL.
  <Arguments>
    inputstring:
      The string to unquote.
  <Exceptions>
    TypeError if the inputstring isn't a string.
    ValueError if the string contains a malformed wrapped octet (see
    urllib_unquote).
  <Side Effects>
    None.
  <Returns>
    The decoded string.
  """
  if type(inputstring) is not str:
    raise TypeError("urllib_unquote_plus' inputstring parameter must be a string, not '"+str(type(inputstring))+"'")
  # In query strings "+" encodes a space; restore spaces before unquoting.
  withspacesstr = inputstring.replace("+", " ")
  return urllib_unquote(withspacesstr)
def urllib_quote_parameters(inputdictionary):
  """
  <Purpose>
    Encode a dictionary of (key, value) pairs into an HTTP query string or
    POST body (same form).
  <Arguments>
    inputdictionary:
      The dictionary to quote.
  <Exceptions>
    TypeError if the inputdictionary isn't a dict.
  <Side Effects>
    None.
  <Returns>
    The quoted form, e.g. "key1=val1&key2=val2".
  """
  if type(inputdictionary) is not dict:
    # Bug fix: the original error message evaluated str(type(inputstring)),
    # but no name 'inputstring' exists in this function, so this path raised
    # NameError instead of the documented TypeError.
    raise TypeError("urllib_quote_parameters' inputdictionary parameter must be a dict, not '"+str(type(inputdictionary))+"'")

  quoted_keyvals = []
  for key, val in inputdictionary.items():
    quoted_keyvals.append("%s=%s" % (urllib_quote(key), urllib_quote(val)))
  return "&".join(quoted_keyvals)
def urllib_unquote_parameters(inputstring):
  """
  <Purpose>
    Decode a urlencoded query string or POST body.
  <Arguments>
    inputstring:
      The string to decode.
  <Exceptions>
    TypeError if the inputstring isn't a string.
    ValueError if the inputstring is poorly formed (a pair without exactly
    one "=", or a malformed percent-escape).
  <Side Effects>
    None.
  <Returns>
    A dictionary mapping keys to values.
  """
  if type(inputstring) is not str:
    raise TypeError("urllib_unquote_parameters' inputstring parameter must be a string, not '"+str(type(inputstring))+"'")

  decodeddict = {}
  for quotedpair in inputstring.split("&"):
    # The unpacking raises ValueError unless there is exactly one '='.
    quotedkey, quotedval = quotedpair.split("=")
    decodeddict[urllib_unquote_plus(quotedkey)] = urllib_unquote_plus(quotedval)
  return decodeddict
#end include urllib.repy
class HttpConnectionError(Exception):
  """Raised when the web server unexpectedly drops the connection."""
class HttpBrokenServerError(Exception):
  """Raised when the server's response is not recognizable as HTTP."""
def httpretrieve_open(url, querydata=None, postdata=None,\
    httpheaders=None, proxy=None, timeout=None):
  """
  <Purpose>
    Returns a file-like object that can be used to read the content from
    an HTTP server. Follows 3xx redirects.
  <Arguments>
    url:
      The URL to perform a GET or POST request on.
    postdata (optional):
      A dictionary of form data or a string to POST to the server.
      Passing a non-None value results in a POST request being sent
      to the server.
    querydata (optional):
      A dictionary of form data or a string to send as the query
      string to the server.
      If postdata is omitted, the URL is retrieved with GET. If
      both postdata and querydata are omitted, there is no query
      string sent in the request.
      For both querydata and postdata, strings are sent *unmodified*.
      This means you probably should encode them first, with
      urllib_quote().
    httpheaders (optional):
      A dictionary of supplemental HTTP request headers to add to the
      request.
    proxy (optional):
      A proxy server 2-tuple to bind to: ('host', port).
    timeout (optional):
      A timeout for establishing a connection to the web server,
      sending headers, and reading the response headers.
      If excluded or None, never times out.
  <Exceptions>
    ValueError if given an invalid URL, or malformed limit or timeout
    values. This is also raised if the user attempts to call a method
    on the file-like object after closing it.
    HttpConnectionError if opening the connection fails, or if the
    connection is closed by the server before we expect.
    SocketTimeoutError if the timeout is exceeded.
    HttpBrokenServerError if the response or the Location response header
    is malformed.
  <Side Effects>
    None
  <Returns>
    Returns a file-like object which can be used to read the body of
    the response from the web server. The protocol version spoken by the
    server, status code, and response headers are available as members of
    the object.
  """
  starttimefloat = getruntime()

  # Check if the URL is valid and get host, path, port and query
  parsedurldict = urlparse_urlsplit(url)
  hoststr = parsedurldict['hostname']
  pathstr = parsedurldict['path']
  portint = parsedurldict.get('port')
  # Default to the well-known HTTP port when the URL names none.
  portint = portint or 80
  if parsedurldict['scheme'] != 'http':
    raise ValueError("URL doesn't seem to be for the HTTP protocol.")
  if hoststr is None:
    raise ValueError("Missing hostname.")
  # NOTE(review): this bare string is dead code left as a record of a removed
  # query-string check; it has no runtime effect.
  """
  #removed line---> parsedurldict['query'] is not None
  #because it was broken
  #if parsedurldict['query'] != "":
  #raise ValueError("URL cannot include a query string.")
  """

  # Typical HTTP sessions consist of (optionally, a series of pairs of) HTTP
  # requests followed by HTTP responses. These happen serially.

  # JAC: Set this up so that we can raise the right error if the
  # timeout_openconn doesn't work.
  sockobj = None

  # Open connection to the web server
  try:
    if proxy is not None:
      # if there is a proxy, open a connection with the proxy instead of the
      # actual server, using the timeout we are given (or none)
      sockobj = timeout_openconn(proxy[0], proxy[1], timeout=timeout)
    else:
      # if there is no proxy open a connection with server directly
      sockobj = timeout_openconn(hoststr, portint, timeout=timeout)
  except Exception, e:
    # If a socket object was created, we want to clean it up.
    if sockobj:
      sockobj.close()
    # NOTE(review): detecting a timeout by matching repr(e) is fragile --
    # it relies on the Repy exception's repr starting with "timeout(".
    if repr(e).startswith("timeout("):
      raise HttpConnectionError("Socket timed out connecting to host/port.")
    raise

  try:
    # Builds the HTTP request:
    httprequeststr = _httpretrieve_build_request(hoststr, portint, pathstr, \
        querydata, postdata, httpheaders, proxy)

    # Send the full HTTP request to the web server.
    _httpretrieve_sendall(sockobj, httprequeststr)

    # Now, we're done with the HTTP request part of the session, and we need
    # to get the HTTP response.

    # Check if we've timed out (if the user requested a timeout); update the
    # socket timeout to reflect the time taken sending the request.
    if timeout is None:
      sockobj.settimeout(0)
    elif getruntime() - starttimefloat >= timeout:
      raise SocketTimeoutError("Timed out")
    else:
      sockobj.settimeout(timeout - (getruntime() - starttimefloat))

    # Receive the header lines from the web server (a series of
    # CRLF-terminated lines, terminated by an empty line, or by the server
    # closing the connection).
    headersstr = ""
    while not headersstr.endswith("\r\n\r\n"):
      try:
        # This should probably be replaced with page-sized reads in the
        # future, but for now, the behavior is at least correct.
        headersstr += sockobj.recv(1)
      except Exception, e:
        # NOTE(review): matching on the exception's message text --
        # assumes Repy signals closed sockets with exactly "Socket closed".
        if str(e) == "Socket closed":
          break
        else:
          raise

    httpheaderlist = headersstr.split("\r\n")

    # Ignore (a) trailing blank line(s) (for example, the response header-
    # terminating blank line).
    while len(httpheaderlist) > 0 and httpheaderlist[-1] == "":
      httpheaderlist = httpheaderlist[:-1]

    # Get the status code and status message from the HTTP response.
    statuslinestr, httpheaderlist = httpheaderlist[0], httpheaderlist[1:]

    # The status line should be in the form: "HTTP/1.X NNN SSSSS", where
    # X is 0 or 1, NNN is a 3-digit status code, and SSSSS is a
    # 'user-friendly' string representation of the status code (may contain
    # spaces).
    statuslinelist = statuslinestr.split(' ', 2)
    if len(statuslinelist) < 3:
      raise HttpBrokenServerError("Server returned garbage for HTTP " + \
          "response (status line missing one or more fields).")
    if not statuslinelist[0].startswith('HTTP'):
      raise HttpBrokenServerError("Server returned garbage for HTTP " + \
          "response (invalid response protocol in status line).")
    friendlystatusstr = statuslinelist[2]
    try:
      statusint = int(statuslinelist[1])
    except ValueError, e:
      raise HttpBrokenServerError("Server returned garbage for HTTP " + \
          "response (status code isn't integer).")

    httpheaderdict = _httpretrieve_parse_responseheaders(httpheaderlist)

    # If we got any sort of redirect response, follow the redirect. Note: we
    # do *not* handle the 305 status code (use the proxy as specified in the
    # Location header) at all; I think this is best handled at a higher layer
    # anyway.
    if statusint in (301, 302, 303, 307):
      sockobj.close()
      try:
        redirecturlstr = httpheaderdict["Location"][0]
      except (KeyError, IndexError), ke:
        # When a server returns a redirect status code (3xx) but no Location
        # header, some clients, e.g. Firefox, just show the response body
        # as they would normally for a 2xx or 4xx response. So, I think we
        # should ignore a missing Location header and just return the page
        # to the caller.
        pass
      else:
        # If the server did send a redirect location, let's go there.
        # NOTE(review): the recursive call drops querydata, postdata,
        # httpheaders, proxy and timeout -- the redirected request is a
        # plain GET with defaults. Confirm this is intended.
        return httpretrieve_open(redirecturlstr)

    # If we weren't requested to redirect, and we didn't, return a read-only
    # file-like object (representing the response body) to the caller.
    return _httpretrieve_filelikeobject(sockobj, httpheaderdict, \
        (statuslinelist[0], statusint, friendlystatusstr))
  except:
    # If any exception occured after the socket was open, we want to make
    # sure that the socket is cleaned up if it is still open before we
    # raise the exception.
    if sockobj:
      sockobj.close()
    raise
def httpretrieve_save_file(url, filename, querydata=None, postdata=None, \
    httpheaders=None, proxy=None, timeout=None):
  """
  <Purpose>
    Perform an HTTP request, and save the content of the response to a
    file.
  <Arguments>
    filename:
      The file name to save the response to.
    Other arguments:
      See documentation for httpretrieve_open().
  <Exceptions>
    This function will raise any exception raised by Repy file objects
    in opening, writing to, and closing the file.
    This function will also raise any exception raised by
    httpretrieve_open(), for the same reasons.
  <Side Effects>
    Writes the body of the response to 'filename'.
  <Returns>
    None
  """
  httpobj = None
  # Open the output file object and http file-like object.
  outfileobj = open(filename, 'w')
  try:
    httpobj = httpretrieve_open(url, querydata=querydata, postdata=postdata, \
        httpheaders=httpheaders, proxy=proxy, timeout=timeout)

    # Repeatedly read from the file-like HTTP object into our file, until the
    # response is finished (read() returns '' at end of response).
    responsechunkstr = None
    while responsechunkstr != '':
      responsechunkstr = httpobj.read(4096)
      outfileobj.write(responsechunkstr)
  finally:
    # Bug fix: the original leaked both the file handle and the HTTP
    # connection if opening the URL, reading, or writing raised.
    outfileobj.close()
    if httpobj is not None:
      httpobj.close()
def httpretrieve_get_string(url, querydata=None, postdata=None, \
    httpheaders=None, proxy=None, timeout=30):
  """
  <Purpose>
    Performs an HTTP request on the given URL, using POST or GET,
    returning the content of the response as a string. Uses
    httpretrieve_open.
  <Arguments>
    See httpretrieve_open.
  <Exceptions>
    See httpretrieve_open.
  <Side Effects>
    None.
  <Returns>
    Returns the body of the HTTP response (no headers).
  """
  # Open a read-only file-like object for the HTTP request, read the whole
  # body, and make sure the connection is closed whether or not the read
  # succeeds.
  responseobj = httpretrieve_open(url, querydata=querydata,
      postdata=postdata, httpheaders=httpheaders, proxy=proxy,
      timeout=timeout)
  try:
    return responseobj.read()
  finally:
    responseobj.close()
class _httpretrieve_filelikeobject:
  # This class implements a file-like object used for performing HTTP
  # requests and retrieving responses. Instances are created by
  # httpretrieve_open() after the response headers have been consumed.

  def __init__(self, sock, headers, httpstatus):
    # The socket-like object connected to the HTTP server. Headers have
    # already been read.
    self._sockobj = sock
    # If this is set, the close() method has already been called, so we
    # don't accept future reads.
    self._fileobjclosed = False
    # This flag is set if we've finished recieving the entire response
    # from the server.
    self._totalcontentisreceived = False
    # This integer represents the number of bytes read so far.
    self._totalread = 0
    # This is the dictionary of HTTP response headers associated with this
    # file-like object.
    self.headers = headers
    # The HTTP status tuple of this response, e.g. ("HTTP/1.0", 200, "OK")
    self.httpstatus = httpstatus

  def read(self, limit=None, timeout=None):
    """
    <Purpose>
      Behaves like Python's file.read(), with the potential to raise
      additional informative exceptions.
    <Arguments>
      limit (optional):
        The maximum amount of data to read. If omitted or None, this
        reads all available data.
      timeout (optional):
        Per-read socket timeout in seconds; None means no timeout.
    <Exceptions>
      See file.read()'s documentation, as well as that of
      httpretrieve_open().
    <Side Effects>
      None.
    <Returns>
      See file.read().
    """
    # Raise an error if the caller has already close()d this object.
    if self._fileobjclosed:
      raise ValueError("I/O operation on closed file")

    # If we've finished reading everything we can from the server, return the
    # empty string.
    if self._totalcontentisreceived:
      return ''

    lefttoread = None
    if limit is not None:
      lefttoread = limit
      # Sanity check type/value of limit.
      if type(limit) is not int:
        raise TypeError("Expected an integer or None for read() limit")
      elif limit < 0:
        raise ValueError("Expected a non-negative integer for read() limit")

    # A timeout of None maps to settimeout(0), which this socket wrapper
    # treats as "no timeout".
    if timeout is None:
      self._sockobj.settimeout(0)
    else:
      self._sockobj.settimeout(timeout)

    # Try to read up to limit, or until there is nothing left.
    # NOTE(review): if limit == 0, `lefttoread or 4096` evaluates to 4096,
    # so read(0) actually reads data rather than returning '' -- confirm
    # whether that is intended.
    httpcontentstr = ''
    while True:
      try:
        contentchunkstr = self._sockobj.recv(lefttoread or 4096)
      except Exception, e:
        # The server closing the connection marks the end of the body.
        if str(e) == "Socket closed":
          self._totalcontentisreceived = True
          break
        else:
          raise

      httpcontentstr += contentchunkstr
      self._totalread += len(contentchunkstr)

      if limit is not None:
        if len(contentchunkstr) == lefttoread:
          break
        else:
          lefttoread -= len(contentchunkstr)

      # An empty chunk (without an exception) also means end of body.
      if contentchunkstr == "":
        self._totalcontentisreceived = True
        break

    return httpcontentstr

  def close(self):
    """
    <Purpose>
      Close the file-like object.
    <Arguments>
      None
    <Exceptions>
      None
    <Side Effects>
      Disconnects from the HTTP server.
    <Returns>
      Nothing
    """
    self._fileobjclosed = True
    self._sockobj.close()
def _httpserver_put_in_headerdict(res, lastheader, lastheader_str):
# Helper function that tries to put the header into a dictionary of lists,
# 'res'.
if lastheader is not None:
if lastheader not in res:
res[lastheader] = []
res[lastheader].append(lastheader_str.strip())
def _httpretrieve_parse_responseheaders(headerlines):
# Parse rfc822-style headers (this could be abstracted out to an rfc822
# library that would be quite useful for internet protocols). Returns
# a dictionary mapping headers to arrays of values. E.g.:
#
# Foo: a
# Bar:
# b
# Bar: c
#
# Becomes: {"Foo": ["a"], "Bar": ["b", "c"]}
# These variables represent the key and value of the last header we found,
# unless we are parsing the very first header. E.g., if we've just read:
# Content-Type: text/html
# Then, lastheaderkeystr == "Content-Type",
# lastheadervaluestr == "text/html"
lastheaderkeystr = None
lastheadervaluestr = ""
resdict = {}
if len(headerlines) == 0:
return {}
try:
# Iterate over the request header lines:
for i in range(len(headerlines)):
# Lines with leading non-CRLF whitespace characters are part of the
# previous line (see rfc822 for details).
if headerlines[i][0] in (" ", "\t") and lastheaderkeystr is not None:
lastheadervaluestr += headerlines[i]
else:
_httpserver_put_in_headerdict(resdict, lastheaderkeystr, lastheadervaluestr)
lastheaderkeystr, lastheadervaluestr = headerlines[i].split(":", 1)
# Add the last line to the result dictionary.
_httpserver_put_in_headerdict(resdict, lastheaderkeystr, lastheadervaluestr)
return resdict
except IndexError, idx:
raise HttpBrokenServerError("Server returned garbage for HTTP" + \
" response. Bad headers.")
def _httpretrieve_build_request(host, port, path, querydata, postdata, \
httpheaders, proxy):
# Builds an HTTP request from these parameters, returning it as
# a string.
# Sanity checks:
if path == "":
raise ValueError("Invalid path -- empty string.")
if postdata is not None and type(postdata) not in (str, dict):
raise TypeError("Postdata should be a dict of form-data or a string")
if querydata is not None and type(querydata) not in (str, dict):
raise TypeError("Querydata should be a dict of form-data or a string")
if httpheaders is not None and type(httpheaders) is not dict:
raise TypeError("Expected HTTP headers as a dictionary.")
# Type-conversions:
if type(querydata) is dict:
querydata = urllib_quote_parameters(querydata)
elif querydata is None:
querydata = ""
if type(postdata) is dict:
postdata = urllib_quote_parameters(postdata)
# Default to GET, unless the caller specifies a message body to send.
methodstr = "GET"
if postdata is not None:
methodstr = "POST"
# Encode the path and querystring part of the request.
resourcestr = querydata
if querydata != "":
resourcestr = "?" + resourcestr
# Encode the HTTP request line and headers:
if proxy is not None:
# proxy exists thus the request header should include the original requested url
requeststr = methodstr + ' http://' + host + ':' + str(port) + path + resourcestr + ' HTTP/1.0\r\n'
else:
# there is no proxy; send normal http request
requeststr = methodstr + ' ' + path + resourcestr + ' HTTP/1.0\r\n'
if httpheaders is not None:
# Most servers require a 'Host' header for normal functionality
# (especially in the case of multiple domains being hosted on a
# single server).
if "Host" not in httpheaders:
requeststr += "Host: " + host + ':' + str(port) + "\r\n"
for key, val in httpheaders.items():
requeststr += key + ": " + val + '\r\n'
# Affix post-data related headers and content:
if methodstr == "POST":
requeststr += 'Content-Length: ' + str(len(postdata)) + '\r\n'
# The empty line terminates HTTP headers.
requeststr += '\r\n'
# If we're a POST request, affix any requested data to the message body.
if methodstr == "POST":
requeststr += postdata
return requeststr
def _httpretrieve_sendall(sockobj, datastr):
# Helper function that attempts to dump all of the data in datastr to the
# socket sockobj (data is any arbitrary bytes).
while len(datastr) > 0:
datastr = datastr[sockobj.send(datastr):]
#end include httpretrieve.repy
# Repy program entry point: demonstrate the library by fetching a page
# and saving the body to index.html.
if callfunc == 'initialize':
  # print "bing"
  # result = httpretrieve_get_string("http://www.bing.com")
  # Supplying a headers dict also makes httpretrieve add a Host header
  # (none is sent when httpheaders is omitted).
  headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/html"}
  url = "http://www.bing.com"
  print url
  print urlparse_urlsplit(url)
  # Open the response as a file-like object and dump the body to disk.
  result = httpretrieve_open(url, httpheaders=headers)
  print '----->', result
  resultfile = open("index.html", "w")
  resultfile.write(result.read())
  resultfile.close()
  result.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment