Skip to content

Instantly share code, notes, and snippets.

@costastf
Created April 14, 2018 11:05
Show Gist options
  • Save costastf/39fd2c56ba116148d5e837833c4b27d3 to your computer and use it in GitHub Desktop.
Save costastf/39fd2c56ba116148d5e837833c4b27d3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2.7
# -*- coding: UTF-8 -*-
# File: urlutils.py
#
__author__ = 'Costas Tyfoxylos <costas.tyf@gmail.com>'
__docformat__ = 'plaintext'
__date__ = '2015-01-18'
import logging
# This is the main prefix (base name) used for logging in this module.
logger_basename = 'urlutils'
# Module-level logger registered under the base name above.
logger = logging.getLogger(logger_basename)
from urllib import quote
import grequests
# NOTE: for this module to work, a patched version of grequests is required.
# Install it with:
# sudo pip install https://github.com/agsimeonov/grequests/archive/master.zip
class DummyResponse(object):
    """Placeholder standing in for a response when a request raised.

    It carries the attributes that UrlObject.injest_response reads
    (status_code, elapsed, headers, text) so that a failed request can be
    ingested through the same code path as a real response.
    """

    class Elapsed(object):
        """Minimal substitute for the ``elapsed`` attribute of a response."""

        def __init__(self):
            pass

        @staticmethod
        def total_seconds():
            """Return the placeholder text, since no timing is available."""
            return 'Not applicable'

    def __init__(self, response, exception):
        """Build the placeholder.

        :param response: object exposing a ``url`` attribute. Parallelizer
            passes the failed request object here.
        :param exception: the error that was raised; its string form is
            stored as ``status_code``.
        """
        # Data that comes from the arguments.
        self.url = response.url
        self.status_code = str(exception)
        # Fixed placeholders for everything a failed request cannot provide.
        self.text = self.headers = 'Not applicable'
        self.elapsed = self.Elapsed()
        self.status = False
class UrlObject(object):
    """A url to fetch, its optional credentials and the outcome of the fetch."""

    def __init__(self, connection_information):
        """Read the connection settings and blank out the response fields.

        :param connection_information: mapping with a mandatory 'url' key
            and optional 'auth', 'user' and 'pass' keys.
        """
        settings = connection_information
        self.url = settings['url']
        self.auth = settings.get('auth')
        self.username = settings.get('user')
        self.password = settings.get('pass')
        # Every response related attribute holds an empty string until a
        # response gets ingested.
        for attribute in ('httpResponse',
                          'responseTime',
                          'responseHeaders',
                          'responseBody',
                          'response'):
            setattr(self, attribute, '')

    def injest_response(self, response):
        """Copy the interesting parts of a response onto this object.

        :param response: object exposing ``status_code``, ``headers``,
            ``text`` and an ``elapsed`` attribute with ``total_seconds()``.

        Calls validate_response() once all attributes are populated.
        """
        self.response = response
        self.httpResponse = response.status_code
        elapsed = response.elapsed
        self.responseTime = elapsed.total_seconds()
        self.responseHeaders = response.headers
        body = response.text
        # The body is stored utf-8 encoded.
        self.responseBody = body.encode('utf-8')
        self.validate_response()

    def validate_response(self):
        """Do nothing; called at the end of injest_response()."""
        return None
class Parallelizer(object):
    """Fetch the urls of a collection of resources through grequests.

    Every resource must expose ``url``, ``auth``, ``username`` and
    ``password`` attributes plus an ``injest_response`` method.
    """

    def __init__(self, resource_list):
        """Store the resources to fetch.

        :param resource_list: the resources to fetch. fetch() iterates it
            twice, so it has to be re-iterable (e.g. a list), not a
            one-shot generator.
        """
        self.resourceList = resource_list

    @staticmethod
    def _exception_handler(request, exception):
        """Turn a failed request into a DummyResponse.

        :param request: the request object that failed.
        :param exception: the error raised while performing the request.
        :return: a DummyResponse built from the two arguments.
        """
        return DummyResponse(request, exception)

    @staticmethod
    def _authentication_hack(url, username, password):
        """Embed the credentials in the url as ``user:password@``.

        :param url: url of the form ``<protocol>//<rest>``.
        :param username: user name, inserted verbatim (not escaped).
        :param password: password, percent-quoted with no safe characters.
        :return: the url with the credentials placed before the host part.
        :raises ValueError: if the url contains no ``//`` separator.
        """
        # Split only on the first '//' so that a path which itself contains
        # '//' (e.g. http://host/a//b) does not break the unpacking.
        protocol, path = url.split('//', 1)
        # we quote the password through the urllib quote
        # so we escape any invalid url characters
        return '{protocol}//{username}:{password}@{path}'.format(
            protocol=protocol,
            username=username,
            password=quote(password, ''),
            path=path)

    def fetch(self):
        """Request every resource url and feed each response to its resource.

        Resources with a truthy ``auth`` get their credentials embedded in
        the url; the others are requested as they are. Responses are paired
        with the resources by position.

        :return: the resource list that was given to the constructor.
        """
        urls = []
        for resource in self.resourceList:
            if resource.auth:
                urls.append(self._authentication_hack(resource.url,
                                                      resource.username,
                                                      resource.password))
            else:
                urls.append(resource.url)
        pending = (grequests.get(url) for url in urls)
        responses = grequests.map(pending,
                                  exception_handler=self._exception_handler)
        # A plain loop: the calls are made for their side effect only.
        for resource, response in zip(self.resourceList, responses):
            resource.injest_response(response)
        return self.resourceList
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment