Skip to content

Instantly share code, notes, and snippets.

@masroore
Created August 5, 2014 05:47
Show Gist options
  • Save masroore/36da7c02ce8c1d5b79b1 to your computer and use it in GitHub Desktop.
import pycurl
import urllib
import copy
import re
import signal
import os
import random
from urlparse import urlsplit
from libpy.html import detect_encoding
# TODO:
# fetching the binary content even with unicode=True fails - PIL couldn't load png file
# fetched with Grab
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
# Comments: http://curl.haxx.se/mail/curlpython-2005-06/0004.html
# Ignore SIGPIPE so that pycurl.NOSIGNAL works correctly (see the libcurl
# tutorial referenced above).  Some platforms have no SIGPIPE at all
# (ImportError), and signal() may only be called from the main thread
# (ValueError on Python 2.5); both situations are safe to ignore.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(SIGPIPE, SIG_IGN)
except ImportError:
    # platform without SIGPIPE (e.g. Windows)
    pass
except ValueError:
    # signal only works in the main thread
    pass
class Error(pycurl.error):
    """Used to indicate a network error.  The same as pycurl.error."""
class SiteError(Error):
    """
    Used to indicate an error of the remote resource.

    Useful, for example, when we query a server whose name cannot
    be resolved.
    """
def get(url, config=None, soup=False):
    """Fetch *url* with a throwaway Grab instance.

    Returns the response body, or a BeautifulSoup tree of it when
    *soup* is true.  Extra options may be passed as a *config* dict.
    """
    grabber = Grab()
    grabber.setup('url', url)
    if config:
        grabber.setup(config)
    grabber.run()
    return grabber.soup if soup else grabber.body
class Grab:
    """Fancy wrapper for the pycurl library.

    Typical use::

        g = Grab()
        g.setup('url', 'http://example.com/')
        g.run()
        data = g.body
    """

    def __init__(self):
        self.timeout = 20
        self.logFile = None          # optional path of a request/response dump
        self.config = {}             # options accumulated via setup()
        self._bodyCallbacks = []     # extra callbacks fed with body chunks
        self.debug = False
        self.lastError = None
        self.freshPostData = False   # True when 'post' was set for the next request
        self.cookies_map = {}        # host -> {cookie name: value}
        self.oldUrl = None           # URL of the previous request (autoreferer)
        self.auto_cookies = False
        self.generate_client_profile()
        self.head = ''               # raw response headers
        self.body = ''               # raw (or decoded) response body
        self.headers = {}            # parsed response headers
        self.cookies = {}            # cookies parsed from the last response
        self.unicode = True          # decode the body via decode_body() after fetch
        self.encoding = None         # encoding detected by decode_body()
        self.use_tidy = False
        self.out_headers = None      # outgoing headers captured via DEBUGFUNCTION
        self.max_redirects = 5

    def generate_client_profile(self):
        """Randomize default headers and user-agent to mimic a real browser."""
        self.default_headers = {
            'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language': 'ru,en-us;q=0.%(x)d,en;q=0.3;%(lang)s' % {'x': random.randint(5, 9),
                                                                         'lang': random.choice(['ua', 'gb', 'uk'])},
            'Accept-Charset': 'utf-8,windows-1251;q=0.%(x)d,*;q=0.%(x)d' % {'x': random.randint(5, 9)}
        }
        self.default_user_agent = random.choice(useragents)

    def _bodyCallback(self, data):
        """pycurl WRITEFUNCTION: accumulate the answer body.

        Returning a value != len(data) makes curl abort the transfer with
        CURLE_WRITE_ERROR (23), which run() treats as an expected error.
        """
        if self.nobody:
            return 0
        self.body = self.body + data
        if self.maxsize and len(self.body) > self.maxsize:
            return 0
        for callback in self._bodyCallbacks:
            if not callback(data):
                return 0
        return len(data)

    def _headCallback(self, data):
        """pycurl HEADERFUNCTION: accumulate the answer headers."""
        if self.nohead:
            return 0
        self.head = self.head + data
        return len(data)

    def _debug_callback(self, type, data):
        # 2 == CURLINFO_HEADER_OUT; some pycurl builds do not export the
        # constant, hence the magic number.
        if type == 2:
            self.out_headers = data

    def request(self):
        """Run prepared curl request"""
        self.curl.perform()
        self.curl.close()

    def setup(self, name, value=None):
        """
        Configure curl request. Arguments variants:
        1. name - option name, value - option value
        2. name is dictionary, value is None
        """
        if isinstance(name, dict):
            for key, val in name.items():
                self.setup(key, val)
        else:
            if 'post' == name:
                # remember that the POST data is fresh so _prepare() does
                # not reuse it for a later request to a different URL
                self.freshPostData = True
            self.config[name] = value

    def _changeState(self, name, value):
        """Apply one configuration option to the internal pycurl instance."""
        if isinstance(name, int):
            # a raw curl option constant: pass it straight through
            # (BUGFIX: previously fell through to "unknown option")
            self.curl.setopt(name, value)
            return
        if isinstance(name, dict):
            for key in name:
                self.setup(key, name[key])
            return
        if 'post' == name:
            if value:
                self.curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(value))
            else:
                self.curl.setopt(pycurl.HTTPGET, 1)
        elif 'logfile' == name:
            self.logFile = value
        elif 'url' == name:
            self.curl.setopt(pycurl.URL, str(value))
        elif 'proxy' == name:
            self.curl.setopt(pycurl.PROXY, value if value else '')
        elif 'timeout' == name:
            self.curl.setopt(pycurl.TIMEOUT, value)
        elif 'connect_timeout' == name:
            self.curl.setopt(pycurl.CONNECTTIMEOUT, value)
        elif 'referer' == name:
            self.curl.setopt(pycurl.REFERER, str(value))
        elif 'cookies' == name:
            # BUGFIX: do not shadow this method's own name/value arguments
            for cookie_name, cookie_value in value.items():
                self.register_cookie(cookie_name, cookie_value)
        elif 'autocookies' == name:
            pass  # consumed in _prepare()
        elif 'nobody' == name:
            if True == value:
                self.nobody = True
        elif 'nohead' == name:
            if True == value:
                self.nohead = True
        elif 'maxsize' == name:
            self.maxsize = value
        elif 'redirect' == name:
            self.curl.setopt(pycurl.FOLLOWLOCATION, value)
        elif 'max_redirects' == name:
            self.curl.setopt(pycurl.MAXREDIRS, value)
        elif 'userpwd' == name:
            self.curl.setopt(pycurl.USERPWD, value)
        elif 'bodyCallback' == name:
            # BUGFIX: the original tested isinstance(name, ...), but name is
            # always the string 'bodyCallback' here -- test the value
            if isinstance(value, (list, tuple)):
                self._bodyCallbacks = value
            else:
                self._bodyCallbacks.append(value)
        elif 'user_agent' == name:
            self.curl.setopt(pycurl.USERAGENT, value)
        elif 'headers' == name:
            self.curl.setopt(pycurl.HTTPHEADER, ['%s: %s' % (a, b) for a, b in value.items()])
        elif 'autoreferer' == name:
            if not 'referer' in self.config:
                if not self.oldUrl is None:
                    self.curl.setopt(pycurl.REFERER, str(self.oldUrl))
        elif 'unicode' == name:
            self.unicode = bool(value)
        elif 'use_tidy' == name:
            self.use_tidy = bool(value)
        elif 'gzip' == name:
            self.gzip = value
        elif 'debug' == name:
            self.curl.setopt(pycurl.VERBOSE, value)
        else:
            raise Exception("unknown option: %s" % name)

    def _prepare(self):
        """Create and configure a fresh pycurl handle before a request."""
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.TIMEOUT, self.timeout)
        self.curl.setopt(pycurl.CONNECTTIMEOUT, self.timeout)
        self.curl.setopt(pycurl.MAXREDIRS, self.max_redirects)
        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.WRITEFUNCTION, self._bodyCallback)
        self.curl.setopt(pycurl.HEADERFUNCTION, self._headCallback)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self._debug_callback)
        # reset per-request state
        self.head = ''
        self.body = ''
        self.headers = {}
        self.cookies = {}
        self.maxsize = 0
        self.nobody = False
        self.nohead = False
        self.lastError = ''
        self.encoding = None
        if not 'user_agent' in self.config:
            self.config['user_agent'] = self.default_user_agent
        # Set up default headers if they do not exist
        headers = self.config.setdefault('headers', {})
        for header, value in self.default_headers.items():
            if not header in headers:
                headers[header] = value
        if self.config.get('gzip'):
            # BUGFIX: the original re-tested the stale loop variable `header`
            if 'Accept-Encoding' not in headers:
                headers['Accept-Encoding'] = 'gzip'
        for name, value in self.config.items():
            self._changeState(name, value)
        # If autocookies mode is enabled then use all registered cookies for
        # this domain, else use cookies given in setup calls (if any)
        cookies = ()
        if self.config.get('autocookies'):
            cookies = self.get_registered_cookies()
        elif self.config.get('cookies'):
            cookies = self.config['cookies']
        if cookies:
            parts = []
            for name, value in cookies.items():
                parts.append('%s=%s;' % (urllib.quote_plus(name),
                                         urllib.quote_plus(value)))
            self.curl.setopt(pycurl.COOKIE, ''.join(parts))
        # If we query a new url we must reset old post data if it was not
        # defined for the new url, because it is still stored in self.config
        if self.oldUrl != self.config['url']:
            if not self.freshPostData:
                self.curl.setopt(pycurl.HTTPGET, 1)
        self.freshPostData = False

    def run(self):
        """Perform the request.  Raises SiteError/Error on network failure."""
        self._prepare()
        try:
            self.curl.perform()
        except pycurl.error as err:
            # CURLE_WRITE_ERROR (23): an error was returned to libcurl from
            # a write callback.  Our callbacks abort on purpose (nobody,
            # maxsize), so this is expected and ignored.
            if 23 == err.args[0]:
                pass
            else:
                self._finish()
                self.lastError = err
                # 6  - could not resolve host
                # 47 - too many redirects
                # 52 - nothing was returned from the server
                # 58 - problem with the local client certificate
                # 59 - couldn't use specified cipher
                # 60 - problem with the CA cert (path? access rights?)
                if err.args[0] in (6, 47, 52, 58, 59, 60):
                    raise SiteError(err)
                raise Error(err)
        self._finish()

    def _finish(self):
        """Parse response headers/cookies and post-process the body."""
        self.oldUrl = self.config['url']
        if self.maxsize:
            self.body = self.body[0:self.maxsize]
        if self.logFile:
            # BUGFIX: close the log file instead of leaking the handle
            log = open(self.logFile, 'w')
            try:
                log.write(self.config['url'] + '\n' +
                          self.curl.errstr() + '\n' +
                          self.head + '\n' + self.body)
            finally:
                log.close()
        for line in re.split('\r?\n', self.head):
            try:
                name, value = line.split(': ', 1)
            except ValueError:
                # status line, blank line, or malformed header
                continue
            if 'Set-Cookie' == name:
                # BUGFIX: '*' inside the group so an empty cookie value
                # yields '' instead of None (None broke quote_plus later)
                match = re.search('^([^=]+)=([^;]*)', value)
                if match:
                    self.cookies[match.group(1)] = match.group(2)
            else:
                self.headers[name] = value
        for name, value in self.cookies.items():
            self.register_cookie(name, value)
        if self.headers.get('Content-Encoding') == 'gzip':
            import StringIO
            import gzip
            gzipper = gzip.GzipFile(fileobj=StringIO.StringIO(self.body))
            self.body = gzipper.read()
        if self.unicode:
            self.decode_body()
        if self.use_tidy:
            if not self.unicode:
                raise Exception('`use_tidy` options requires `unicode` option but it is off now')
            else:
                self.apply_tidy()

    def decode_body(self):
        """Decode self.body to unicode using the detected encoding."""
        encoding = detect_encoding(self.body, headers=self.headers)
        self.encoding = encoding
        if encoding:
            self.body = self.body.decode(encoding)
        else:
            # TODO: choose the proper way for handling case of unknown encoding
            raise Exception('Could not determine encoding')

    def apply_tidy(self):
        """Clean the decoded body up with the tidy library."""
        import tidy
        self.original_body = self.body
        data = self.body.encode('utf-8')
        options = dict(
            output_xhtml=1,
            show_body_only=0,
            force_output=1,
            char_encoding='utf8')
        data = str(tidy.parseString(data, **options))
        self.body = data.decode('utf-8')

    def getinfo(self, key):
        """Proxy for curl.getinfo; *key* is a constant name like 'RESPONSE_CODE'."""
        return self.curl.getinfo(getattr(pycurl, key))

    def errstr(self):
        """Get request error text."""
        # BUGFIX: the original forgot the return statement
        return self.curl.errstr()

    def getConfig(self, name):
        """Return a configured option value, or '' if it was never set."""
        try:
            return self.config[name]
        except KeyError:
            return ''

    def code(self):
        """HTTP response code of the last request."""
        return self.getinfo('RESPONSE_CODE')

    def get_current_host(self):
        """Host part of the configured URL; used as the cookie-jar key."""
        # BUGFIX: the original returned only the last domain label
        # (e.g. 'com'), which made unrelated sites share cookies
        return urlsplit(self.config['url'])[1]

    def register_cookie(self, name, value):
        """Remember a cookie for the current host."""
        self.cookies_map.setdefault(self.get_current_host(), {})[name] = value

    def get_registered_cookies(self):
        """Return all cookies registered for the current host."""
        return self.cookies_map.get(self.get_current_host(), {})

    @property
    def soup(self):
        """BeautifulSoup tree built from the response body."""
        from BeautifulSoup import BeautifulSoup
        return BeautifulSoup(self.body)
# Pool of real-world browser User-Agent strings (MSIE, Firefox, Opera);
# generate_client_profile() picks one at random per Grab instance.
useragents = (
    'Mozilla/4.0 (compatible; MSIE 6.0; MSN 2.5; Windows 98)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 1.1.4322; Media Center PC 4.0; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Win32)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; Arcor 5.005; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; YPC 3.0.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.8) Gecko/20050511',
    'Mozilla/5.0 (X11; U; Linux i686; cs-CZ; rv:1.7.12) Gecko/20050929',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
    'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.7.8) Gecko/20050609 Firefox/1.0.4',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.9) Gecko/20050711 Firefox/1.0.5',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.10) Gecko/20050716 Firefox/1.0.6',
    'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl; rv:1.8) Gecko/20051107 Firefox/1.5',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.1) Gecko/20060111 Firefox/1.5.0.1',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.2) Gecko/20060308 Firefox/1.5.0.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.6) Gecko/20060808 Fedora/1.5.0.6-2.fc5 Firefox/1.5.0.6 pango-text',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.0.7) Gecko/20060909 Firefox/1.5.0.7',
    'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.8.1) Gecko/20060601 Firefox/2.0 (Ubuntu-edgy)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20070220 Firefox/2.0.0.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20070221 SUSE/2.0.0.2-6.1 Firefox/2.0.0.2',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.9a1) Gecko/20061204 GranParadiso/3.0a1',
    'Opera/8.0 (X11; Linux i686; U; cs)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50',
    'Mozilla/5.0 (Windows NT 5.1; U; en) Opera 8.50',
    'Opera/8.51 (Windows NT 5.1; U; en)',
    'Opera/9.0 (Windows NT 5.1; U; en)',
    'Opera/9.01 (X11; Linux i686; U; en)',
    'Opera/9.02 (Windows NT 5.1; U; en)',
    'Opera/9.10 (Windows NT 5.1; U; en)',
    'Opera/9.23 (Windows NT 5.1; U; ru)',
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment