Skip to content

Instantly share code, notes, and snippets.

@masroore
Created August 5, 2014 05:47
Show Gist options
  • Save masroore/36da7c02ce8c1d5b79b1 to your computer and use it in GitHub Desktop.
import pycurl
import urllib
import copy
import re
import signal
import os
import random
from urlparse import urlsplit
from libpy.html import detect_encoding
# TODO:
# fetching the binary content even with unicode=True fails - PIL couldn't load png file
# fetched with Grab
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
# Comments: http://curl.haxx.se/mail/curlpython-2005-06/0004.html
# Ignore SIGPIPE so that pycurl.NOSIGNAL works correctly (see the libcurl
# tutorial referenced above).  Some platforms have no SIGPIPE at all
# (ImportError), and signal() may only be called from the main thread
# (ValueError on Python 2.5); both situations are safe to ignore.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(SIGPIPE, SIG_IGN)
except ImportError:
    # platform without SIGPIPE (e.g. Windows)
    pass
except ValueError:
    # signal only works in the main thread
    pass
class Error(pycurl.error):
    """Used to indicate a network error.  The same as pycurl.error."""
class SiteError(Error):
    """
    Used to indicate an error of the remote resource.

    Useful, for example, when we query a server whose name cannot
    be resolved.
    """
def get(url, config=None, soup=False):
    """Fetch *url* with a throwaway Grab instance.

    Returns the response body, or a BeautifulSoup tree of it when
    *soup* is true.  Extra options may be passed as a *config* dict.
    """
    grabber = Grab()
    grabber.setup('url', url)
    if config:
        grabber.setup(config)
    grabber.run()
    return grabber.soup if soup else grabber.body
class Grab:
    """Fancy wrapper for the pycurl library.

    Typical use::

        g = Grab()
        g.setup('url', 'http://example.com/')
        g.run()
        data = g.body
    """

    def __init__(self):
        self.timeout = 20
        self.logFile = None          # optional path of a request/response dump
        self.config = {}             # options accumulated via setup()
        self._bodyCallbacks = []     # extra callbacks fed with body chunks
        self.debug = False
        self.lastError = None
        self.freshPostData = False   # True when 'post' was set for the next request
        self.cookies_map = {}        # host -> {cookie name: value}
        self.oldUrl = None           # URL of the previous request (autoreferer)
        self.auto_cookies = False
        self.generate_client_profile()
        self.head = ''               # raw response headers
        self.body = ''               # raw (or decoded) response body
        self.headers = {}            # parsed response headers
        self.cookies = {}            # cookies parsed from the last response
        self.unicode = True          # decode the body via decode_body() after fetch
        self.encoding = None         # encoding detected by decode_body()
        self.use_tidy = False
        self.out_headers = None      # outgoing headers captured via DEBUGFUNCTION
        self.max_redirects = 5

    def generate_client_profile(self):
        """Randomize default headers and user-agent to mimic a real browser."""
        self.default_headers = {
            'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language': 'ru,en-us;q=0.%(x)d,en;q=0.3;%(lang)s' % {'x': random.randint(5, 9),
                                                                         'lang': random.choice(['ua', 'gb', 'uk'])},
            'Accept-Charset': 'utf-8,windows-1251;q=0.%(x)d,*;q=0.%(x)d' % {'x': random.randint(5, 9)}
        }
        self.default_user_agent = random.choice(useragents)

    def _bodyCallback(self, data):
        """pycurl WRITEFUNCTION: accumulate the answer body.

        Returning a value != len(data) makes curl abort the transfer with
        CURLE_WRITE_ERROR (23), which run() treats as an expected error.
        """
        if self.nobody:
            return 0
        self.body = self.body + data
        if self.maxsize and len(self.body) > self.maxsize:
            return 0
        for callback in self._bodyCallbacks:
            if not callback(data):
                return 0
        return len(data)

    def _headCallback(self, data):
        """pycurl HEADERFUNCTION: accumulate the answer headers."""
        if self.nohead:
            return 0
        self.head = self.head + data
        return len(data)

    def _debug_callback(self, type, data):
        # 2 == CURLINFO_HEADER_OUT; some pycurl builds do not export the
        # constant, hence the magic number.
        if type == 2:
            self.out_headers = data

    def request(self):
        """Run prepared curl request"""
        self.curl.perform()
        self.curl.close()

    def setup(self, name, value=None):
        """
        Configure curl request. Arguments variants:
        1. name - option name, value - option value
        2. name is dictionary, value is None
        """
        if isinstance(name, dict):
            for key, val in name.items():
                self.setup(key, val)
        else:
            if 'post' == name:
                # remember that the POST data is fresh so _prepare() does
                # not reuse it for a later request to a different URL
                self.freshPostData = True
            self.config[name] = value

    def _changeState(self, name, value):
        """Apply one configuration option to the internal pycurl instance."""
        if isinstance(name, int):
            # a raw curl option constant: pass it straight through
            # (BUGFIX: previously fell through to "unknown option")
            self.curl.setopt(name, value)
            return
        if isinstance(name, dict):
            for key in name:
                self.setup(key, name[key])
            return
        if 'post' == name:
            if value:
                self.curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(value))
            else:
                self.curl.setopt(pycurl.HTTPGET, 1)
        elif 'logfile' == name:
            self.logFile = value
        elif 'url' == name:
            self.curl.setopt(pycurl.URL, str(value))
        elif 'proxy' == name:
            self.curl.setopt(pycurl.PROXY, value if value else '')
        elif 'timeout' == name:
            self.curl.setopt(pycurl.TIMEOUT, value)
        elif 'connect_timeout' == name:
            self.curl.setopt(pycurl.CONNECTTIMEOUT, value)
        elif 'referer' == name:
            self.curl.setopt(pycurl.REFERER, str(value))
        elif 'cookies' == name:
            # BUGFIX: do not shadow this method's own name/value arguments
            for cookie_name, cookie_value in value.items():
                self.register_cookie(cookie_name, cookie_value)
        elif 'autocookies' == name:
            pass  # consumed in _prepare()
        elif 'nobody' == name:
            if True == value:
                self.nobody = True
        elif 'nohead' == name:
            if True == value:
                self.nohead = True
        elif 'maxsize' == name:
            self.maxsize = value
        elif 'redirect' == name:
            self.curl.setopt(pycurl.FOLLOWLOCATION, value)
        elif 'max_redirects' == name:
            self.curl.setopt(pycurl.MAXREDIRS, value)
        elif 'userpwd' == name:
            self.curl.setopt(pycurl.USERPWD, value)
        elif 'bodyCallback' == name:
            # BUGFIX: the original tested isinstance(name, ...), but name is
            # always the string 'bodyCallback' here -- test the value
            if isinstance(value, (list, tuple)):
                self._bodyCallbacks = value
            else:
                self._bodyCallbacks.append(value)
        elif 'user_agent' == name:
            self.curl.setopt(pycurl.USERAGENT, value)
        elif 'headers' == name:
            self.curl.setopt(pycurl.HTTPHEADER, ['%s: %s' % (a, b) for a, b in value.items()])
        elif 'autoreferer' == name:
            if not 'referer' in self.config:
                if not self.oldUrl is None:
                    self.curl.setopt(pycurl.REFERER, str(self.oldUrl))
        elif 'unicode' == name:
            self.unicode = bool(value)
        elif 'use_tidy' == name:
            self.use_tidy = bool(value)
        elif 'gzip' == name:
            self.gzip = value
        elif 'debug' == name:
            self.curl.setopt(pycurl.VERBOSE, value)
        else:
            raise Exception("unknown option: %s" % name)

    def _prepare(self):
        """Create and configure a fresh pycurl handle before a request."""
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.TIMEOUT, self.timeout)
        self.curl.setopt(pycurl.CONNECTTIMEOUT, self.timeout)
        self.curl.setopt(pycurl.MAXREDIRS, self.max_redirects)
        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.WRITEFUNCTION, self._bodyCallback)
        self.curl.setopt(pycurl.HEADERFUNCTION, self._headCallback)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self._debug_callback)
        # reset per-request state
        self.head = ''
        self.body = ''
        self.headers = {}
        self.cookies = {}
        self.maxsize = 0
        self.nobody = False
        self.nohead = False
        self.lastError = ''
        self.encoding = None
        if not 'user_agent' in self.config:
            self.config['user_agent'] = self.default_user_agent
        # Set up default headers if they do not exist
        headers = self.config.setdefault('headers', {})
        for header, value in self.default_headers.items():
            if not header in headers:
                headers[header] = value
        if self.config.get('gzip'):
            # BUGFIX: the original re-tested the stale loop variable `header`
            if 'Accept-Encoding' not in headers:
                headers['Accept-Encoding'] = 'gzip'
        for name, value in self.config.items():
            self._changeState(name, value)
        # If autocookies mode is enabled then use all registered cookies for
        # this domain, else use cookies given in setup calls (if any)
        cookies = ()
        if self.config.get('autocookies'):
            cookies = self.get_registered_cookies()
        elif self.config.get('cookies'):
            cookies = self.config['cookies']
        if cookies:
            parts = []
            for name, value in cookies.items():
                parts.append('%s=%s;' % (urllib.quote_plus(name),
                                         urllib.quote_plus(value)))
            self.curl.setopt(pycurl.COOKIE, ''.join(parts))
        # If we query a new url we must reset old post data if it was not
        # defined for the new url, because it is still stored in self.config
        if self.oldUrl != self.config['url']:
            if not self.freshPostData:
                self.curl.setopt(pycurl.HTTPGET, 1)
        self.freshPostData = False

    def run(self):
        """Perform the request.  Raises SiteError/Error on network failure."""
        self._prepare()
        try:
            self.curl.perform()
        except pycurl.error as err:
            # CURLE_WRITE_ERROR (23): an error was returned to libcurl from
            # a write callback.  Our callbacks abort on purpose (nobody,
            # maxsize), so this is expected and ignored.
            if 23 == err.args[0]:
                pass
            else:
                self._finish()
                self.lastError = err
                # 6  - could not resolve host
                # 47 - too many redirects
                # 52 - nothing was returned from the server
                # 58 - problem with the local client certificate
                # 59 - couldn't use specified cipher
                # 60 - problem with the CA cert (path? access rights?)
                if err.args[0] in (6, 47, 52, 58, 59, 60):
                    raise SiteError(err)
                raise Error(err)
        self._finish()

    def _finish(self):
        """Parse response headers/cookies and post-process the body."""
        self.oldUrl = self.config['url']
        if self.maxsize:
            self.body = self.body[0:self.maxsize]
        if self.logFile:
            # BUGFIX: close the log file instead of leaking the handle
            log = open(self.logFile, 'w')
            try:
                log.write(self.config['url'] + '\n' +
                          self.curl.errstr() + '\n' +
                          self.head + '\n' + self.body)
            finally:
                log.close()
        for line in re.split('\r?\n', self.head):
            try:
                name, value = line.split(': ', 1)
            except ValueError:
                # status line, blank line, or malformed header
                continue
            if 'Set-Cookie' == name:
                # BUGFIX: '*' inside the group so an empty cookie value
                # yields '' instead of None (None broke quote_plus later)
                match = re.search('^([^=]+)=([^;]*)', value)
                if match:
                    self.cookies[match.group(1)] = match.group(2)
            else:
                self.headers[name] = value
        for name, value in self.cookies.items():
            self.register_cookie(name, value)
        if self.headers.get('Content-Encoding') == 'gzip':
            import StringIO
            import gzip
            gzipper = gzip.GzipFile(fileobj=StringIO.StringIO(self.body))
            self.body = gzipper.read()
        if self.unicode:
            self.decode_body()
        if self.use_tidy:
            if not self.unicode:
                raise Exception('`use_tidy` options requires `unicode` option but it is off now')
            else:
                self.apply_tidy()

    def decode_body(self):
        """Decode self.body to unicode using the detected encoding."""
        encoding = detect_encoding(self.body, headers=self.headers)
        self.encoding = encoding
        if encoding:
            self.body = self.body.decode(encoding)
        else:
            # TODO: choose the proper way for handling case of unknown encoding
            raise Exception('Could not determine encoding')

    def apply_tidy(self):
        """Clean the decoded body up with the tidy library."""
        import tidy
        self.original_body = self.body
        data = self.body.encode('utf-8')
        options = dict(
            output_xhtml=1,
            show_body_only=0,
            force_output=1,
            char_encoding='utf8')
        data = str(tidy.parseString(data, **options))
        self.body = data.decode('utf-8')

    def getinfo(self, key):
        """Proxy for curl.getinfo; *key* is a constant name like 'RESPONSE_CODE'."""
        return self.curl.getinfo(getattr(pycurl, key))

    def errstr(self):
        """Get request error text."""
        # BUGFIX: the original forgot the return statement
        return self.curl.errstr()

    def getConfig(self, name):
        """Return a configured option value, or '' if it was never set."""
        try:
            return self.config[name]
        except KeyError:
            return ''

    def code(self):
        """HTTP response code of the last request."""
        return self.getinfo('RESPONSE_CODE')

    def get_current_host(self):
        """Host part of the configured URL; used as the cookie-jar key."""
        # BUGFIX: the original returned only the last domain label
        # (e.g. 'com'), which made unrelated sites share cookies
        return urlsplit(self.config['url'])[1]

    def register_cookie(self, name, value):
        """Remember a cookie for the current host."""
        self.cookies_map.setdefault(self.get_current_host(), {})[name] = value

    def get_registered_cookies(self):
        """Return all cookies registered for the current host."""
        return self.cookies_map.get(self.get_current_host(), {})

    @property
    def soup(self):
        """BeautifulSoup tree built from the response body."""
        from BeautifulSoup import BeautifulSoup
        return BeautifulSoup(self.body)
# Pool of real-world browser User-Agent strings (MSIE, Firefox, Opera);
# generate_client_profile() picks one at random per Grab instance.
useragents = (
    'Mozilla/4.0 (compatible; MSIE 6.0; MSN 2.5; Windows 98)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 1.1.4322; Media Center PC 4.0; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Win32)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; Arcor 5.005; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; YPC 3.0.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.8) Gecko/20050511',
    'Mozilla/5.0 (X11; U; Linux i686; cs-CZ; rv:1.7.12) Gecko/20050929',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
    'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.7.8) Gecko/20050609 Firefox/1.0.4',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.9) Gecko/20050711 Firefox/1.0.5',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.10) Gecko/20050716 Firefox/1.0.6',
    'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl; rv:1.8) Gecko/20051107 Firefox/1.5',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.1) Gecko/20060111 Firefox/1.5.0.1',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.2) Gecko/20060308 Firefox/1.5.0.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.6) Gecko/20060808 Fedora/1.5.0.6-2.fc5 Firefox/1.5.0.6 pango-text',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.0.7) Gecko/20060909 Firefox/1.5.0.7',
    'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.8.1) Gecko/20060601 Firefox/2.0 (Ubuntu-edgy)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20070220 Firefox/2.0.0.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20070221 SUSE/2.0.0.2-6.1 Firefox/2.0.0.2',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.9a1) Gecko/20061204 GranParadiso/3.0a1',
    'Opera/8.0 (X11; Linux i686; U; cs)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50',
    'Mozilla/5.0 (Windows NT 5.1; U; en) Opera 8.50',
    'Opera/8.51 (Windows NT 5.1; U; en)',
    'Opera/9.0 (Windows NT 5.1; U; en)',
    'Opera/9.01 (X11; Linux i686; U; en)',
    'Opera/9.02 (Windows NT 5.1; U; en)',
    'Opera/9.10 (Windows NT 5.1; U; en)',
    'Opera/9.23 (Windows NT 5.1; U; ru)',
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment