jvanasco/encodings_test.py

## encodings_test.py
 # -*- coding: utf-8 -*-
from __future__ import print_function
"""
This test harness showcases an odd scenario when providing compatibility
with Python2 and Python3 data.

The input to a function is a URL, which in Python2 might have been:

    url_unicode = u'http://➡.ws/♥'
    url_string =   'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
    url_bytes =    'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'    # b prefix is allowed

or in Python3 as:

    url_unicode =  'http://➡.ws/♥'                         # u prefix is allowed
    url_string =   'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
    url_bytes =   b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'

While these all represent the same url in different encodings, it is a bit of
a hassle to ensure the correct decoding/encoding of the input when it is unknown
across both Python2 and Python3  (see `test_unknown_input`).

This should be an edge case for most people.  This only popped up because a test
suite failed in a python2-3 port that ensured a handful of encodings/decodings
would create the same output, as it was in a Service Oriented Architecture
application and the input could be from another system.


Expected results

    python2 encodings_test.py

        pass: TestEncodings.test_unknown_input
              TestEncodings.test_w3lib_to_unicode__fail3_expected
        fail: TestEncodings.test_w3lib_bypass_1__fail23_expected
              TestEncodings.test_w3lib_bypass_2__fail23_expected

    python3 encodings_test.py

        pass: TestEncodings.test_unknown_input
        fail: TestEncodings.test_w3lib_to_unicode__fail3_expected
              TestEncodings.test_w3lib_bypass_1__fail23_expected
              TestEncodings.test_w3lib_bypass_2__fail23_expected

"""

import unittest
import six
import pdb
from w3lib.url import safe_url_string


# ==============================================================================


def is_ascii(s):
    """Return True when every element of `s` has an ordinal below 128.

    Works for text as well as ``bytes``/``bytearray``.  Iterating a bytes
    object on Python 3 yields ints (``ord`` would raise TypeError on them),
    so ints are compared directly and only characters go through ``ord``.
    An empty input is considered ASCII.
    """
    return all((c if isinstance(c, int) else ord(c)) < 128 for c in s)


def w3lib_to_unicode(text, encoding=None, errors='strict'):
    """Decode `text` to unicode; unicode input is handed back untouched.

    Local copy of the logic of w3lib's ``to_unicode`` helper
    (https://github.com/scrapy/w3lib/blob/master/w3lib/util.py), included
    for test purposes: it is where the error in `safe_url_string` traces
    down to.

    :param text: a unicode or bytes object.
    :param encoding: codec used to decode bytes; ``None`` means utf-8.
    :param errors: error-handling scheme forwarded to ``decode``.
    :raises TypeError: if `text` is neither unicode nor bytes.
    """
    if isinstance(text, six.text_type):
        return text
    if isinstance(text, bytes):
        codec = 'utf-8' if encoding is None else encoding
        return text.decode(codec, errors)
    raise TypeError('to_unicode must receive a bytes, str or unicode '
                    'object, got %s' % type(text).__name__)


class TestEncodings(unittest.TestCase):
    """Push one URL, in three representations, through several conversions.

    `test_unknown_input` is the only test expected to pass on both Python 2
    and Python 3.  The ``*_expected`` tests record approaches that do not
    normalise every input; mismatches are reported with ``self.fail`` so
    unittest counts them as failures rather than errors.
    """

    url_unicode = u'http://➡.ws/♥'
    url_string = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
    url_bytes = b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
    url_safe = 'http://xn--hgi.ws/%E2%99%A5'
    # not referenced by any test in this class
    url_bad_1 = 'http://â\x9e¡.ws/â\x99¥'

    # (label, value) pairs iterated by every test
    inputs = (('unicode', url_unicode, ),
              ('string', url_string, ),
              ('url_bytes', url_bytes, ),
              )

    def test_unknown_input(self):
        """Normalise each input so `safe_url_string` returns `url_safe`.

        Python 2 distinguishes unicode from str, Python 3 distinguishes str
        from bytes, so each interpreter needs its own normalisation.
        """
        for _label, value in self.inputs:
            if not six.PY3:
                # Python 2: utf-8 encode unicode, pass byte strings through
                if isinstance(value, six.text_type):
                    candidate = value.encode('utf-8')
                else:
                    candidate = value
            elif isinstance(value, bytes):
                candidate = value
            else:
                # Python 3 str: `raw_unicode_escape` writes code points above
                # 0xFF as ASCII \uXXXX escapes and the rest as single bytes,
                # so the round trip is pure ASCII only for real unicode text.
                unescaped = value.encode('raw_unicode_escape').decode()
                if is_ascii(unescaped):
                    # input: self.url_unicode
                    candidate = value.encode('utf-8')
                else:
                    # input: self.url_string
                    candidate = unescaped

            self.assertEqual(safe_url_string(candidate), self.url_safe)

    def test_w3lib_to_unicode__fail3_expected(self):
        """Check that `w3lib_to_unicode` maps every input to `url_unicode`."""
        fails = []
        for label, value in self.inputs:
            result = w3lib_to_unicode(value)
            if result != self.url_unicode:
                fails.append((label, value, result))
        if fails:
            self.fail("result != self.url_unicode: %s" % (fails,))

    def _bypass_failures(self):
        """Return ``(label, value, converted)`` for each mismatching input.

        Text is utf-8 encoded; binary data is decoded with the default
        codec, falling back to the raw value when decoding fails.  The
        converted value is then compared against `url_unicode`.
        """
        fails = []
        for label, value in self.inputs:
            converted = None
            if isinstance(value, six.text_type):
                converted = value.encode('utf-8')
            elif isinstance(value, six.binary_type):
                try:
                    converted = value.decode()
                except UnicodeDecodeError:  # Py2 catch
                    converted = value
            if converted != self.url_unicode:
                fails.append((label, value, converted))
        return fails

    def test_w3lib_bypass_1__fail23_expected(self):
        r"""
        Convert the inputs without `w3lib_to_unicode`.

        in this example:
            unicode b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
            string b'http://\xc3\xa2\xc2\x9e\xc2\xa1.ws/\xc3\xa2\xc2\x99\xc2\xa5'
            url_bytes http://➡.ws/♥
        """
        fails = self._bypass_failures()
        if fails:
            self.fail("result != self.url_unicode: %s" % (fails,))

    def test_w3lib_bypass_2__fail23_expected(self):
        """
        Convert the inputs without `w3lib_to_unicode`.

        NOTE(review): the conversion is identical to
        `test_w3lib_bypass_1__fail23_expected`, yet the sample output below
        differs from that test's sample -- confirm whether a different
        conversion was intended here.

        in this example:
            unicode http://➡.ws/♥
            string http://â¡.ws/â¥
            url_bytes http://➡.ws/♥
        """
        fails = self._bypass_failures()
        if fails:
            self.fail("result != self.url_unicode: %s" % (fails,))


# Run the test cases when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
	# -*- coding: utf-8 -*-
	from __future__ import print_function
	"""
	This test harness showcases an odd scenario when providing compatibility
	with Python2 and Python3 data.

	The input to a function is a URL, which in Python2 might have been:

	url_unicode = u'http://➡.ws/♥'
	url_string = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
	url_bytes = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5' # b prefix is allowed

	or in Python3 as:

	url_unicode = 'http://➡.ws/♥' # u prefix is allowed
	url_string = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
	url_bytes = b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'

	While these all represent the same url in different encodings, it is a bit of
	a hassle to ensure the correct decoding/encoding of the input when it is unknown
	across both Python2 and Python3 (see `test_unknown_input`).

	This should be an edge case for most people. This only popped up because a test
	suite failed in a python2-3 port that ensured a handful of encodings/decodings
	would create the same output, as it was in a Service Oriented Architecture
	application and the input could be from another system.


	Expected results

	python2 encodings_test.py

	pass: TestEncodings.test_unknown_input
	TestEncodings.test_w3lib_to_unicode__fail3_expected
	fail: TestEncodings.test_w3lib_bypass_1__fail23_expected
	TestEncodings.test_w3lib_bypass_2__fail23_expected

	python3 encodings_test.py

	pass: TestEncodings.test_unknown_input
	fail: TestEncodings.test_w3lib_to_unicode__fail3_expected
	TestEncodings.test_w3lib_bypass_1__fail23_expected
	TestEncodings.test_w3lib_bypass_2__fail23_expected

	"""

	import unittest
	import six
	import pdb
	from w3lib.url import safe_url_string


	# ==============================================================================


	def is_ascii(s):
	    """Return True when every element of `s` has an ordinal below 128.

	    Works for text as well as ``bytes``/``bytearray``.  Iterating a bytes
	    object on Python 3 yields ints (``ord`` would raise TypeError on them),
	    so ints are compared directly and only characters go through ``ord``.
	    An empty input is considered ASCII.
	    """
	    return all((c if isinstance(c, int) else ord(c)) < 128 for c in s)


	def w3lib_to_unicode(text, encoding=None, errors='strict'):
	    """Decode `text` to unicode; unicode input is handed back untouched.

	    Local copy of the logic of w3lib's ``to_unicode`` helper
	    (https://github.com/scrapy/w3lib/blob/master/w3lib/util.py), included
	    for test purposes: it is where the error in `safe_url_string` traces
	    down to.

	    :param text: a unicode or bytes object.
	    :param encoding: codec used to decode bytes; ``None`` means utf-8.
	    :param errors: error-handling scheme forwarded to ``decode``.
	    :raises TypeError: if `text` is neither unicode nor bytes.
	    """
	    if isinstance(text, six.text_type):
	        return text
	    if isinstance(text, bytes):
	        codec = 'utf-8' if encoding is None else encoding
	        return text.decode(codec, errors)
	    raise TypeError('to_unicode must receive a bytes, str or unicode '
	                    'object, got %s' % type(text).__name__)


	class TestEncodings(unittest.TestCase):
	    """Push one URL, in three representations, through several conversions.

	    `test_unknown_input` is the only test expected to pass on both Python 2
	    and Python 3.  The ``*_expected`` tests record approaches that do not
	    normalise every input; mismatches are reported with ``self.fail`` so
	    unittest counts them as failures rather than errors.
	    """

	    url_unicode = u'http://➡.ws/♥'
	    url_string = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
	    url_bytes = b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
	    url_safe = 'http://xn--hgi.ws/%E2%99%A5'
	    # not referenced by any test in this class
	    url_bad_1 = 'http://â\x9e¡.ws/â\x99¥'

	    # (label, value) pairs iterated by every test
	    inputs = (('unicode', url_unicode, ),
	              ('string', url_string, ),
	              ('url_bytes', url_bytes, ),
	              )

	    def test_unknown_input(self):
	        """Normalise each input so `safe_url_string` returns `url_safe`.

	        Python 2 distinguishes unicode from str, Python 3 distinguishes str
	        from bytes, so each interpreter needs its own normalisation.
	        """
	        for _label, value in self.inputs:
	            if not six.PY3:
	                # Python 2: utf-8 encode unicode, pass byte strings through
	                if isinstance(value, six.text_type):
	                    candidate = value.encode('utf-8')
	                else:
	                    candidate = value
	            elif isinstance(value, bytes):
	                candidate = value
	            else:
	                # Python 3 str: `raw_unicode_escape` writes code points above
	                # 0xFF as ASCII \uXXXX escapes and the rest as single bytes,
	                # so the round trip is pure ASCII only for real unicode text.
	                unescaped = value.encode('raw_unicode_escape').decode()
	                if is_ascii(unescaped):
	                    # input: self.url_unicode
	                    candidate = value.encode('utf-8')
	                else:
	                    # input: self.url_string
	                    candidate = unescaped

	            self.assertEqual(safe_url_string(candidate), self.url_safe)

	    def test_w3lib_to_unicode__fail3_expected(self):
	        """Check that `w3lib_to_unicode` maps every input to `url_unicode`."""
	        fails = []
	        for label, value in self.inputs:
	            result = w3lib_to_unicode(value)
	            if result != self.url_unicode:
	                fails.append((label, value, result))
	        if fails:
	            self.fail("result != self.url_unicode: %s" % (fails,))

	    def _bypass_failures(self):
	        """Return ``(label, value, converted)`` for each mismatching input.

	        Text is utf-8 encoded; binary data is decoded with the default
	        codec, falling back to the raw value when decoding fails.  The
	        converted value is then compared against `url_unicode`.
	        """
	        fails = []
	        for label, value in self.inputs:
	            converted = None
	            if isinstance(value, six.text_type):
	                converted = value.encode('utf-8')
	            elif isinstance(value, six.binary_type):
	                try:
	                    converted = value.decode()
	                except UnicodeDecodeError:  # Py2 catch
	                    converted = value
	            if converted != self.url_unicode:
	                fails.append((label, value, converted))
	        return fails

	    def test_w3lib_bypass_1__fail23_expected(self):
	        r"""
	        Convert the inputs without `w3lib_to_unicode`.

	        in this example:
	            unicode b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
	            string b'http://\xc3\xa2\xc2\x9e\xc2\xa1.ws/\xc3\xa2\xc2\x99\xc2\xa5'
	            url_bytes http://➡.ws/♥
	        """
	        fails = self._bypass_failures()
	        if fails:
	            self.fail("result != self.url_unicode: %s" % (fails,))

	    def test_w3lib_bypass_2__fail23_expected(self):
	        """
	        Convert the inputs without `w3lib_to_unicode`.

	        NOTE(review): the conversion is identical to
	        `test_w3lib_bypass_1__fail23_expected`, yet the sample output below
	        differs from that test's sample -- confirm whether a different
	        conversion was intended here.

	        in this example:
	            unicode http://➡.ws/♥
	            string http://â¡.ws/â¥
	            url_bytes http://➡.ws/♥
	        """
	        fails = self._bypass_failures()
	        if fails:
	            self.fail("result != self.url_unicode: %s" % (fails,))


	# Run the test cases when this file is executed as a script.
	if __name__ == '__main__':
	    unittest.main()