Skip to content

Instantly share code, notes, and snippets.

@jvanasco
Created May 8, 2019 15:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jvanasco/ae35ee9347c444d11ebd05844d1b2de4 to your computer and use it in GitHub Desktop.
Save jvanasco/ae35ee9347c444d11ebd05844d1b2de4 to your computer and use it in GitHub Desktop.
a test harness showing an edge case scenario with bytes and string encoding from a python2 to python3 port.
# -*- coding: utf-8 -*-
from __future__ import print_function
"""
This test harness showcases an odd scenario when providing compatibility
with Python2 and Python3 data.
The input to a function is a URL, which in Python2 might have been:
url_unicode = u'http://➡.ws/♥'
url_string = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
url_bytes = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5' # b prefix is allowed
or in Python3 as:
url_unicode = 'http://➡.ws/♥' # u prefix is allowed
url_string = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
url_bytes = b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
While these all represent the same url in different encodings, it is a bit of
a hassle to ensure the correct decoding/encoding of the input when it is unknown
across both Python2 and Python3 (see `test_unknown_input`).
This should be an edge case for most people. It only popped up because a test
suite failed in a python2-3 port that ensured a handful of encodings/decodings
would create the same output, as it was in a Service Oriented Architecture
application and the input could be from another system.
Expected results:

    python2 encodings_test.py
        pass: TestEncodings.test_unknown_input
              TestEncodings.test_w3lib_to_unicode__fail3_expected
        fail: TestEncodings.test_w3lib_bypass_1__fail23_expected
              TestEncodings.test_w3lib_bypass_2__fail23_expected

    python3 encodings_test.py
        pass: TestEncodings.test_unknown_input
        fail: TestEncodings.test_w3lib_to_unicode__fail3_expected
              TestEncodings.test_w3lib_bypass_1__fail23_expected
              TestEncodings.test_w3lib_bypass_2__fail23_expected
"""
import unittest
import six
import pdb
from w3lib.url import safe_url_string
# ==============================================================================
def is_ascii(s):
    """Return True when every element of `s` is in the 7-bit ASCII range.

    Accepts text as well as bytes-like input. Iterating `bytes` on Python 3
    yields integers, for which `ord()` raises TypeError, so bytes-like input
    is routed through `bytearray`, which yields integers on both Python 2
    and Python 3. Empty input is considered ASCII.

    :param s: a text string, `bytes` or `bytearray`
    :returns: bool
    """
    if isinstance(s, (bytes, bytearray)):
        # on Python 2 `bytes is str`, so native strings take this path too;
        # the result is the same as comparing `ord()` of each character
        return all(byte < 128 for byte in bytearray(s))
    return all(ord(char) < 128 for char in s)
def w3lib_to_unicode(text, encoding=None, errors='strict'):
    """Return `text` as unicode, decoding it first when it is a bytes object.

    Mirrors the behavior of `to_unicode` in w3lib
    (https://github.com/scrapy/w3lib/blob/master/w3lib/util.py), which is
    where the error in `safe_url_string` traces down to. It is reproduced
    here for test purposes.

    :param text: a unicode or bytes object
    :param encoding: codec used to decode bytes; `None` selects 'utf-8'
    :param errors: error handler handed to `decode`
    :returns: `text` itself when it is already unicode, else the decoded text
    :raises TypeError: when `text` is neither unicode nor bytes
    """
    # unicode passes straight through
    if isinstance(text, six.text_type):
        return text
    if isinstance(text, bytes):
        codec = 'utf-8' if encoding is None else encoding
        return text.decode(codec, errors)
    raise TypeError('to_unicode must receive a bytes, str or unicode '
                    'object, got %s' % type(text).__name__)
class TestEncodings(unittest.TestCase):
    """Exercise one URL, spelled three ways, under Python 2 and Python 3.

    Only `test_unknown_input` is meant to pass on both interpreters. The
    `*_expected` tests document conversions that raise `ValueError` on the
    interpreter(s) named in their suffix (`fail3`: Python 3 only, `fail23`:
    both).
    """

    # the same URL as a unicode literal, as a native `str` literal written
    # with \x escapes, and as a bytes literal
    url_unicode = u'http://➡.ws/♥'
    url_string = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
    url_bytes = b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
    # the value `safe_url_string` is expected to produce for every input
    url_safe = 'http://xn--hgi.ws/%E2%99%A5'
    # not referenced by any test in this class
    url_bad_1 = 'http://â\x9e¡.ws/â\x99¥'
    # (label, value) pairs iterated by every test
    inputs = (('unicode', url_unicode, ),
              ('string', url_string, ),
              ('url_bytes', url_bytes, ),
              )

    # debugging aid: set to True to print the failing inputs and drop into
    # pdb before the ValueError is raised
    _debug_failures = False

    def _raise_on_fails(self, label, fails):
        """Raise `ValueError` describing `fails`; do nothing when it is empty.

        :param label: name printed first when `_debug_failures` is enabled
        :param fails: list of `(input_type, input, result)` tuples
        :raises ValueError: when `fails` is not empty
        """
        if not fails:
            return
        if self._debug_failures:
            print(label)
            for fail in fails:
                print(fail[0], fail[1])
            pdb.set_trace()
        # wrap in a 1-tuple so `%` always formats the list as a single value
        raise ValueError("result != self.url_unicode: %s" % (fails,))

    def test_unknown_input(self):
        """Normalize each input so that `safe_url_string` returns `url_safe`.

        in python2 there is a difference between unicode and str; in python3
        there is not. in python3 there is a difference between str and bytes;
        in python2 there is not.
        """
        for (input_type, url_input) in self.inputs:
            if six.PY3:
                if isinstance(url_input, bytes):
                    # input: self.url_bytes
                    _url_candidate = url_input
                else:
                    # `raw_unicode_escape` keeps code points below 256 as
                    # single bytes and writes everything else as \uXXXX
                    # escapes; real text therefore comes back as pure ASCII,
                    # while a str holding UTF-8 byte values decodes to text
                    _decoded_encoded = url_input.encode('raw_unicode_escape').decode()
                    if is_ascii(_decoded_encoded):
                        # input: self.url_unicode
                        _url_candidate = url_input.encode('utf-8')
                    else:
                        # input: self.url_string
                        _url_candidate = _decoded_encoded
            elif isinstance(url_input, six.text_type):
                # `six.text_type` is `unicode` on Python 2
                _url_candidate = url_input.encode('utf-8')
            else:
                _url_candidate = url_input
            result = safe_url_string(_url_candidate)
            self.assertEqual(result, self.url_safe)

    def test_w3lib_to_unicode__fail3_expected(self):
        """Pass every input through `w3lib_to_unicode`.

        Raises `ValueError` when any result differs from `url_unicode`.
        """
        fails = []
        for (input_type, url_input) in self.inputs:
            result = w3lib_to_unicode(url_input)
            if result != self.url_unicode:
                fails.append((input_type, url_input, result))
        self._raise_on_fails("test_w3lib_to_unicode", fails)

    def test_w3lib_bypass_1__fail23_expected(self):
        """
        in this example:
        unicode b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
        string b'http://\xc3\xa2\xc2\x9e\xc2\xa1.ws/\xc3\xa2\xc2\x99\xc2\xa5'
        url_bytes http://➡.ws/♥
        """
        fails = []
        for (input_type, url_input) in self.inputs:
            _decoded = None
            if isinstance(url_input, six.text_type):
                _decoded = url_input.encode('utf-8')
            elif isinstance(url_input, six.binary_type):
                try:
                    _decoded = url_input.decode()
                except UnicodeDecodeError:  # Py2 catch
                    # keep the undecoded value when the default codec fails
                    _decoded = url_input
            if _decoded != self.url_unicode:
                fails.append((input_type, url_input, _decoded))
        self._raise_on_fails("test_w3lib_bypass_1", fails)

    def test_w3lib_bypass_2__fail23_expected(self):
        """
        in this example:
        unicode http://➡.ws/♥
        string http://➡.ws/♥
        url_bytes http://➡.ws/♥
        """
        # NOTE(review): the conversion below is identical to the one in
        # test_w3lib_bypass_1__fail23_expected; only the docstring differs.
        # Confirm whether a different bypass was intended here.
        fails = []
        for (input_type, url_input) in self.inputs:
            _decoded = None
            if isinstance(url_input, six.text_type):
                _decoded = url_input.encode('utf-8')
            elif isinstance(url_input, six.binary_type):
                try:
                    _decoded = url_input.decode()
                except UnicodeDecodeError:  # Py2 catch
                    # keep the undecoded value when the default codec fails
                    _decoded = url_input
            if _decoded != self.url_unicode:
                fails.append((input_type, url_input, _decoded))
        self._raise_on_fails("test_w3lib_bypass_2", fails)
# Run the whole suite when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment