Skip to content

Instantly share code, notes, and snippets.

@mattiasostmar
Last active January 8, 2016 21:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mattiasostmar/020ae01f35c976dcbead to your computer and use it in GitHub Desktop.
Save mattiasostmar/020ae01f35c976dcbead to your computer and use it in GitHub Desktop.
JSON-STAGGER UnicodeDecodeError

Working:

import requests
text = "Fördomen har alltid sin rot i vardagslivet - Olof Palme 🙈🙈🙌🙆👪👏yüá🎧ÖÅÄê"
r = requests.post("http://json-tagger.herokuapp.com/tag",data=dict(data=text))
r.json()

{'entities': [{'token_ids': ['tok:0:8', 'tok:0:9', 'tok:0:10'], 'word_form': 'Olof Palme 🙈🙈🙌🙆👪👏yüá🎧ÖÅÄê'}], 'sentences': [[{'morph_feat': 'UTR|SIN|DEF|NOM', 'pos_tag': 'NN', 'token_id': 'tok:0:0', 'word_form': 'Fördomen', 'word_index': '1'}, {'morph_feat': 'PRS|AKT', 'pos_tag': 'VB', 'token_id': 'tok:0:1', 'word_form': 'har', 'word_index': '2'}, {'morph_feat': None, 'pos_tag': 'AB', 'token_id': 'tok:0:2', 'word_form': 'alltid', 'word_index': '3'}, {'morph_feat': 'UTR|SIN|DEF', 'pos_tag': 'PS', 'token_id': 'tok:0:3', 'word_form': 'sin', 'word_index': '4'}, {'morph_feat': 'UTR|SIN|IND|NOM', 'pos_tag': 'NN', 'token_id': 'tok:0:4', 'word_form': 'rot', 'word_index': '5'}, {'morph_feat': None, 'pos_tag': 'PP', 'token_id': 'tok:0:5', 'word_form': 'i', 'word_index': '6'}, {'morph_feat': 'NEU|SIN|DEF|NOM', 'pos_tag': 'NN', 'token_id': 'tok:0:6', 'word_form': 'vardagslivet', 'word_index': '7'}, {'morph_feat': None, 'pos_tag': 'MID', 'token_id': 'tok:0:7', 'word_form': '-', 'word_index': '8'}, {'morph_feat': 'NOM', 'pos_tag': 'PM', 'token_id': 'tok:0:8', 'word_form': 'Olof', 'word_index': '9'}, {'morph_feat': 'NOM', 'pos_tag': 'PM', 'token_id': 'tok:0:9', 'word_form': 'Palme', 'word_index': '10'}, {'morph_feat': 'NOM', 'pos_tag': 'PM', 'token_id': 'tok:0:10', 'word_form': '🙈🙈🙌🙆👪👏yüá🎧ÖÅÄê', 'word_index': '11'}]]}

Not working:

import requests
data = "Fördomen har alltid sin rot i vardagslivet - Olof Palme 🎉🙈🙌🙆👪👏yüá🎧ÖÅÄê".encode("utf-8")
r = requests.post("http://json-tagger.herokuapp.com/tag", data=data)

UnicodeDecodeError Traceback (most recent call last) in () 1 import requests 2 data = "Fördomen har alltid sin rot i vardagslivet - Olof Palme 🎉🙈🙌🙆👪👏yüá🎧ÖÅÄê".encode("utf-8") ----> 3 r = requests.post("http://json-tagger.herokuapp.com/tag", data=data)

//anaconda/lib/python3.4/site-packages/requests/api.py in post(url, data, json, **kwargs) 105 """ 106 --> 107 return request('post', url, data=data, json=json, **kwargs) 108 109

//anaconda/lib/python3.4/site-packages/requests/api.py in request(method, url, **kwargs) 51 # cases, and look like a memory leak in others. 52 with sessions.Session() as session: ---> 53 return session.request(method=method, url=url, **kwargs) 54 55

//anaconda/lib/python3.4/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 452 hooks = hooks, 453 ) --> 454 prep = self.prepare_request(req) 455 456 proxies = proxies or {}

//anaconda/lib/python3.4/site-packages/requests/sessions.py in prepare_request(self, request) 386 auth=merge_setting(auth, self.auth), 387 cookies=merged_cookies, --> 388 hooks=merge_hooks(request.hooks, self.hooks), 389 ) 390 return p

//anaconda/lib/python3.4/site-packages/requests/models.py in prepare(self, method, url, headers, files, data, params, auth, cookies, hooks, json) 294 self.prepare_headers(headers) 295 self.prepare_cookies(cookies) --> 296 self.prepare_body(data, files, json) 297 self.prepare_auth(auth, url) 298

//anaconda/lib/python3.4/site-packages/requests/models.py in prepare_body(self, data, files, json) 445 else: 446 if data: --> 447 body = self._encode_params(data) 448 if isinstance(data, basestring) or hasattr(data, 'read'): 449 content_type = None

//anaconda/lib/python3.4/site-packages/requests/models.py in _encode_params(data) 82 83 if isinstance(data, (str, bytes)): ---> 84 return to_native_string(data) 85 elif hasattr(data, 'read'): 86 return data

//anaconda/lib/python3.4/site-packages/requests/utils.py in to_native_string(string, encoding) 698 out = string.encode(encoding) 699 else: --> 700 out = string.decode(encoding) 701 702 return out

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 1: ordinal not in range(128)

"Fördomen har alltid sin rot i vardagslivet - Olof Palme 🎉🙈🙌🙆👪👏yüá🎧ÖÅÄê".encode("utf-8")

UnicodeDecodeError Traceback (most recent call last) in () 1 import requests ----> 2 r = requests.post("http://json-tagger.herokuapp.com/tag", data="Fördomen har alltid sin rot i vardagslivet - Olof Palme 🎉🙈🙌🙆👪👏yüá🎧ÖÅÄê".encode("utf-8"))

//anaconda/lib/python3.4/site-packages/requests/api.py in post(url, data, json, **kwargs) 105 """ 106 --> 107 return request('post', url, data=data, json=json, **kwargs) 108 109

//anaconda/lib/python3.4/site-packages/requests/api.py in request(method, url, **kwargs) 51 # cases, and look like a memory leak in others. 52 with sessions.Session() as session: ---> 53 return session.request(method=method, url=url, **kwargs) 54 55

//anaconda/lib/python3.4/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 452 hooks = hooks, 453 ) --> 454 prep = self.prepare_request(req) 455 456 proxies = proxies or {}

//anaconda/lib/python3.4/site-packages/requests/sessions.py in prepare_request(self, request) 386 auth=merge_setting(auth, self.auth), 387 cookies=merged_cookies, --> 388 hooks=merge_hooks(request.hooks, self.hooks), 389 ) 390 return p

//anaconda/lib/python3.4/site-packages/requests/models.py in prepare(self, method, url, headers, files, data, params, auth, cookies, hooks, json) 294 self.prepare_headers(headers) 295 self.prepare_cookies(cookies) --> 296 self.prepare_body(data, files, json) 297 self.prepare_auth(auth, url) 298

//anaconda/lib/python3.4/site-packages/requests/models.py in prepare_body(self, data, files, json) 445 else: 446 if data: --> 447 body = self._encode_params(data) 448 if isinstance(data, basestring) or hasattr(data, 'read'): 449 content_type = None

//anaconda/lib/python3.4/site-packages/requests/models.py in _encode_params(data) 82 83 if isinstance(data, (str, bytes)): ---> 84 return to_native_string(data) 85 elif hasattr(data, 'read'): 86 return data

//anaconda/lib/python3.4/site-packages/requests/utils.py in to_native_string(string, encoding) 698 out = string.encode(encoding) 699 else: --> 700 out = string.decode(encoding) 701 702 return out

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 1: ordinal not in range(128)

mosmbp:research mos$ curl http://json-tagger.herokuapp.com/tag -d "Fördomen har alltid sin rot i vardagslivet - Olof Palme 🎉🙈🙌🙆👪👏yüá🎧ÖÅÄê"

{"sentences":[[{"word_index":"1","word_form":"F\u00f6rdomen","pos_tag":"NN","morph_feat":"UTR|SIN|DEF|NOM","token_id":"tok:0:0"},{"word_index":"2","word_form":"har","pos_tag":"VB","morph_feat":"PRS|AKT","token_id":"tok:0:1"},{"word_index":"3","word_form":"alltid","pos_tag":"AB","morph_feat":null,"token_id":"tok:0:2"},{"word_index":"4","word_form":"sin","pos_tag":"PS","morph_feat":"UTR|SIN|DEF","token_id":"tok:0:3"},{"word_index":"5","word_form":"rot","pos_tag":"NN","morph_feat":"UTR|SIN|IND|NOM","token_id":"tok:0:4"},{"word_index":"6","word_form":"i","pos_tag":"PP","morph_feat":null,"token_id":"tok:0:5"},{"word_index":"7","word_form":"vardagslivet","pos_tag":"NN","morph_feat":"NEU|SIN|DEF|NOM","token_id":"tok:0:6"},{"word_index":"8","word_form":"-","pos_tag":"MID","morph_feat":null,"token_id":"tok:0:7"},{"word_index":"9","word_form":"Olof","pos_tag":"PM","morph_feat":"NOM","token_id":"tok:0:8"},{"word_index":"10","word_form":"Palme","pos_tag":"PM","morph_feat":"NOM","token_id":"tok:0:9"},{"word_index":"11","word_form":"\ud83c\udf89\ud83d\ude48\ud83d\ude4c\ud83d\ude46\ud83d\udc6a\ud83d\udc4fy\u00fc\u00e1\ud83c\udfa7\u00d6\u00c5\u00c4\u00ea","pos_tag":"PM","morph_feat":"NOM","token_id":"tok:0:10"}]],"entities":[{"word_form":"Olof Palme \ud83c\udf89\ud83d\ude48\ud83d\ude4c\ud83d\ude46\ud83d\udc6a\ud83d\udc4fy\u00fc\u00e1\ud83c\udfa7\u00d6\u00c5\u00c4\u00ea","token_ids":["tok:0:8","tok:0:9","tok:0:10"]}]}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment