Skip to content

Instantly share code, notes, and snippets.

@lehrblogger
Last active December 17, 2015 20:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lehrblogger/5668256 to your computer and use it in GitHub Desktop.
Save lehrblogger/5668256 to your computer and use it in GitHub Desktop.
unicode troubles in ejabberd_xmlrpc
Hi,
I'm having trouble using unicode characters in roster item nicknames set
from Python using ejabberd_xmlrpc. Those nicknames work fine if I set them
from the command line using ejabberdctl add_rosteritem, but if I
use add_rosteritem via xmlrpc, then the command returns as if it
succeeded... but the roster item is somehow corrupt. ejabberd throws errors
if I try to view the data using get_roster or the web admin interface, and
the user with the corrupt roster is also unable to log in.
I've tested this pretty thoroughly and documented it in the comments here:
https://gist.github.com/lehrblogger/5668256/ - I'm happy to format a
version to send to the list, but I'm not sure it would be legible and I
wanted to keep this short. I'm using the 2.1.x branch of both
http://github.com/processone/ejabberd and
http://github.com/processone/ejabberd-contrib with a nearly-default
ejabberd.cfg file on a fresh VM.
I know *much* more about character encoding now than I did this afternoon,
but it's still possible I'm making a simple mistake somewhere. But, if not,
could there be a bug in the ejabberd_xmlrpc module? I've looked at the
source, but haven't had any success figuring out what to patch.
Thanks!
Best,
Steven
@lehrblogger
Copy link
Author

For example, let's take the character ╯. Python can handle this character just fine:

>>> u'╯'
u'\u256f'
>>> u'╯'.encode('utf-8')
'\xe2\x95\xaf'

@lehrblogger
Copy link
Author

And so can ejabberdctl:

$ sudo ejabberdctl add_rosteritem admin localhost ectl_user localhost ╯ ectl_group both
$ sudo ejabberdctl get_roster admin localhost
ectl_user@localhost ╯ both    none    ectl_group

If I look in the ejabberd.log, I can see that the character was represented as [226, 149, 175]:

=INFO REPORT==== 2013-05-29 05:55:34 ===
D(<0.878.0>:ejabberd_commands:314) : Executing command mod_admin_extra:add_rosteritem with Args=[
                                                                                    "admin",
                                                                                    "localhost",
                                                                                    "ectl_user",
                                                                                    "localhost",
                                                                                    [226,
                                                                                     149,
                                                                                     175],
                                                                                    "ectl_group",
                                                                                    "both"]

@lehrblogger
Copy link
Author

So far, so good. Before trying to do the same thing with Python via xmlrpc, let's make sure the non-unicode case works:

>>> import xmlrpclib
>>> def xmlrpc_command(command, data):
...     xmlrpc_server = xmlrpclib.ServerProxy('http://localhost:4560', encoding='utf-8')
...     fn = getattr(xmlrpc_server, command)
...     return fn(data)
... 
>>> def test_nick(nick):
...     return xmlrpc_command('add_rosteritem', {
...         'localuser': 'admin',
...         'localserver': 'localhost',
...         'user': 'xmlrpc_user',
...         'server': 'localhost',
...         'group': 'xmlrpc_group',
...         'nick': nick,
...         'subs': 'both'
...     })
...     
... 
>>> test_nick('no_unicode')
{'res': 0}
$ sudo ejabberdctl get_roster admin localhost
ectl_user@localhost ╯ both    none    ectl_group
xmlrpc_user@localhost   no_unicode  both    none    xmlrpc_group
=INFO REPORT==== 2013-05-29 06:02:14 ===
D(<0.871.0>:ejabberd_commands:314) : Executing command mod_admin_extra:add_rosteritem with Args=[
                                                                                  "admin",
                                                                                  "localhost",
                                                                                  "xmlrpc_user",
                                                                                  "localhost",
                                                                                  "no_unicode",
                                                                                  "xmlrpc_group",
                                                                                  "both"]

@lehrblogger
Copy link
Author

That works, so let's try the unicode character:

>>> test_nick(u'╯')
{'res': 0}
=INFO REPORT==== 2013-05-29 06:11:44 ===
D(<0.893.0>:ejabberd_commands:314) : Executing command mod_admin_extra:add_rosteritem with Args=[
                                                                                  "admin",
                                                                                  "localhost",
                                                                                  "xmlrpc_user",
                                                                                  "localhost",
                                                                                  [9583],
                                                                                  "xmlrpc_group",
                                                                                  "both"]
$ sudo ejabberdctl get_roster admin localhost 
Problem 'error badarg' occurred executing the command.
Stacktrace: [{io_lib,format,["~s",[[9583]]]},
             {ejabberd_ctl,'-format_result/2-fun-1-',1},
             {lists,map,2},
             {ejabberd_ctl,format_result,2},
             {ejabberd_ctl,'-format_result/2-fun-0-',2},
             {lists,map,2},
             {lists,map,2},
             {ejabberd_ctl,format_result,2}]

Uh oh, something is wrong. What is that [9583]? Wasn't our unicode character [226, 149, 175]?

Maybe that should be {io_lib,format,["~ts",[[9583]]]} (note the t)? But I think it's just failing to read the data here, and it's getting written incorrectly somewhere else.

@lehrblogger
Copy link
Author

If I run test_nick() with some non-unicode character, then get_roster works again. None of the following unicode versions work, though:

test_nick('╯')
test_nick(u'╯'.encode('utf-8'))
test_nick(unicode(u'╯'))

@lehrblogger
Copy link
Author

So, let's try something simpler, like the echothis command:

>>> xmlrpc_command('echothis', 'non_unicode')
'non_unicode'
>>> xmlrpc_command('echothis', u'╯')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<stdin>", line 4, in xmlrpc_command
  File "/usr/lib/python2.7/xmlrpclib.py", line 1224, in __call__
    return self.__send(self.__name, args)
  File "/usr/lib/python2.7/xmlrpclib.py", line 1578, in __request
    verbose=self.__verbose
  File "/usr/lib/python2.7/xmlrpclib.py", line 1264, in request
    return self.single_request(host, handler, request_body, verbose)
  File "/usr/lib/python2.7/xmlrpclib.py", line 1312, in single_request
    response.msg,
xmlrpclib.ProtocolError: <ProtocolError for localhost:4560/RPC2: 500 Internal Server Error>

With the ejabberd.log error:

=ERROR REPORT==== 2013-05-29 07:04:00 ===
{xmlrpc_http,183,
             {xmlrpc_encode,payload,{response,[[9583]]},{bad_value,[9583]}}}

hmm, that's no good either... how do I fix this?

@lehrblogger
Copy link
Author

Thanks for reading this far! Let me know if I can provide any more info.

In the meantime, (╯°□°)╯︵ ┻━┻

:)

@arcusfelis
Copy link

{io_lib,format,["~ts",[[9583]]]}
That is correct. ~s is for latin1 only.

[9583] is a list of unicode code points.
[226,149,175] is a list of bytes.
They are different forms of the same thing.

Wasn't our unicode character [226, 149, 175]?

You can see:

unicode:characters_to_binary([9583]).
<<226,149,175>>

@lehrblogger
Copy link
Author

@arcusfelis, thanks for explaining that! Otherwise I wouldn't have known about the unicode code points.

I've managed to trace the root of the issue to the list of decode functions in https://github.com/etnt/xmlrpc/blob/master/src/xmlrpc_decode.erl#L132-L160 and encode functions in https://github.com/etnt/xmlrpc/blob/master/src/xmlrpc_encode.erl#L83-L115. It seems there is not one that properly handles unicode code points, which is why I get a bad_value error from the ╯ above.

I'm trying to figure out what decode and encode functions I can add to automatically convert the unicode code points into a list of bytes. I'll post again when I figure it out, but other suggestions/pointers are welcome.

@lehrblogger
Copy link
Author

Fixed! dashdash-chat/ejabberd@ffbfb59

┬─┬◡ノ(° -°ノ)

@lehrblogger
Copy link
Author

Other possible solutions, for future reference:

$ git remote -v
origin  git@github.com:lehrblogger/ejabberd-modules.git (fetch)
origin  git@github.com:lehrblogger/ejabberd-modules.git (push)
$ git checkout badlop_patch
Switched to branch 'badlop_patch'
$ git log -n 1 -p
commit 8628e984542d23ebb58162840429466ddc00e64e
Author: Steven Lehrburger <>
Date:   Fri May 31 11:11:25 2013 -0400

    Add ejabberd_xmlrpc unicode patch from badlop

diff --git a/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl b/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
index 0e958a1..e7c1242 100644
--- a/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
+++ b/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
@@ -401,7 +401,10 @@ format_arg(Arg, integer)
     Arg;
 format_arg(Arg, string)
   when is_list(Arg) ->
-    Arg.
+    case io_lib:printable_unicode_list(Arg) of
+      true -> binary_to_list(unicode:characters_to_binary(Arg));
+      false -> Arg
+    end.


 %% -----------------------------
$ git checkout master
Switched to branch 'master'
$ git log -n 1 -p
commit 378d5051dd22800fc8e6d83299fa1a83daae7d41
Author: Steven Lehrburger <>
Date:   Wed May 29 11:49:59 2013 -0400

    Handle unicode in ejabberd_xmlrpc arguments

    https://gist.github.com/lehrblogger/5668256

    I think the underlying problem is actually in an erlang library, since the echothis command still can't handle unicode, but I think this is sufficient for me for now.

diff --git a/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl b/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
index 0e958a1..3bd9dd3 100644
--- a/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
+++ b/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
@@ -401,7 +401,12 @@ format_arg(Arg, integer)
     Arg;
 format_arg(Arg, string)
   when is_list(Arg) ->
-    Arg.
+    case {io_lib:printable_list(Arg), io_lib:printable_unicode_list(Arg)} of
+    {false, true} ->
+        binary_to_list(unicode:characters_to_binary(Arg));
+    {_, _} ->
+        Arg
+    end.


 %% -----------------------------

dashdash-chat/ejabberd-modules@378d505
dashdash-chat/ejabberd-modules@8628e98

@lehrblogger
Copy link
Author

And my attempt to modify the xmlrpc library itself, in case I don't keep my fork of it:

$ git remote -v
origin  git@github.com:lehrblogger/xmlrpc.git (fetch)
origin  git@github.com:lehrblogger/xmlrpc.git (push)
$ git log -p -n 2
commit d9b027dd7df4230cd840dbde8cdeec5b959b3a9a
Author: Steven Lehrburger <>
Date:   Thu May 30 02:28:39 2013 -0400

    Fixing errors with unicode

diff --git a/src/xmlrpc_encode.erl b/src/xmlrpc_encode.erl
index 6535545..df4a41c 100644
--- a/src/xmlrpc_encode.erl
+++ b/src/xmlrpc_encode.erl
@@ -109,14 +109,14 @@ encode({base64, Base64}) ->
 %    end;
     ["<base64>", Base64, "</base64>"];
 encode(Unicode) when is_list(Unicode) ->
-    case io_lib:printable_unicode_list(Unicode) of
-        true -> 
-            Binary = binary_to_list(unicode:characters_to_binary(Unicode)),
-            case xmlrpc_util:is_string(Binary) of         
-                yes -> ["<string>", escape_string(Binary), "</string>"];
-                no -> {error, {bad_unicode_value, Binary}}   
-            end;
-        false ->  {error, {bad_unicode_value, Unicode}}
+    Unicode = case io_lib:printable_unicode_list(Unicode) of
+        true ->
+            binary_to_list(unicode:characters_to_binary(Unicode));
+        false -> Unicode
+    end,
+    case xmlrpc_util:is_string(Unicode) of       
+        yes -> ["<string>", escape_string(Unicode), "</string>"];
+        no -> {error, {binary_bad_unicode_value, Unicode}}   
     end;
 encode(Value) ->
     case xmlrpc_util:is_string(Value) of

commit 30150e712d48e8063af7db63cba48fd0864a51cb
Author: Steven Lehrburger <>
Date:   Thu May 30 01:22:44 2013 -0400

    Properly encode lists of unicode code points

diff --git a/src/xmlrpc_encode.erl b/src/xmlrpc_encode.erl
index 636cf15..6535545 100644
--- a/src/xmlrpc_encode.erl
+++ b/src/xmlrpc_encode.erl
@@ -108,6 +108,16 @@ encode({base64, Base64}) ->
 %      no -> {error, {bad_base64, Base64}}
 %    end;
     ["<base64>", Base64, "</base64>"];
+encode(Unicode) when is_list(Unicode) ->
+    case io_lib:printable_unicode_list(Unicode) of
+        true -> 
+            Binary = binary_to_list(unicode:characters_to_binary(Unicode)),
+            case xmlrpc_util:is_string(Binary) of         
+                yes -> ["<string>", escape_string(Binary), "</string>"];
+                no -> {error, {bad_unicode_value, Binary}}   
+            end;
+        false ->  {error, {bad_unicode_value, Unicode}}
+    end;
 encode(Value) ->
     case xmlrpc_util:is_string(Value) of
        yes -> ["<string>", escape_string(Value), "</string>"];

dashdash-chat/xmlrpc@d9b027d
dashdash-chat/xmlrpc@30150e7

@lehrblogger
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment