-
-
Save lehrblogger/5668256 to your computer and use it in GitHub Desktop.
Hi, | |
I'm having trouble using unicode characters in roster item nicknames set | |
from Python using ejabberd_xmlrpc. Those nicknames work fine if I set them | |
from the command line using ejabberdctl add_rosteritem, but if I | |
use add_rosteritem via xmlrpc, then the command returns as if it | |
succeeded... but the roster item is somehow corrupt. ejabberd throws errors | |
if I try to view the data using get_roster or the web admin interface, and | |
the user with the corrupt roster is also unable to log in. | |
I've tested this pretty thoroughly and documented it in the comments here: | |
https://gist.github.com/lehrblogger/5668256/ - I'm happy to format a | |
version to send to the list, but I'm not sure it would be legible and I | |
wanted to keep this short. I'm using the 2.1.x branch of both | |
http://github.com/processone/ejabberd and | |
http://github.com/processone/ejabberd-contrib with a nearly-default | |
ejabberd.cfg file on a fresh VM. | |
I know *much* more about character encoding now than I did this afternoon, | |
but it's still possible I'm making a simple mistake somewhere. But, if not, | |
could there be a bug in the ejabberd_xmlrpc module? I've looked at the | |
source, but haven't had any success figuring out what to patch. | |
Thanks! | |
Best, | |
Steven |
@arcusfelis, thanks for explaining that! Otherwise I wouldn't have known about the unicode code points.
I've managed to trace the root of the issue to the list of decode
functions in https://github.com/etnt/xmlrpc/blob/master/src/xmlrpc_decode.erl#L132-L160 and encode
functions in https://github.com/etnt/xmlrpc/blob/master/src/xmlrpc_encode.erl#L83-L115. It seems there is not one that properly handles unicode code points, which is why I get a bad_value error from the ╯above.
I'm trying to figure out what decode
and encode
functions I can add to automatically convert the unicode code points into a list of bytes. I'll post again when I figure it out, but other suggestions/pointers are welcome.
Fixed! dashdash-chat/ejabberd@ffbfb59
┬─┬◡ノ(° -°ノ)
Another possible solution, for future reference:
$ git remote -v
origin git@github.com:lehrblogger/ejabberd-modules.git (fetch)
origin git@github.com:lehrblogger/ejabberd-modules.git (push)
$ git checkout badlop_patch
Switched to branch 'badlop_patch'
$ git log -n 1 -p
commit 8628e984542d23ebb58162840429466ddc00e64e
Author: Steven Lehrburger <>
Date: Fri May 31 11:11:25 2013 -0400
Add ejabberd_xmlrpc unicode patch from badlop
diff --git a/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl b/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
index 0e958a1..e7c1242 100644
--- a/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
+++ b/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
@@ -401,7 +401,10 @@ format_arg(Arg, integer)
Arg;
format_arg(Arg, string)
when is_list(Arg) ->
- Arg.
+ case io_lib:printable_unicode_list(Arg) of
+ true -> binary_to_list(unicode:characters_to_binary(Arg));
+ false -> Arg
+ end.
%% -----------------------------
$ git checkout master
Switched to branch 'master'
$ git log -n 1 -p
commit 378d5051dd22800fc8e6d83299fa1a83daae7d41
Author: Steven Lehrburger <>
Date: Wed May 29 11:49:59 2013 -0400
Handle unicode in ejabberd_xmlrpc arguments
https://gist.github.com/lehrblogger/5668256
I think the underlying problem is actually in an erlang library, since the echothis command still can't handle unicode, but I think this is sufficient for me for now.
diff --git a/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl b/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
index 0e958a1..3bd9dd3 100644
--- a/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
+++ b/ejabberd_xmlrpc/trunk/src/ejabberd_xmlrpc.erl
@@ -401,7 +401,12 @@ format_arg(Arg, integer)
Arg;
format_arg(Arg, string)
when is_list(Arg) ->
- Arg.
+ case {io_lib:printable_list(Arg), io_lib:printable_unicode_list(Arg)} of
+ {false, true} ->
+ binary_to_list(unicode:characters_to_binary(Arg));
+ {_, _} ->
+ Arg
+ end.
%% -----------------------------
dashdash-chat/ejabberd-modules@378d505
dashdash-chat/ejabberd-modules@8628e98
And my attempt to modify the xmlrpc library itself, in case I don't keep my fork of it:
$ git remote -v
origin git@github.com:lehrblogger/xmlrpc.git (fetch)
origin git@github.com:lehrblogger/xmlrpc.git (push)
$ git log -p -n 2
commit d9b027dd7df4230cd840dbde8cdeec5b959b3a9a
Author: Steven Lehrburger <>
Date: Thu May 30 02:28:39 2013 -0400
Fixing errors with unicode
diff --git a/src/xmlrpc_encode.erl b/src/xmlrpc_encode.erl
index 6535545..df4a41c 100644
--- a/src/xmlrpc_encode.erl
+++ b/src/xmlrpc_encode.erl
@@ -109,14 +109,14 @@ encode({base64, Base64}) ->
% end;
["<base64>", Base64, "</base64>"];
encode(Unicode) when is_list(Unicode) ->
- case io_lib:printable_unicode_list(Unicode) of
- true ->
- Binary = binary_to_list(unicode:characters_to_binary(Unicode)),
- case xmlrpc_util:is_string(Binary) of
- yes -> ["<string>", escape_string(Binary), "</string>"];
- no -> {error, {bad_unicode_value, Binary}}
- end;
- false -> {error, {bad_unicode_value, Unicode}}
+ Unicode = case io_lib:printable_unicode_list(Unicode) of
+ true ->
+ binary_to_list(unicode:characters_to_binary(Unicode));
+ false -> Unicode
+ end,
+ case xmlrpc_util:is_string(Unicode) of
+ yes -> ["<string>", escape_string(Unicode), "</string>"];
+ no -> {error, {binary_bad_unicode_value, Unicode}}
end;
encode(Value) ->
case xmlrpc_util:is_string(Value) of
commit 30150e712d48e8063af7db63cba48fd0864a51cb
Author: Steven Lehrburger <>
Date: Thu May 30 01:22:44 2013 -0400
Properly encode lists of unicode code points
diff --git a/src/xmlrpc_encode.erl b/src/xmlrpc_encode.erl
index 636cf15..6535545 100644
--- a/src/xmlrpc_encode.erl
+++ b/src/xmlrpc_encode.erl
@@ -108,6 +108,16 @@ encode({base64, Base64}) ->
% no -> {error, {bad_base64, Base64}}
% end;
["<base64>", Base64, "</base64>"];
+encode(Unicode) when is_list(Unicode) ->
+ case io_lib:printable_unicode_list(Unicode) of
+ true ->
+ Binary = binary_to_list(unicode:characters_to_binary(Unicode)),
+ case xmlrpc_util:is_string(Binary) of
+ yes -> ["<string>", escape_string(Binary), "</string>"];
+ no -> {error, {bad_unicode_value, Binary}}
+ end;
+ false -> {error, {bad_unicode_value, Unicode}}
+ end;
encode(Value) ->
case xmlrpc_util:is_string(Value) of
yes -> ["<string>", escape_string(Value), "</string>"];
And the conversations with badlop in the ejabberd room about the fix:
http://chatlogs.jabber.ru/ejabberd@conference.jabber.ru/2013/05/30.html
http://chatlogs.jabber.ru/ejabberd@conference.jabber.ru/2013/05/31.html
{io_lib,format,["~ts",[[9583]]]}
That is correct.
~s is for latin1 only.
[9583] is a list of unicode code points.
[226,149,175] is a list of bytes.
They are different forms of the same thing.
You can see: