Created
September 3, 2010 08:31
-
-
Save benui-dev/563610 to your computer and use it in GitHub Desktop.
MongoDB, Perl, UTF8: Trying to demonstrate the confusion when using encoding and MongoDB. Not clear what's going on yet.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I admit I'm not really sure what's going on here.
# UTF8 in Perl still confuses the hell out of me,
# but I'm not sure what MongoDB is doing or trying to do either.
# It seems that it's treating keys and values differently,
# namely by encoding values and not encoding keys.
# Can anyone suggest what I *should* be doing?
# I want to store UTF8 data and get back UTF8 data.
use strict;
use warnings;
use Data::Dumper;
use MongoDB;
use utf8;    # affects how literals in this source file are read, not runtime data

# Connect to the test server and select the database and collection that the
# rest of the script uses for the round trip.
my $conn = MongoDB::Connection->new(host => 'unixdeva11', port => 21337);
my $db   = $conn->get_database('foo');
my $coll = $db->get_collection('bar');
# Four copies of the same literal.  Each one is a six-character string whose
# characters are all below 0x100; taken as octets they are the UTF-8 encoding
# of U+8FDB U+53E3.
my $upgrade   = "\x{e8}\x{bf}\x{9b}\x{e5}\x{8f}\x{a3}";
my $downgrade = "\x{e8}\x{bf}\x{9b}\x{e5}\x{8f}\x{a3}";
my $encode    = "\x{e8}\x{bf}\x{9b}\x{e5}\x{8f}\x{a3}";
my $decode    = "\x{e8}\x{bf}\x{9b}\x{e5}\x{8f}\x{a3}";

# http://perldoc.perl.org/utf8.html
# upgrade/downgrade only change the internal representation: the string still
# has the same six characters afterwards and compares equal to the original.
utf8::upgrade( $upgrade );     # convert internal representation from native to utf8
utf8::downgrade( $downgrade ); # convert internal utf8 to octet in native

# encode/decode change the logical content of the string, in place.
utf8::encode( $encode ); # each character becomes its UTF-8 octets: 6 chars -> 12 octets
utf8::decode( $decode ); # the 6 octets are read as UTF-8: -> 2 chars, U+8FDB U+53E3
# Document to store: one sub-document per string variant, using the same
# string as both key and value so that the handling of keys and of values
# can be compared side by side after the round trip.
my %insert_data = (
    id         => 'foo',
    upgraded   => { $upgrade   => $upgrade },
    downgraded => { $downgrade => $downgrade },
    encoded    => { $encode    => $encode },
    #decoded => { $decode => $decode }, # This causes MongoDB to crash silently
);

# Dump what is about to be sent, for comparison with what comes back.
print Dumper(\%insert_data);
# Prints: | |
# $VAR1 = { | |
# 'downgraded' => { | |
# '进口' => '进口' | |
# }, | |
# 'encoded' => { | |
# '��' => '��' | |
# }, | |
# 'upgraded' => { | |
# "\x{e8}\x{bf}\x{9b}\x{e5}\x{8f}\x{a3}" => "\x{e8}\x{bf}\x{9b}\x{e5}\x{8f}\x{a3}" | |
# }, | |
# 'id' => 'foo' | |
# }; | |
# Upsert the document matched by id => 'foo', then read it straight back.
# The second argument is a fresh hash reference holding a shallow copy of
# %insert_data.
my $result = $coll->update(
    { id => 'foo' },
    { %insert_data },
    { upsert => 1 }    # create if non-exist, update if exist
);
my $data = $coll->find_one(
    { id => 'foo' }
);

# Look at the mess that comes back :(
print Dumper($data);
# Prints: | |
#$VAR1 = { | |
# '_id' => bless( { | |
# 'value' => '4c80707ce7ed288eb37deabf' | |
# }, 'MongoDB::OID' ), | |
# 'downgraded' => { | |
# '进口' => "\x{8fdb}\x{53e3}" | |
# }, | |
# 'encoded' => { | |
# '��' => "\x{e8}\x{bf}\x{9b}\x{e5}\x{8f}\x{a3}" | |
# }, | |
# 'upgraded' => { | |
# '进口' => "\x{e8}\x{bf}\x{9b}\x{e5}\x{8f}\x{a3}" | |
# }, | |
# 'id' => 'foo' | |
# }; | |
# So it looks like Upgraded is the one we want? | |
# The rest are all funged up | |
# Walk the 'upgraded' sub-document that came back and show each key/value
# pair twice: once as returned, and once after downgrading the value.
foreach my $key ( keys %{ $data->{upgraded} } ) {
    my $value = $data->{upgraded}->{$key};
    print "$key => $value\n";
    print Dumper($key);
    print Dumper($value);
    # 进口 => 进口
    # $VAR1 = '进口';
    # $VAR1 = "\x{e8}\x{bf}\x{9b}\x{e5}\x{8f}\x{a3}";

    # Have to downgrade the $value so it comes out OK.
    # Otherwise it's like double-encoded or something???
    # utf8::downgrade only switches the internal representation back to
    # single bytes; the string compares equal before and after.  The visible
    # difference is in the Dumper output, which goes from \x{..} escapes to
    # the raw bytes.  Note that utf8::downgrade dies if the string holds a
    # character above 0xFF, unless a true FAIL_OK second argument is passed.
    utf8::downgrade( $value );
    print "$key => $value\n";
    print Dumper($key);
    print Dumper($value);
    # 进口 => 进口
    # $VAR1 = '进口';
    # $VAR1 = '进口';
}
Wow, how did you come across this year-old Gist? Thanks for the feedback :D
I'll bear it in mind next time I'm in Perl land (this q. was when I was at my previous job).
Actually I was discussing this issue with Jay Shirley and he linked me here. Actually it seems that won't solve the whole problem, but at least there is a ticket submitted for it.
Thought you'd be amused to know, this 5 year old discussion on Mongo helped me solve a MySQL issue
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I think you need to add $MongoDB::BSON::utf8_flag_on = 1;