Skip to content

Instantly share code, notes, and snippets.

@apokalyptik
Last active December 25, 2015 19:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save apokalyptik/7030421 to your computer and use it in GitHub Desktop.
Save apokalyptik/7030421 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
import re
import unicodedata
import codecs
import urllib2
# Choose how to count UTF-16 code units. If this interpreter reports a length
# of 2 for a single astral (non-BMP) character, its unicode strings already
# store surrogate pairs and the built-in len() gives the answer directly.
if 2 == len(u'\U0001f4a9'):
    length_ucs2 = len
else:
    def length_ucs2(string):
        """Return the number of UTF-16 code units needed to encode *string*.

        Characters outside the Basic Multilingual Plane count as two units
        (a surrogate pair); every other character counts as one.

        :param string: the unicode text to measure.
        :returns: the code-unit count as an integer.
        """
        # UTF-16LE emits exactly two bytes per code unit and no BOM, so the
        # byte count is always even; floor division keeps the result an int
        # even where "/" performs true division.
        return len(string.encode('UTF-16LE')) // 2
# Build one string containing every named character listed in the Unicode
# 5.2.0 NamesList.txt, then report its size measured three different ways.

# Lines of interest start with a hexadecimal digit (the code point column).
lm = re.compile("^[0-9a-fA-F]")

# To work offline, read a local copy instead: open("./NamesList.txt")
response = urllib2.urlopen('http://www.unicode.org/Public/5.2.0/ucd/NamesList.txt')
try:
    names_list = response.read()
finally:
    # Release the HTTP connection once the body has been read, even if the
    # read itself fails.
    response.close()

# Collect characters in a list and join once at the end; repeated string
# concatenation copies the accumulated text on every iteration.
chars = []
for line in names_list.split("\n"):
    if not lm.match(line):
        continue
    # Skip entries whose name column starts with "<" (e.g. "<control>");
    # those are placeholder labels rather than names lookup() can resolve.
    if "\t<" in line:
        continue
    # The second tab-separated field of the line is the character name.
    chars.append(unicodedata.lookup(line.strip().split("\t")[1]))
s = u"".join(chars)

print("length_ucs2: %d" % length_ucs2(s))
print("len: %d" % len(s))
print("bytes: %d" % len(bytearray(s, 'utf-8')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment