Instantly share code, notes, and snippets.

Embed
What would you like to do?
the alphabet of all Twitter-safe characters
#! /usr/bin/env python3
import sys
import unicodedata
import urllib.request
# We want to use every possible code point we can. That means starting at 0 and
# going all the way up to 0x10ffff, the largest encodable value. Because
# Twitter does NFC Unicode normalization, we need to omit characters that don't
# have NFC_Quick_Check=Yes property. We also need to omit characters that
# Twitter might strip, as well as the surrogate characters, which aren't legal
# to encode.

# The file DerivedNormalizationProps.txt defines all the code points with
# NFC_Quick_Check values of No or Maybe.
derived_normalization_props_url = \
    "http://www.unicode.org/Public/8.0.0/ucd/DerivedNormalizationProps.txt"

# General categories that are always excluded from the alphabet.
# NOTE(review): unicodedata reports categories for the Unicode version bundled
# with the running interpreter, which may differ from the 8.0.0 data file
# fetched above -- confirm that mismatch is acceptable.
bad_unicode_categories = {
    "Cc",  # control characters
    "Cf",  # format characters
    "Cs",  # surrogate characters
    "Zl",  # line separators
    "Zp",  # paragraph separators
    "Zs",  # space separators
}


def parse_non_quick_check_code_points(props):
    """Return the set of code points that have an explicit NFC_QC entry.

    *props* is the text of DerivedNormalizationProps.txt. Each data line has
    the form ``<code point or range> ; <property> ; <value> # comment``. Every
    code point (or inclusive ``first..last`` range) whose property field is
    exactly ``NFC_QC`` is collected, whatever its value.
    """
    code_points = set()
    for line in props.splitlines():
        # Strip comments.
        data = line.partition('#')[0]
        fields = [field.strip() for field in data.split(';')]
        # Skip blank lines and lines describing other properties. Comparing
        # the property field exactly avoids matching unrelated lines that
        # merely contain the text "NFC_QC" somewhere.
        if len(fields) < 2 or fields[1] != 'NFC_QC':
            continue
        # Parse out the code point or range of code points.
        first, _, last = fields[0].partition('..')
        start = int(first, 16)
        end = int(last, 16) if last else start
        # Ranges in the file are inclusive.
        code_points.update(range(start, end + 1))
    return code_points


def is_twitter_safe(code_point, non_quick_check_code_points):
    """Return True if *code_point* belongs in the Twitter alphabet.

    A code point is kept when it is not in *non_quick_check_code_points* and
    its general category is not one of ``bad_unicode_categories``.
    """
    if code_point in non_quick_check_code_points:
        return False
    category = unicodedata.category(chr(code_point))
    return category not in bad_unicode_categories


def main():
    """Download the normalization data and print the alphabet to stdout."""
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(derived_normalization_props_url) as response:
        props = response.read().decode()
    non_quick_check_code_points = parse_non_quick_check_code_points(props)
    # Iterate over all possible characters, and if they're NFC_Quick_Check=Yes
    # and not in one of the bad categories, print them to stdout. The output
    # is assembled first so it goes out in a single write.
    alphabet = "".join(
        chr(i)
        for i in range(0x110000)
        if is_twitter_safe(i, non_quick_check_code_points)
    )
    sys.stdout.write(alphabet)


if __name__ == "__main__":
    main()
This file has been truncated, but you can view the full file.
@oconnor663

This comment has been minimized.

Owner

oconnor663 commented Jul 19, 2016

I've checked this with Unicode 9.0.0, and I think there are no changes :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment