Last active
July 19, 2016 21:43
-
-
Save oconnor663/e9c878161e7a63517747 to your computer and use it in GitHub Desktop.
the alphabet of all Twitter-safe characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python3 | |
| import sys | |
| import unicodedata | |
| import urllib.request | |
# We want to use every possible code point we can. That means starting at 0 and
# going all the way up to 0x10ffff, the largest encodable value. Because
# Twitter does NFC Unicode normalization, we need to omit characters that don't
# have the NFC_Quick_Check=Yes property. We also need to omit characters that
# Twitter might strip, as well as the surrogate characters, which aren't legal
# to encode.
# The file DerivedNormalizationProps.txt defines all the code points with
# NFC_Quick_Check values of No or Maybe. Download it and collect those code
# points, so that we can exclude them from the Twitter alphabet.
derived_normalization_props_url = \
    "http://www.unicode.org/Public/8.0.0/ucd/DerivedNormalizationProps.txt"
# NOTE(review): the data file is pinned to Unicode 8.0.0, while the
# unicodedata module uses whatever Unicode version ships with the running
# Python -- confirm that the mismatch is acceptable.

# Read the whole body inside a `with` so the HTTP response is closed promptly
# instead of being left open until garbage collection.
with urllib.request.urlopen(derived_normalization_props_url) as response:
    props = response.read().decode("utf-8")

non_quick_check_code_points = set()
for line in props.splitlines():
    # Drop any trailing comment, then split the record into its
    # semicolon-separated fields. The first field is the code point (or range
    # of code points) and the second is the property name.
    fields = [field.strip() for field in line.partition('#')[0].split(';')]
    # Skip blank lines and records for unrelated properties. Comparing the
    # property field exactly (rather than searching the whole line for a
    # substring) cannot be fooled by a similarly named property.
    if len(fields) < 2 or fields[1] != 'NFC_QC':
        continue
    # Parse out the code point ("0340") or inclusive range of code points
    # ("0340..0341"); both are written in hexadecimal.
    points = [int(point, 16) for point in fields[0].split('..')]
    # A single code point is just a range whose first and last members are
    # the same, so one update() covers both cases.
    non_quick_check_code_points.update(range(points[0], points[-1] + 1))
# Unicode general categories that are kept out of the alphabet:
#   Cc - control characters       Zl - line separators
#   Cf - format characters        Zp - paragraph separators
#   Cs - surrogate characters     Zs - space separators
bad_unicode_categories = {"Cc", "Cf", "Cs", "Zl", "Zp", "Zs"}
# Iterate over all possible code points (0 through 0x10ffff) and keep the ones
# that are NFC_Quick_Check=Yes -- that is, absent from
# non_quick_check_code_points -- and whose general category is not one of the
# bad categories.
alphabet = (
    chr(code_point)
    for code_point in range(0x110000)
    if code_point not in non_quick_check_code_points
    and unicodedata.category(chr(code_point)) not in bad_unicode_categories
)
# Print the alphabet to stdout with a single write rather than one write call
# per character (over a million calls). No separator or trailing newline is
# added, so the emitted text is exactly the concatenated characters.
sys.stdout.write("".join(alphabet))
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I've checked this with Unicode 9.0.0, and I think there are no changes :)