Last active
August 29, 2015 14:06
-
-
Save mtholder/ac58ab1b3c6a962b9bdc to your computer and use it in GitHub Desktop.
Script to convert OTT names or uniqunames to newick labels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# vim: set fileencoding=utf-8 : | |
import sys | |
import os | |
import re | |
u''' | |
Unicode char lists found by running against ott2.8 | |
Using the names column in taxonomy.tsv required allowing the | |
following unicode characters: | |
æ denoted: u'\xe6' | |
Œ denoted: u'\u0152' | |
Æ denoted: u'\xc6' | |
œ denoted: u'\u0153' | |
α denoted: u'\u03b1' | |
γ denoted: u'\u03b3' | |
β denoted: u'\u03b2' | |
ε denoted: u'\u03b5' | |
δ denoted: u'\u03b4' | |
× denoted: u'\xd7' | |
ø denoted: u'\xf8' | |
θ denoted: u'\u03b8' | |
λ denoted: u'\u03bb' | |
μ denoted: u'\u03bc' | |
ß denoted: u'\xdf' | |
and the following punctuation characters: | |
',.;:_+*=<>/?#&%[](){} | |
The uniqnames field of taxonomy.tsv did not require the
addition of any more characters.
The uniqnames column of the synonyms.tsv file required the additional | |
characters: | |
η denoted: u'\u03b7' | |
š denoted: u'\u0161' | |
Ä denoted: u'\xc4' | |
Ö denoted: u'\xd6' | |
á denoted: u'\xe1' | |
Š denoted: u'\u0160' | |
ã denoted: u'\xe3' | |
å denoted: u'\xe5' | |
ä denoted: u'\xe4' | |
ç denoted: u'\xe7' | |
é denoted: u'\xe9' | |
è denoted: u'\xe8' | |
ë denoted: u'\xeb' | |
ê denoted: u'\xea' | |
í denoted: u'\xed' | |
ì denoted: u'\xec' | |
ï denoted: u'\xef' | |
î denoted: u'\xee' | |
ñ denoted: u'\xf1' | |
ó denoted: u'\xf3' | |
ô denoted: u'\xf4' | |
ö denoted: u'\xf6' | |
ú denoted: u'\xfa' | |
ü denoted: u'\xfc' | |
ÿ denoted: u'\xff' | |
ž denoted: u'\u017e' | |
à denoted: u'\xe0' | |
and the punctuation: | |
`"@^ | |
''' | |
# Non-ASCII characters that had to be allowed for the names column of
# taxonomy.tsv (found by running against ott2.8).
_NAMES_UNICODE_ALLOWED = u'\xe6\u0152\xc6\u0153\u03b1\u03b3\u03b2\u03b5\u03b4\xd7\xf8\u03b8\u03bb\u03bc\xdf'
# Additional non-ASCII characters required by the uniqnames column of
# synonyms.tsv.
_SYN_UNIQUE_UNICODE_ALLOWED = u'\u03b7\u0161\xc4\xd6\xe1\u0160\xe3\xe5\xe4\xe7\xe9\xe8\xeb\xea\xed\xec\xef\xee\xf1\xf3\xf4\xf6\xfa\xfc\xff\u017e\xe0'
_UNICODE_ALLOWED = _NAMES_UNICODE_ALLOWED + _SYN_UNIQUE_UNICODE_ALLOWED
_SCRIPT_NAME = os.path.split(sys.argv[0])[-1]
# ASCII punctuation accepted in a name.  This text is spliced into a regex
# character class, so the square brackets carry an explicit backslash.
_PUNC_STR = '\'"`,.;:_+*=<>/?@#%^&\\[\\](){}'
# Body of the character class of accepted characters.  The leading "-" is
# first in the class so that it is taken literally.
_ALLOWED_STR = r'- 0-9a-zA-Z' + _PUNC_STR + _UNICODE_ALLOWED
# Matches (and captures) the first character that is NOT accepted.
_FORBIDDEN = re.compile(r'([^' + _ALLOWED_STR + '])', re.UNICODE)
# Punctuation that forces a label to be single-quoted.  The underscore is
# included because an unquoted "_" is read back as a blank, so a literal
# underscore only survives inside quotes.
_NEEDS_QU_PUNC_STR = r'[\[\]():,;_]'
_NEEDS_QUOTES_PATTERN = re.compile(r'(\s|' + _NEEDS_QU_PUNC_STR + ')')
_NEEDS_QUOTES_PUNC_PATTERN = re.compile(_NEEDS_QU_PUNC_STR)
_SINGLE_QUOTE = "'"
_TWO_QUOTES = "''"
# Every forbidden character reported so far by print_quoted().
_FORBIDDEN_FOUND = set()


def _newick_label(word):
    """Return `word` escaped as a newick/NEXUS label (no validation)."""
    if _SINGLE_QUOTE in word:
        # A label holding ' must be wrapped in single quotes and have every
        # embedded ' doubled.
        return _SINGLE_QUOTE + word.replace(_SINGLE_QUOTE, _TWO_QUOTES) + _SINGLE_QUOTE
    if not _NEEDS_QUOTES_PATTERN.search(word):
        return word
    if _NEEDS_QUOTES_PUNC_PATTERN.search(word):
        return _SINGLE_QUOTE + word + _SINGLE_QUOTE
    # Space is the only special character, so the "_ for space" form that
    # users prefer is safe.
    return word.replace(' ', '_')


def print_quoted(word):
    """Write `word` to stdout as a newick label, one label per line.

    A word containing any character other than spaces, "-", ASCII letters
    and digits, the punctuation in _PUNC_STR, or the characters in
    _UNICODE_ALLOWED is rejected: the offending character is recorded in
    _FORBIDDEN_FOUND, a message is written to stderr, and nothing is
    written to stdout.

    Returns True if the label was written, False if the word was rejected.
    """
    forbidden = _FORBIDDEN.search(word)
    if forbidden:
        c = forbidden.group(1)
        _FORBIDDEN_FOUND.add(c)
        msg = 'The input word "' + word + '" contains a forbidden character "' + c + '"\n'
        sys.stderr.write(_SCRIPT_NAME + ': ' + msg)
        return False
    # sys.stdout.write (rather than a print statement) keeps this function
    # valid on both Python 2 and Python 3.
    sys.stdout.write(_newick_label(word) + '\n')
    return True
if __name__ == '__main__':
    # Usage:
    #   script -f FILE      read one name per line from the UTF-8 file FILE
    #   script NAME [...]   treat each command-line argument as a name
    #   script              read one name per line from stdin
    # Exits with status 1 if any name was rejected.
    import codecs
    import io

    # Always emit UTF-8.  Python 3 text streams expose their byte stream as
    # ``.buffer``; Python 2 file objects have no such attribute and are
    # wrapped directly.
    sys.stdout = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))
    sys.stderr = codecs.getwriter('utf-8')(getattr(sys.stderr, 'buffer', sys.stderr))

    opened = None
    strip_newline = True
    if len(sys.argv) == 3 and sys.argv[1] == '-f':
        # io.open decodes to text and applies universal-newline handling on
        # both Python 2 and Python 3, without the "U" mode flag.
        opened = io.open(sys.argv[2], 'r', encoding='utf-8')
        inp = opened
    elif len(sys.argv) > 1:
        inp = sys.argv[1:]
        strip_newline = False
    else:
        inp = sys.stdin

    n_failed = 0
    try:
        for token in inp:
            if isinstance(token, bytes):
                # Python 2 yields byte strings from stdin and argv.
                # NOTE(review): assumes that input is UTF-8, as the -f
                # path does -- confirm for non-UTF-8 locales.
                token = token.decode('utf-8')
            if strip_newline:
                # Drop only the line terminator; a final line with no
                # trailing newline must keep its last character.
                token = token.rstrip('\r\n')
            if not print_quoted(token):
                n_failed += 1
    finally:
        if opened is not None:
            opened.close()

    if n_failed > 0:
        sys.stderr.write('{s}: {f} words rejected.\n'.format(s=_SCRIPT_NAME, f=n_failed))
        # Sorted so that the report is the same from run to run.
        for c in sorted(_FORBIDDEN_FOUND):
            sys.stderr.write('forbidden character: "' + c + '" denoted: ' + repr(c) + '\n')
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment