Skip to content

Instantly share code, notes, and snippets.

@mtholder
Last active August 29, 2015 14:06
Show Gist options
  • Save mtholder/ac58ab1b3c6a962b9bdc to your computer and use it in GitHub Desktop.
Save mtholder/ac58ab1b3c6a962b9bdc to your computer and use it in GitHub Desktop.
Script to convert OTT names or uniqunames to newick labels
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
import sys
import os
import re
u'''
Unicode char lists found by running against ott2.8
Using the names column in taxonomy.tsv required allowing the
following unicode characters:
æ denoted: u'\xe6'
Πdenoted: u'\u0152'
Æ denoted: u'\xc6'
œ denoted: u'\u0153'
α denoted: u'\u03b1'
γ denoted: u'\u03b3'
β denoted: u'\u03b2'
ε denoted: u'\u03b5'
δ denoted: u'\u03b4'
× denoted: u'\xd7'
ø denoted: u'\xf8'
θ denoted: u'\u03b8'
λ denoted: u'\u03bb'
μ denoted: u'\u03bc'
ß denoted: u'\xdf'
and the following punctuation characters:
',.;:_+*=<>/?#&%[](){}
The uniqnames field of taxonomy.tsv did not require
the addition of any more characters.
The uniqnames column of the synonyms.tsv file required the additional
characters:
η denoted: u'\u03b7'
š denoted: u'\u0161'
Ä denoted: u'\xc4'
Ö denoted: u'\xd6'
á denoted: u'\xe1'
Š denoted: u'\u0160'
ã denoted: u'\xe3'
å denoted: u'\xe5'
ä denoted: u'\xe4'
ç denoted: u'\xe7'
é denoted: u'\xe9'
è denoted: u'\xe8'
ë denoted: u'\xeb'
ê denoted: u'\xea'
í denoted: u'\xed'
ì denoted: u'\xec'
ï denoted: u'\xef'
î denoted: u'\xee'
ñ denoted: u'\xf1'
ó denoted: u'\xf3'
ô denoted: u'\xf4'
ö denoted: u'\xf6'
ú denoted: u'\xfa'
ü denoted: u'\xfc'
ÿ denoted: u'\xff'
ž denoted: u'\u017e'
à denoted: u'\xe0'
and the punctuation:
`"@^
'''
# Non-ASCII characters needed to accept the names column of taxonomy.tsv.
_NAMES_UNICODE_ALLOWED = u'\xe6\u0152\xc6\u0153\u03b1\u03b3\u03b2\u03b5\u03b4\xd7\xf8\u03b8\u03bb\u03bc\xdf'
# Further non-ASCII characters needed for the uniqnames column of synonyms.tsv.
_SYN_UNIQUE_UNICODE_ALLOWED = u'\u03b7\u0161\xc4\xd6\xe1\u0160\xe3\xe5\xe4\xe7\xe9\xe8\xeb\xea\xed\xec\xef\xee\xf1\xf3\xf4\xf6\xfa\xfc\xff\u017e\xe0'
_UNICODE_ALLOWED = _NAMES_UNICODE_ALLOWED + _SYN_UNIQUE_UNICODE_ALLOWED

# File name of the running script; used as the prefix of diagnostics.
_SCRIPT_NAME = os.path.split(sys.argv[0])[-1]

# Accepted ASCII punctuation, written as the *inside* of a regex character
# class. The square brackets are backslash-escaped for the regex engine, so
# that part is a raw string; the single quote is concatenated separately
# because a raw string cannot contain its own delimiter unescaped.
_PUNC_STR = "'" + r'"`,.;:_+*=<>/?@#%^&\[\](){}'

# Everything a name may contain: hyphen (first, so it is literal), space,
# ASCII digits and letters, the punctuation above and the unicode whitelist.
_ALLOWED_STR = r'- 0-9a-zA-Z' + _PUNC_STR + _UNICODE_ALLOWED

# Matches (and captures) the first character that is NOT in the allowed set.
_FORBIDDEN = re.compile(r'([^' + _ALLOWED_STR + r'])', re.UNICODE)

# Punctuation that forces a newick label to be wrapped in single quotes.
_NEEDS_QU_PUNC_STR = r'[\[\]():,;]'
# Matches whitespace or quote-forcing punctuation.
_NEEDS_QUOTES_PATTERN = re.compile(r'(\s|' + _NEEDS_QU_PUNC_STR + r')')
# Matches quote-forcing punctuation only (not whitespace).
_NEEDS_QUOTES_PUNC_PATTERN = re.compile(_NEEDS_QU_PUNC_STR)

_SINGLE_QUOTE = "'"
_TWO_QUOTES = "''"

# Accumulates every forbidden character reported while the script runs.
_FORBIDDEN_FOUND = set()
def print_quoted(word):
    """Print `word` on stdout as a newick label, followed by a newline.

    A word containing any character outside the allowed set (matched by
    `_FORBIDDEN`) is rejected: nothing is written to stdout, a diagnostic
    is written to stderr, the offending character is added to
    `_FORBIDDEN_FOUND`, and False is returned.

    Otherwise the label is written in the lightest form that survives a
    round trip through a newick reader:
      * containing a single quote: wrapped in single quotes, with every
        embedded single quote doubled;
      * containing any of []():,; or an underscore: wrapped in single
        quotes;
      * containing spaces only: spaces replaced by underscores;
      * anything else: written unchanged.

    Returns True when a label was printed.
    """
    # Reject any label with a character other than spaces, Roman alphabet
    # letters, the accepted punctuation and the whitelisted unicode.
    forbidden = _FORBIDDEN.search(word)
    if forbidden:
        c = forbidden.group(1)
        _FORBIDDEN_FOUND.add(c)
        msg = 'The input word "' + word + '" contains a forbidden character "' + c + '"\n'
        sys.stderr.write(_SCRIPT_NAME + ': ' + msg)
        return False
    if _SINGLE_QUOTE in word:
        # In newick and NEXUS a label with ' has to have ' on the outside
        # and every ' changed to '' (two single quotes).
        label = _SINGLE_QUOTE + word.replace(_SINGLE_QUOTE, _TWO_QUOTES) + _SINGLE_QUOTE
    elif _NEEDS_QUOTES_PUNC_PATTERN.search(word) or '_' in word:
        # Readers turn an unquoted underscore into a blank, so a literal
        # underscore is only preserved inside quotes, just like the
        # structural punctuation.
        label = _SINGLE_QUOTE + word + _SINGLE_QUOTE
    elif _NEEDS_QUOTES_PATTERN.search(word):
        # Space is the only special char, and there is no literal
        # underscore to confuse it with: use the _ for space trick that
        # users prefer.
        label = word.replace(' ', '_')
    else:
        label = word
    # Parenthesised single-argument form behaves identically as a Python 2
    # print statement and as a Python 3 function call.
    print(label)
    return True
# Command-line entry point. Usage:
#   script -f FILE    read one name per line from the UTF-8 file FILE
#   script WORD...    treat each command-line argument as a name
#   script            read one name per line from UTF-8 standard input
# Exits with status 1 if any name was rejected.
if __name__ == '__main__':
    import codecs
    import io

    # Emit UTF-8 regardless of locale. On Python 3 the standard streams are
    # text wrappers around a binary `.buffer`; on Python 2 they are the byte
    # streams themselves, so fall back to the stream when there is no buffer.
    sys.stdout = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))
    sys.stderr = codecs.getwriter('utf-8')(getattr(sys.stderr, 'buffer', sys.stderr))

    opened = None  # stream this block opened and therefore must close
    strip_newline = True
    if len(sys.argv) == 3 and sys.argv[1] == '-f':
        # io.open decodes to text and applies universal newlines on both
        # Python 2 and Python 3.
        inp = opened = io.open(sys.argv[2], 'r', encoding='utf-8')
    elif len(sys.argv) > 1:
        # Python 2 delivers argv as byte strings; decode them so they can be
        # matched against the unicode whitelist and written as UTF-8.
        inp = [a.decode('utf-8') if isinstance(a, bytes) else a
               for a in sys.argv[1:]]
        strip_newline = False
    else:
        # Re-open stdin by descriptor to get decoded text lines; closefd=False
        # leaves the underlying descriptor alone when the wrapper is closed.
        inp = opened = io.open(sys.stdin.fileno(), 'r', encoding='utf-8',
                               closefd=False)

    n_failed = 0
    try:
        for token in inp:
            if strip_newline:
                # Remove only the line terminator: a final line without a
                # trailing newline must keep its last character.
                token = token.rstrip('\r\n')
            if not print_quoted(token):
                n_failed += 1
    finally:
        if opened is not None:
            opened.close()

    if n_failed > 0:
        sys.stderr.write('{s}: {f} words rejected.\n'.format(s=_SCRIPT_NAME, f=n_failed))
        for c in _FORBIDDEN_FOUND:
            sys.stderr.write('forbidden character: "' + c + '" denoted: ' + repr(c) + '\n')
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment