mtholder/ott_names_to_newick.py

## ott_names_to_newick.py
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
import sys
import os
import re
u'''
Unicode char lists found by running against ott2.8
Using the names column in taxonomy.tsv required allowing the
following unicode characters:
    æ denoted: u'\xe6'
    Œ denoted: u'\u0152'
    Æ denoted: u'\xc6'
    œ denoted: u'\u0153'
    α denoted: u'\u03b1'
    γ denoted: u'\u03b3'
    β denoted: u'\u03b2'
    ε denoted: u'\u03b5'
    δ denoted: u'\u03b4'
    × denoted: u'\xd7'
    ø denoted: u'\xf8'
    θ denoted: u'\u03b8'
    λ denoted: u'\u03bb'
    μ denoted: u'\u03bc'
    ß denoted: u'\xdf'
and the following punctuation characters:
    ',.;:_+*=<>/?#&%[](){}
The uniqnames field of taxonomy.tsv did not require
    the addition of any more characters
The uniqnames column of the synonyms.tsv file required the additional
    characters:
    η denoted: u'\u03b7'
    š denoted: u'\u0161'
    Ä denoted: u'\xc4'
    Ö denoted: u'\xd6'
    á denoted: u'\xe1'
    Š denoted: u'\u0160'
    ã denoted: u'\xe3'
    å denoted: u'\xe5'
    ä denoted: u'\xe4'
    ç denoted: u'\xe7'
    é denoted: u'\xe9'
    è denoted: u'\xe8'
    ë denoted: u'\xeb'
    ê denoted: u'\xea'
    í denoted: u'\xed'
    ì denoted: u'\xec'
    ï denoted: u'\xef'
    î denoted: u'\xee'
    ñ denoted: u'\xf1'
    ó denoted: u'\xf3'
    ô denoted: u'\xf4'
    ö denoted: u'\xf6'
    ú denoted: u'\xfa'
    ü denoted: u'\xfc'
    ÿ denoted: u'\xff'
    ž denoted: u'\u017e'
    à denoted: u'\xe0'
and the punctuation:
    `"@^
'''
# Non-ASCII characters that had to be allowed when this script was run against
# ott2.8: first the ones needed by the names column of taxonomy.tsv, then the
# additional ones needed by the uniqnames column of synonyms.tsv.
_NAMES_UNICODE_ALLOWED = u'\xe6\u0152\xc6\u0153\u03b1\u03b3\u03b2\u03b5\u03b4\xd7\xf8\u03b8\u03bb\u03bc\xdf'
_SYN_UNIQUE_UNICODE_ALLOWED = u'\u03b7\u0161\xc4\xd6\xe1\u0160\xe3\xe5\xe4\xe7\xe9\xe8\xeb\xea\xed\xec\xef\xee\xf1\xf3\xf4\xf6\xfa\xfc\xff\u017e\xe0'
_UNICODE_ALLOWED = _NAMES_UNICODE_ALLOWED + _SYN_UNIQUE_UNICODE_ALLOWED
# Base name of the running script; used as the prefix of diagnostic messages.
_SCRIPT_NAME = os.path.split(sys.argv[0])[-1]
# Punctuation accepted in a name, written for use inside a regex character
# class.  The backslashes in front of [ and ] are part of the value (they
# escape the brackets inside the class); they are spelled "\\" so that the
# literal contains no invalid string escape.  Not in the list: ~ $ | ! and
# the backslash itself.
_PUNC_STR = '\'"`,.;:_+*=<>/?@#%^&\\[\\](){}'
# The leading "-" is a literal hyphen because it is the first class member.
_ALLOWED_STR = r'- 0-9a-zA-Z' + _PUNC_STR + _UNICODE_ALLOWED
# Captures the first character of a word that is outside the accepted set.
_FORBIDDEN = re.compile(r'([^' + _ALLOWED_STR + '])', re.UNICODE)
# Punctuation that forces a label to be wrapped in single quotes.
_NEEDS_QU_PUNC_STR = r'[\[\]():,;]'
# Matches whitespace or any of the quote-forcing punctuation characters.
_NEEDS_QUOTES_PATTERN = re.compile(r'(\s|' + _NEEDS_QU_PUNC_STR + ')')
_NEEDS_QUOTES_PUNC_PATTERN = re.compile(_NEEDS_QU_PUNC_STR)
_SINGLE_QUOTE = "'"
_TWO_QUOTES = "''"
# Every distinct forbidden character met so far (filled in by print_quoted).
_FORBIDDEN_FOUND = set()
def print_quoted(word):
    """Write `word` to stdout as a newick/NEXUS label, followed by a newline.

    A word holding any character outside the accepted set (matched by
    `_FORBIDDEN`) is not written: the character is recorded in
    `_FORBIDDEN_FOUND`, a message is written to stderr and False is returned.
    Otherwise the label is written and True is returned:

      * a word with a single quote is wrapped in single quotes, with every
        embedded ' doubled;
      * a word with an underscore or one of [ ] ( ) : , ; is wrapped in
        single quotes;
      * a word whose only special character is the space has each space
        replaced by an underscore;
      * any other word is written unchanged.
    """
    forbidden = _FORBIDDEN.search(word)
    if forbidden:
        c = forbidden.group(1)
        _FORBIDDEN_FOUND.add(c)
        msg = 'The input word "' + word + '" has a forbidden character "' + c + '"\n'
        sys.stderr.write(_SCRIPT_NAME + ': ' + msg)
        return False
    if _SINGLE_QUOTE in word:
        # in newick and NEXUS a label with ' has to have
        #  ' on the outside and every ' changed to '' (two single quotes)
        label = _SINGLE_QUOTE + word.replace(_SINGLE_QUOTE, _TWO_QUOTES) + _SINGLE_QUOTE
    elif '_' in word or _NEEDS_QUOTES_PUNC_PATTERN.search(word):
        # An unquoted underscore is read back as a space, so a literal
        #   underscore only survives inside a quoted label.
        label = _SINGLE_QUOTE + word + _SINGLE_QUOTE
    elif _NEEDS_QUOTES_PATTERN.search(word):
        # space is the only special char.
        #   we can use the _ for space trick that users prefer...
        label = '_'.join(word.split(' '))
    else:
        label = word
    # write() instead of a print statement: same style as the stderr
    #   message above, and it parses under both Python 2 and Python 3.
    sys.stdout.write(label + '\n')
    return True
if __name__ == '__main__':
    # Command-line entry point.
    #   (no arguments)    names are read from stdin, one per line
    #   -f FILEPATH       names are read from the UTF-8 file, one per line
    #   NAME [NAME ...]   every argument is taken as one name
    # Each accepted name is written to stdout as a label; the exit status is
    # 1 when at least one name was rejected.
    import codecs
    # Encode everything written as UTF-8.  The writer wraps the byte-level
    # stream: the ``buffer`` attribute when the stream has one (Python 3 text
    # streams), otherwise the stream object itself.
    sys.stdout = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))
    sys.stderr = codecs.getwriter('utf-8')(getattr(sys.stderr, 'buffer', sys.stderr))

    strip_newline = True
    inp_file = None
    if len(sys.argv) > 1:
        if len(sys.argv) == 3 and sys.argv[1] == '-f':
            # codecs.open always opens the underlying file in binary mode,
            # so the former 'U' flag did no newline translation.
            inp_file = codecs.open(sys.argv[2], 'r', encoding='utf-8')
            inp = inp_file
        else:
            # Decode byte-string arguments so that names from the command
            # line are handled like the decoded names read from a file.
            inp = [a.decode('utf-8') if isinstance(a, bytes) else a
                   for a in sys.argv[1:]]
            strip_newline = False
    else:
        inp = codecs.getreader('utf-8')(getattr(sys.stdin, 'buffer', sys.stdin))
    n_failed = 0
    try:
        for token in inp:
            if strip_newline:
                # Remove only a line terminator; a last line without one
                # keeps its final character.
                token = token.rstrip(u'\r\n')
            if not print_quoted(token):
                n_failed += 1
    finally:
        if inp_file is not None:
            inp_file.close()
    if n_failed > 0:
        sys.stderr.write('{s}: {f} words rejected.\n'.format(s=_SCRIPT_NAME, f=n_failed))
        for c in _FORBIDDEN_FOUND:
            sys.stderr.write('forbidden character: "' + c + '" denoted: ' + repr(c) + '\n')
        sys.exit(1)
	#!/usr/bin/env python
	# vim: set fileencoding=utf-8 :
	import sys
	import os
	import re
	u'''
	Unicode char lists found by running against ott2.8
	Using the names column in taxonomy.tsv required allowing the
	following unicode characters:
	æ denoted: u'\xe6'
	Œ denoted: u'\u0152'
	Æ denoted: u'\xc6'
	œ denoted: u'\u0153'
	α denoted: u'\u03b1'
	γ denoted: u'\u03b3'
	β denoted: u'\u03b2'
	ε denoted: u'\u03b5'
	δ denoted: u'\u03b4'
	× denoted: u'\xd7'
	ø denoted: u'\xf8'
	θ denoted: u'\u03b8'
	λ denoted: u'\u03bb'
	μ denoted: u'\u03bc'
	ß denoted: u'\xdf'
	and the following punctuation characters:
	',.;:_+*=<>/?#&%[](){}
	The uniqnames field of taxonomy.tsv did not require
	the addition of any more characters
	The uniqnames column of the synonyms.tsv file required the additional
	characters:
	η denoted: u'\u03b7'
	š denoted: u'\u0161'
	Ä denoted: u'\xc4'
	Ö denoted: u'\xd6'
	á denoted: u'\xe1'
	Š denoted: u'\u0160'
	ã denoted: u'\xe3'
	å denoted: u'\xe5'
	ä denoted: u'\xe4'
	ç denoted: u'\xe7'
	é denoted: u'\xe9'
	è denoted: u'\xe8'
	ë denoted: u'\xeb'
	ê denoted: u'\xea'
	í denoted: u'\xed'
	ì denoted: u'\xec'
	ï denoted: u'\xef'
	î denoted: u'\xee'
	ñ denoted: u'\xf1'
	ó denoted: u'\xf3'
	ô denoted: u'\xf4'
	ö denoted: u'\xf6'
	ú denoted: u'\xfa'
	ü denoted: u'\xfc'
	ÿ denoted: u'\xff'
	ž denoted: u'\u017e'
	à denoted: u'\xe0'
	and the punctuation:
	`"@^
	'''
	# Non-ASCII characters that had to be allowed when this script was run against
	# ott2.8: first the ones needed by the names column of taxonomy.tsv, then the
	# additional ones needed by the uniqnames column of synonyms.tsv.
	_NAMES_UNICODE_ALLOWED = u'\xe6\u0152\xc6\u0153\u03b1\u03b3\u03b2\u03b5\u03b4\xd7\xf8\u03b8\u03bb\u03bc\xdf'
	_SYN_UNIQUE_UNICODE_ALLOWED = u'\u03b7\u0161\xc4\xd6\xe1\u0160\xe3\xe5\xe4\xe7\xe9\xe8\xeb\xea\xed\xec\xef\xee\xf1\xf3\xf4\xf6\xfa\xfc\xff\u017e\xe0'
	_UNICODE_ALLOWED = _NAMES_UNICODE_ALLOWED + _SYN_UNIQUE_UNICODE_ALLOWED
	# Base name of the running script; used as the prefix of diagnostic messages.
	_SCRIPT_NAME = os.path.split(sys.argv[0])[-1]
	# Punctuation accepted in a name, written for use inside a regex character
	# class.  "*" is included: the character list in the descriptive string
	# above names it as required.  The backslashes in front of [ and ] are part
	# of the value and are spelled "\\" so that the literal contains no invalid
	# string escape.  Not in the list: ~ $ | ! and the backslash itself.
	_PUNC_STR = '\'"`,.;:_+*=<>/?@#%^&\\[\\](){}'
	# The leading "-" is a literal hyphen because it is the first class member.
	_ALLOWED_STR = r'- 0-9a-zA-Z' + _PUNC_STR + _UNICODE_ALLOWED
	# Captures the first character of a word that is outside the accepted set.
	_FORBIDDEN = re.compile(r'([^' + _ALLOWED_STR + '])', re.UNICODE)
	# Punctuation that forces a label to be wrapped in single quotes.
	_NEEDS_QU_PUNC_STR = r'[\[\]():,;]'
	# Alternation: whitespace OR one of the quote-forcing punctuation
	# characters (the "|" must not be escaped, or it is matched literally).
	_NEEDS_QUOTES_PATTERN = re.compile(r'(\s|' + _NEEDS_QU_PUNC_STR + ')')
	_NEEDS_QUOTES_PUNC_PATTERN = re.compile(_NEEDS_QU_PUNC_STR)
	_SINGLE_QUOTE = "'"
	_TWO_QUOTES = "''"
	# Every distinct forbidden character met so far (filled in by print_quoted).
	_FORBIDDEN_FOUND = set()
	def print_quoted(word):
	    """Write `word` to stdout as a newick/NEXUS label, followed by a newline.

	    A word holding any character outside the accepted set (matched by
	    `_FORBIDDEN`) is not written: the character is recorded in
	    `_FORBIDDEN_FOUND`, a message is written to stderr and False is returned.
	    Otherwise the label is written and True is returned:

	      * a word with a single quote is wrapped in single quotes, with every
	        embedded ' doubled;
	      * a word with an underscore or one of [ ] ( ) : , ; is wrapped in
	        single quotes;
	      * a word whose only special character is the space has each space
	        replaced by an underscore;
	      * any other word is written unchanged.
	    """
	    forbidden = _FORBIDDEN.search(word)
	    if forbidden:
	        c = forbidden.group(1)
	        _FORBIDDEN_FOUND.add(c)
	        msg = 'The input word "' + word + '" has a forbidden character "' + c + '"\n'
	        sys.stderr.write(_SCRIPT_NAME + ': ' + msg)
	        return False
	    if _SINGLE_QUOTE in word:
	        # in newick and NEXUS a label with ' has to have
	        #  ' on the outside and every ' changed to '' (two single quotes)
	        label = _SINGLE_QUOTE + word.replace(_SINGLE_QUOTE, _TWO_QUOTES) + _SINGLE_QUOTE
	    elif '_' in word or _NEEDS_QUOTES_PUNC_PATTERN.search(word):
	        # An unquoted underscore is read back as a space, so a literal
	        #   underscore only survives inside a quoted label.
	        label = _SINGLE_QUOTE + word + _SINGLE_QUOTE
	    elif _NEEDS_QUOTES_PATTERN.search(word):
	        # space is the only special char.
	        #   we can use the _ for space trick that users prefer...
	        label = '_'.join(word.split(' '))
	    else:
	        label = word
	    # write() instead of a print statement: same style as the stderr
	    #   message above, and it parses under both Python 2 and Python 3.
	    sys.stdout.write(label + '\n')
	    return True
	if __name__ == '__main__':
	    # Command-line entry point.
	    #   (no arguments)    names are read from stdin, one per line
	    #   -f FILEPATH       names are read from the UTF-8 file, one per line
	    #   NAME [NAME ...]   every argument is taken as one name
	    # Each accepted name is written to stdout as a label; the exit status is
	    # 1 when at least one name was rejected.
	    import codecs
	    # Encode everything written as UTF-8.  The writer wraps the byte-level
	    # stream: the ``buffer`` attribute when the stream has one (Python 3 text
	    # streams), otherwise the stream object itself.
	    sys.stdout = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))
	    sys.stderr = codecs.getwriter('utf-8')(getattr(sys.stderr, 'buffer', sys.stderr))

	    strip_newline = True
	    inp_file = None
	    if len(sys.argv) > 1:
	        if len(sys.argv) == 3 and sys.argv[1] == '-f':
	            # codecs.open always opens the underlying file in binary mode,
	            # so the former 'U' flag did no newline translation.
	            inp_file = codecs.open(sys.argv[2], 'r', encoding='utf-8')
	            inp = inp_file
	        else:
	            # Decode byte-string arguments so that names from the command
	            # line are handled like the decoded names read from a file.
	            inp = [a.decode('utf-8') if isinstance(a, bytes) else a
	                   for a in sys.argv[1:]]
	            strip_newline = False
	    else:
	        inp = codecs.getreader('utf-8')(getattr(sys.stdin, 'buffer', sys.stdin))
	    n_failed = 0
	    try:
	        for token in inp:
	            if strip_newline:
	                # Remove only a line terminator; a last line without one
	                # keeps its final character.
	                token = token.rstrip(u'\r\n')
	            if not print_quoted(token):
	                n_failed += 1
	    finally:
	        if inp_file is not None:
	            inp_file.close()
	    if n_failed > 0:
	        sys.stderr.write('{s}: {f} words rejected.\n'.format(s=_SCRIPT_NAME, f=n_failed))
	        for c in _FORBIDDEN_FOUND:
	            sys.stderr.write('forbidden character: "' + c + '" denoted: ' + repr(c) + '\n')
	        sys.exit(1)