# jwdebelius/clean_silva_taxonomy.py

## clean_silva_taxonomy.py
import pandas as pd

def tidy_taxon_silva(x):
    """
    Clean a SILVA-style taxonomy string into seven named ranks.

    The string is split on ``";"`` into levels labelled with ``D_0__``
    through ``D_6__`` prefixes. If the lower levels are missing (between
    four and six levels were supplied), they inherit the name of the
    deepest supplied level, tagged as ``unclassified``. Placeholder labels
    (``Ambiguous_taxa``, ``uncultured ...``, ``... metagenome``,
    ``unidentified``) at the family, genus or species level are then
    replaced by the name of the level above, tagged with the kind of
    placeholder that was found.

    Parameters
    ----------
    x : str
        A taxonomy string whose levels are separated by a ``";"``
        character. This is assumed to be used with an apply function.

    Returns
    -------
    pd.Series
        The cleaned taxonomy, indexed by kingdom, phylum, class, order,
        family, genus and species.

    Raises
    ------
    ValueError
        If the string does not contain between four and seven levels.

    """
    # Contested names are written as "[taxa]"; rewrite them as "cont. taxa"
    # so they read as an ordinary taxonomic name.
    x = x.replace('[', 'cont. ').replace(']', '')

    # Split into levels; any level that was not supplied inherits the
    # deepest supplied name, marked as unclassified.
    splits = x.split(';')
    n_levels = len(splits)
    if n_levels == 7:
        d0, d1, d2, d3, d4, d5, d6 = splits
    elif n_levels == 6:
        d0, d1, d2, d3, d4, d5 = splits
        d6 = d5.replace('5__', '6__unclassified g. ')
    elif n_levels == 5:
        d0, d1, d2, d3, d4 = splits
        d5 = d4.replace('4__', '5__unclassified f. ')
        d6 = d5.replace('_5', '_6')
    elif n_levels == 4:
        d0, d1, d2, d3 = splits
        d4 = d3.replace('3__', '4__unclassified o. ')
        d5 = d4.replace('_4', '_5')
        d6 = d5.replace('_5', '_6')
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError; fail early with a clear message instead.
        raise ValueError(
            'Expected between 4 and 7 ";"-separated taxonomic levels, '
            'found %i in %r' % (n_levels, x)
        )

    # Family level (4): placeholders inherit the order name, and so do the
    # genus and species below them.
    if d4 == 'Ambiguous_taxa':
        d4 = d3.replace('3__', '4__ambigious o. ')
        d5 = d4.replace('D_4', 'D_5')
        d6 = d5.replace('D_5', 'D_6')
    elif d4 in {'D_4__uncultured', 'D_4__uncultured bacterium',
                'D_4__uncultured organism',
                'D_4__gut metagenome'}:
        d4 = d3.replace('3__', '4__uncultured o. ')
        d5 = d4.replace('D_4', 'D_5')
        d6 = d5.replace('D_5', 'D_6')
    elif 'D_4__Family' in d4:
        # Generic "Family ..." names are qualified with the order name.
        d4 = '%s (%s)' % (d4, d3.split('__')[1])

    # Genus level (5): placeholders inherit the family name.
    if d5 == 'Ambiguous_taxa':
        d5 = d4.replace('4__', '5__ambigious f. ')
        d6 = d5.replace('D_5', 'D_6')
    elif d5 in {'D_5__gut metagenome', 'D_5__metagenome',
                'D_5__uncultured bacterium',
                'D_5__uncultured rumen bacterium',
                'D_5__uncultured uncultured',
                'D_5__uncultured uncultured organism', 'D_5__uncultured',
                'D_5__uncultured organism',
                }:
        d5 = d4.replace('4__', '5__uncultured f. ')
        d6 = d5.replace('D_5', 'D_6')

    # Species level (6): placeholders inherit the genus name.
    if d6 == 'Ambiguous_taxa':
        d6 = d5.replace('5__', '6__ambigious g. ')
    elif d6 == 'D_6__unidentified':
        d6 = d5.replace('5__', '6__unidentified g. ')
    elif d6 in {'D_6__uncultured bacterium', 'D_6__uncultured organism',
                'D_6__uncultured prokaryote',
                'D_6__uncultured rumen bacterium',
                'D_6__uncultured uncultured',
                'D_6__uncultured uncultured organism',
                'D_6__uncultured spirochete',
                'D_6__gut metagenome', 'D_6__human gut metagenome',
                'D_6__metagenome',
                }:
        d6 = d5.replace('5__', '6__uncultured g. ')

    # Final clean up: make sure inherited names carry the prefix of the
    # level they now occupy (species -> D_6, genus -> D_5).
    if 'D_5' in d6:
        d6 = d6.replace('D_5', 'D_6')
    elif 'D_4' in d6:
        d6 = d6.replace('D_4', 'D_6')

    if 'D_4' in d5:
        d5 = d5.replace('D_4', 'D_5')

    # Builds the series
    return pd.Series({'kingdom': d0, 'phylum': d1, 'class': d2,
                      'order': d3, 'family': d4, 'genus': d5,
                      'species': d6,
                      })
	import pandas as pd

	def tidy_taxon_silva(x):
	    """
	    Clean a SILVA-style taxonomy string into seven named ranks.

	    The string is split on ``";"`` into levels labelled with ``D_0__``
	    through ``D_6__`` prefixes. If the lower levels are missing (between
	    four and six levels were supplied), they inherit the name of the
	    deepest supplied level, tagged as ``unclassified``. Placeholder labels
	    (``Ambiguous_taxa``, ``uncultured ...``, ``... metagenome``,
	    ``unidentified``) at the family, genus or species level are then
	    replaced by the name of the level above, tagged with the kind of
	    placeholder that was found.

	    Parameters
	    ----------
	    x : str
	        A taxonomy string whose levels are separated by a ``";"``
	        character. This is assumed to be used with an apply function.

	    Returns
	    -------
	    pd.Series
	        The cleaned taxonomy, indexed by kingdom, phylum, class, order,
	        family, genus and species.

	    Raises
	    ------
	    ValueError
	        If the string does not contain between four and seven levels.

	    """
	    # Contested names are written as "[taxa]"; rewrite them as "cont. taxa"
	    # so they read as an ordinary taxonomic name.
	    x = x.replace('[', 'cont. ').replace(']', '')

	    # Split into levels; any level that was not supplied inherits the
	    # deepest supplied name, marked as unclassified.
	    splits = x.split(';')
	    n_levels = len(splits)
	    if n_levels == 7:
	        d0, d1, d2, d3, d4, d5, d6 = splits
	    elif n_levels == 6:
	        d0, d1, d2, d3, d4, d5 = splits
	        d6 = d5.replace('5__', '6__unclassified g. ')
	    elif n_levels == 5:
	        d0, d1, d2, d3, d4 = splits
	        d5 = d4.replace('4__', '5__unclassified f. ')
	        d6 = d5.replace('_5', '_6')
	    elif n_levels == 4:
	        d0, d1, d2, d3 = splits
	        d4 = d3.replace('3__', '4__unclassified o. ')
	        d5 = d4.replace('_4', '_5')
	        d6 = d5.replace('_5', '_6')
	    else:
	        # Without this guard the levels would be left unassigned and the
	        # function would fail later with an UnboundLocalError.
	        raise ValueError(
	            'Expected between 4 and 7 ";"-separated taxonomic levels, '
	            'found %i in %r' % (n_levels, x)
	        )

	    # Family level (4): placeholders inherit the order name, and so do the
	    # genus and species below them.
	    if d4 == 'Ambiguous_taxa':
	        d4 = d3.replace('3__', '4__ambigious o. ')
	        d5 = d4.replace('D_4', 'D_5')
	        d6 = d5.replace('D_5', 'D_6')
	    elif d4 in {'D_4__uncultured', 'D_4__uncultured bacterium',
	                'D_4__uncultured organism',
	                'D_4__gut metagenome'}:
	        d4 = d3.replace('3__', '4__uncultured o. ')
	        d5 = d4.replace('D_4', 'D_5')
	        d6 = d5.replace('D_5', 'D_6')
	    elif 'D_4__Family' in d4:
	        # Generic "Family ..." names are qualified with the order name.
	        d4 = '%s (%s)' % (d4, d3.split('__')[1])

	    # Genus level (5): placeholders inherit the family name.
	    if d5 == 'Ambiguous_taxa':
	        d5 = d4.replace('4__', '5__ambigious f. ')
	        d6 = d5.replace('D_5', 'D_6')
	    elif d5 in {'D_5__gut metagenome', 'D_5__metagenome',
	                'D_5__uncultured bacterium',
	                'D_5__uncultured rumen bacterium',
	                'D_5__uncultured uncultured',
	                'D_5__uncultured uncultured organism', 'D_5__uncultured',
	                'D_5__uncultured organism',
	                }:
	        d5 = d4.replace('4__', '5__uncultured f. ')
	        d6 = d5.replace('D_5', 'D_6')

	    # Species level (6): placeholders inherit the genus name.
	    if d6 == 'Ambiguous_taxa':
	        d6 = d5.replace('5__', '6__ambigious g. ')
	    elif d6 == 'D_6__unidentified':
	        d6 = d5.replace('5__', '6__unidentified g. ')
	    elif d6 in {'D_6__uncultured bacterium', 'D_6__uncultured organism',
	                'D_6__uncultured prokaryote',
	                'D_6__uncultured rumen bacterium',
	                'D_6__uncultured uncultured',
	                'D_6__uncultured uncultured organism',
	                'D_6__uncultured spirochete',
	                'D_6__gut metagenome', 'D_6__human gut metagenome',
	                'D_6__metagenome',
	                }:
	        d6 = d5.replace('5__', '6__uncultured g. ')

	    # Final clean up: make sure inherited names carry the prefix of the
	    # level they now occupy (species -> D_6, genus -> D_5).
	    if 'D_5' in d6:
	        d6 = d6.replace('D_5', 'D_6')
	    elif 'D_4' in d6:
	        d6 = d6.replace('D_4', 'D_6')

	    if 'D_4' in d5:
	        d5 = d5.replace('D_4', 'D_5')

	    # Builds the series
	    return pd.Series({'kingdom': d0, 'phylum': d1, 'class': d2,
	                      'order': d3, 'family': d4, 'genus': d5,
	                      'species': d6,
	                      })