Skip to content

Instantly share code, notes, and snippets.

@jwdebelius
Created September 16, 2019 09:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jwdebelius/c4d7a9f7554555774669455b3f424078 to your computer and use it in GitHub Desktop.
Save jwdebelius/c4d7a9f7554555774669455b3f424078 to your computer and use it in GitHub Desktop.
A dirty script for tidying a silva taxonomy string into a Series
import pandas as pd
# Output labels, ordered from SILVA level 0 (``D_0__``) to level 6 (``D_6__``).
_SILVA_RANKS = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus',
                'species')

# Placeholder names that carry no taxonomic information at each level.
_UNINFORMATIVE_FAMILY = frozenset({
    'D_4__uncultured', 'D_4__uncultured bacterium',
    'D_4__uncultured organism',
    'D_4__gut metagenome',
})
_UNINFORMATIVE_GENUS = frozenset({
    'D_5__gut metagenome', 'D_5__metagenome',
    'D_5__uncultured bacterium',
    'D_5__uncultured rumen bacterium',
    'D_5__uncultured uncultured',
    'D_5__uncultured uncultured organism', 'D_5__uncultured',
    'D_5__uncultured organism',
})
_UNINFORMATIVE_SPECIES = frozenset({
    'D_6__uncultured bacterium', 'D_6__uncultured organism',
    'D_6__uncultured prokaryote',
    'D_6__uncultured rumen bacterium',
    'D_6__uncultured uncultured',
    'D_6__uncultured uncultured organism',
    'D_6__uncultured spirochete',
    'D_6__gut metagenome', 'D_6__human gut metagenome',
    'D_6__metagenome',
})


def _relabel(name, old_level, new_level, note=''):
    """Rewrite a leading ``D_<old_level>__`` prefix as ``D_<new_level>__``.

    Only the rank prefix at the start of the string is touched, so digits or
    ``D_n`` fragments inside the taxon name itself are never altered. `note`
    is inserted between the new prefix and the name. Strings that do not
    start with the expected prefix are returned unchanged.
    """
    old_prefix = 'D_%i__' % old_level
    if not name.startswith(old_prefix):
        return name
    return 'D_%i__%s%s' % (new_level, note, name[len(old_prefix):])


def tidy_taxon_silva(x):
    """
    Tidies a SILVA taxonomy string into seven labelled taxonomic levels.

    The string is split into up to seven levels. Levels missing from the end
    of the lineage (i.e. they could not be classified) inherit the name of
    the last classified level, marked as unclassified. Ambiguous, uncultured
    or otherwise uninformative family, genus and species names are then
    replaced by a description inherited from the level above.

    Parameters
    ----------
    x : str
        A taxonomy string with levels separated by a `";"` character and
        prefixed with `D_0__` ... `D_6__`. This is assumed to be used with
        an apply function.

    Returns
    -------
    pd.Series
        The cleaned taxonomy as a Series specifying kingdom, phylum, class,
        order, family, genus, species

    Raises
    ------
    ValueError
        If the string contains more than seven levels.
    """
    # Converts contested descriptions ([taxa] means it's contested) into a
    # string that identifies it as a taxonomic name
    x = x.replace('[', 'cont. ').replace(']', '')

    levels = x.split(';')
    n_given = len(levels)
    if n_given > len(_SILVA_RANKS):
        raise ValueError(
            'Expected at most %i taxonomic levels but found %i in %r'
            % (len(_SILVA_RANKS), n_given, x)
        )

    # Describes trailing levels that could not be classified. The first
    # missing level is tagged with the rank it inherits from (e.g.
    # "unclassified o." when the order was the last level given); deeper
    # levels copy that description under their own prefix.
    for level in range(n_given, len(_SILVA_RANKS)):
        if level == n_given:
            note = 'unclassified %s. ' % _SILVA_RANKS[level - 1][0]
        else:
            note = ''
        levels.append(_relabel(levels[level - 1], level - 1, level, note))
    d0, d1, d2, d3, d4, d5, d6 = levels

    # NOTE: the "ambigious" spelling in the generated labels is kept on
    # purpose; it is part of the output that existing callers may match on.

    # Identifies missing information at level 4 (family level)
    if d4 == 'Ambiguous_taxa':
        d4 = _relabel(d3, 3, 4, 'ambigious o. ')
        d5 = _relabel(d4, 4, 5)
        d6 = _relabel(d4, 4, 6)
    elif d4 in _UNINFORMATIVE_FAMILY:
        d4 = _relabel(d3, 3, 4, 'uncultured o. ')
        d5 = _relabel(d4, 4, 5)
        d6 = _relabel(d4, 4, 6)
    elif 'D_4__Family' in d4:
        # Numbered placeholder families ("Family XI") are only meaningful
        # together with their order, so the order name is appended.
        order_parts = d3.split('__')
        order_name = order_parts[1] if len(order_parts) > 1 else d3
        d4 = '%s (%s)' % (d4, order_name)

    # Identifies missing or uncultured taxa at the genus level
    if d5 == 'Ambiguous_taxa':
        d5 = _relabel(d4, 4, 5, 'ambigious f. ')
        d6 = _relabel(d5, 5, 6)
    elif d5 in _UNINFORMATIVE_GENUS:
        d5 = _relabel(d4, 4, 5, 'uncultured f. ')
        d6 = _relabel(d5, 5, 6)

    # Identifies missing or uncultured taxa at the species level
    if d6 == 'Ambiguous_taxa':
        d6 = _relabel(d5, 5, 6, 'ambigious g. ')
    elif d6 == 'D_6__unidentified':
        d6 = _relabel(d5, 5, 6, 'unidentified g. ')
    elif d6 in _UNINFORMATIVE_SPECIES:
        d6 = _relabel(d5, 5, 6, 'uncultured g. ')

    # One more level of clean up: make sure genus and species carry their
    # own rank prefix even if they arrived labelled with a shallower one.
    d6 = _relabel(d6, 5, 6)
    d6 = _relabel(d6, 4, 6)
    d5 = _relabel(d5, 4, 5)

    # Builds the series
    return pd.Series(dict(zip(_SILVA_RANKS, [d0, d1, d2, d3, d4, d5, d6])))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment