Created
September 16, 2019 09:35
-
-
Save jwdebelius/c4d7a9f7554555774669455b3f424078 to your computer and use it in GitHub Desktop.
A dirty script for tidying a silva taxonomy string into a Series
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def tidy_taxon_silva(x):
    """
    Tidy a SILVA taxonomy string into seven named taxonomic levels.

    The string is split on `";"` into up to seven levels (``D_0__`` through
    ``D_6__``). If lower levels are unavailable (i.e. they could not be
    classified accurately), they inherit a designation from the last
    classified level. Ambiguous or uncultured placeholders are then replaced
    by a description that inherits the name of the level above.

    Parameters
    ----------
    x : str
        A string with text-delimited taxonomy separated by a `";"` character.
        Between four and seven levels must be present.
        This is assumed to be used with an apply function.

    Returns
    -------
    pd.Series
        The cleaned taxonomy as a Series specifying kingdom, phylum, class,
        order, family, genus, species

    Raises
    ------
    ValueError
        If the string does not contain between four and seven levels.
    """
    # Converts contested descriptions ([taxa] means it's contested) into a
    # string that identifies it as a taxonomic name
    x = x.replace('[', 'cont. ').replace(']', '')

    # Splits the data and then describes the data as unclassified if the
    # level could not be classified.
    splits = x.split(';')
    n_levels = len(splits)
    if n_levels == 7:
        d0, d1, d2, d3, d4, d5, d6 = splits
    elif n_levels == 6:
        d0, d1, d2, d3, d4, d5 = splits
        d6 = d5.replace('5__', '6__unclassified g. ')
    elif n_levels == 5:
        d0, d1, d2, d3, d4 = splits
        d5 = d4.replace('4__', '5__unclassified f. ')
        d6 = d5.replace('_5', '_6')
    elif n_levels == 4:
        d0, d1, d2, d3 = splits
        d4 = d3.replace('3__', '4__unclassified o. ')
        d5 = d4.replace('_4', '_5')
        d6 = d5.replace('_5', '_6')
    else:
        # Without this guard the level variables would be unbound and the
        # function would fail later with an opaque UnboundLocalError.
        raise ValueError(
            'Expected between 4 and 7 ";"-separated taxonomic levels, '
            'but found %d in %r' % (n_levels, x)
        )

    # Identifies missing information at level 4 (family level)
    if d4 == 'Ambiguous_taxa':
        d4 = d3.replace('3__', '4__ambigious o. ')
        d5 = d4.replace('D_4', 'D_5')
        # Species is level 6, so it is relabelled from the genus string.
        d6 = d5.replace('D_5', 'D_6')
    elif d4 in {'D_4__uncultured', 'D_4__uncultured bacterium',
                'D_4__uncultured organism',
                'D_4__gut metagenome'}:
        d4 = d3.replace('3__', '4__uncultured o. ')
        d5 = d4.replace('D_4', 'D_5')
        d6 = d5.replace('D_5', 'D_6')
    elif 'D_4__Family' in d4:
        # Generic "Family ..." names are qualified with the order name.
        d4 = '%s (%s)' % (d4, d3.split('__')[1])

    # Identifies missing or uncultured taxa at the genus level
    if d5 == 'Ambiguous_taxa':
        d5 = d4.replace('4__', '5__ambigious f. ')
        d6 = d5.replace('D_5', 'D_6')
    elif d5 in {'D_5__gut metagenome', 'D_5__metagenome',
                'D_5__uncultured bacterium',
                'D_5__uncultured rumen bacterium',
                'D_5__uncultured uncultured',
                'D_5__uncultured uncultured organism', 'D_5__uncultured',
                'D_5__uncultured organism',
                }:
        d5 = d4.replace('4__', '5__uncultured f. ')
        d6 = d5.replace('D_5', 'D_6')

    # Identifies missing or uncultured taxa at the species level
    if d6 == 'Ambiguous_taxa':
        d6 = d5.replace('5__', '6__ambigious g. ')
    elif d6 == 'D_6__unidentified':
        d6 = d5.replace('5__', '6__unidentified g. ')
    elif d6 in {'D_6__uncultured bacterium', 'D_6__uncultured organism',
                'D_6__uncultured prokaryote',
                'D_6__uncultured rumen bacterium',
                'D_6__uncultured uncultured',
                'D_6__uncultured uncultured organism',
                'D_6__uncultured spirochete',
                'D_6__gut metagenome', 'D_6__human gut metagenome',
                'D_6__metagenome',
                }:
        d6 = d5.replace('5__', '6__uncultured g. ')

    # One more level of clean up: make sure the genus and species strings
    # carry their own level prefix rather than one inherited from above.
    if 'D_5' in d6:
        d6 = d6.replace('D_5', 'D_6')
    elif 'D_4' in d6:
        d6 = d6.replace('D_4', 'D_6')
    if 'D_4' in d5:
        # Relabel the genus itself; the family string must stay untouched.
        d5 = d5.replace('D_4', 'D_5')

    # Builds the series
    return pd.Series({'kingdom': d0, 'phylum': d1, 'class': d2,
                      'order': d3, 'family': d4, 'genus': d5,
                      'species': d6,
                      })
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment