Created
April 13, 2016 20:09
-
-
Save wasade/9704c2714ec9daa9fc78f43f283d00a6 to your computer and use it in GitHub Desktop.
AG samples not in "clean" metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import biom" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" % Total % Received % Xferd Average Speed Time Time Time Current\n", | |
" Dload Upload Total Spent Left Speed\n", | |
"100 38.6M 100 38.6M 0 0 656k 0 0:01:00 0:01:00 --:--:-- 962k\n", | |
" % Total % Received % Xferd Average Speed Time Time Time Current\n", | |
" Dload Upload Total Spent Left Speed\n", | |
"100 35.8M 100 35.8M 0 0 946k 0 0:00:38 0:00:38 --:--:-- 910k\n" | |
] | |
} | |
], | |
"source": [ | |
"!curl -O ftp://ftp.microbio.me/AmericanGut/rounds-1-25/01-raw/metadata.txt\n", | |
"!curl -O ftp://ftp.microbio.me/AmericanGut/rounds-1-25/04-meta/ag-cleaned.txt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Users/daniel/miniconda3/envs/qiime191/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2902: DtypeWarning: Columns (70) have mixed types. Specify dtype option on import or set low_memory=False.\n", | |
" interactivity=interactivity, compiler=compiler, result=result)\n" | |
] | |
} | |
], | |
"source": [ | |
"raw = pd.read_csv('metadata.txt', sep='\\t', dtype={'#SampleID': str}).set_index('#SampleID')\n", | |
"cln = pd.read_csv('ag-cleaned.txt', sep='\\t', dtype={'#SampleID': str}).set_index('#SampleID')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# from https://github.com/biocore/American-Gut/blob/master/americangut/util.py#L432\n", | |
"simple_matter_map = {\n", | |
" 'feces': 'FECAL',\n", | |
" 'sebum': 'SKIN',\n", | |
" 'tongue': 'ORAL',\n", | |
" 'skin': 'SKIN',\n", | |
" 'mouth': 'ORAL',\n", | |
" 'gingiva': 'ORAL',\n", | |
" 'gingival epithelium': 'ORAL',\n", | |
" 'nares': 'SKIN',\n", | |
" 'skin of hand': 'SKIN',\n", | |
" 'hand': 'SKIN',\n", | |
" 'skin of head': 'SKIN',\n", | |
" 'hand skin': 'SKIN',\n", | |
" 'throat': 'ORAL',\n", | |
" 'auricular region zone of skin': 'SKIN',\n", | |
" 'mucosa of tongue': 'ORAL',\n", | |
" 'mucosa of vagina': 'SKIN',\n", | |
" 'palatine tonsil': 'ORAL',\n", | |
" 'hard palate': 'ORAL',\n", | |
" 'saliva': 'ORAL',\n", | |
" 'stool': 'FECAL',\n", | |
" 'vagina': 'SKIN',\n", | |
" 'fossa': 'SKIN',\n", | |
" 'buccal mucosa': 'ORAL',\n", | |
" 'vaginal fornix': 'SKIN',\n", | |
" 'hair follicle': 'SKIN',\n", | |
" 'nostril': 'SKIN'\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"9218\n", | |
"8361\n", | |
"8361\n" | |
] | |
} | |
], | |
"source": [ | |
"print(len(raw.index))\n", | |
"print(len(cln.index))\n", | |
"print(len(set(raw.index) & set(cln.index)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"missing = raw.loc[set(raw.index) - set(cln.index)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"857" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(missing.index)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Counter({'Unknown': 833, 'UBERON:vaginal introitus': 19, 'UBERON:hair': 5})\n", | |
"Counter({'ENVO:sterile water': 833, 'ENVO:mucus': 19, 'ENVO:sebum': 5})\n" | |
] | |
} | |
], | |
"source": [ | |
"from collections import Counter\n", | |
"print(Counter(missing.BODY_SITE))\n", | |
"print(Counter(missing.ENV_MATTER))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment