Skip to content

Instantly share code, notes, and snippets.

@gajomi
Created September 4, 2018 20:00
Show Gist options
  • Save gajomi/2ee5bcc1d9c01b5bb2657a01a15dc932 to your computer and use it in GitHub Desktop.
Save gajomi/2ee5bcc1d9c01b5bb2657a01a15dc932 to your computer and use it in GitHub Desktop.
Parse bionumbers DB dump into pandas dataframe with units
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from lxml import etree\n",
"import pandas as pd\n",
"import re\n",
"import pint"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Read the whole BioNumbers dump into memory as text.\n",
"# NOTE(review): no encoding is passed, so the platform default is used --\n",
"# confirm it matches the encoding of the dump file.\n",
"with open('BioNumbers.xls') as dumpfile:\n",
"    rawstr = dumpfile.read()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Parse the dump as HTML and take the table found at body/table: its first\n",
"# row supplies the column names, every later row becomes one record.\n",
"table = etree.HTML(rawstr).find(\"body/table\")\n",
"rows = iter(table)\n",
"headers = [col.text for col in next(rows)]\n",
"data = []\n",
"for row in rows:\n",
"    # lxml gives None as the text of an empty cell; substitute '' so the\n",
"    # str methods applied below do not raise TypeError.\n",
"    values = [col.text or '' for col in row]\n",
"    data.append(dict(zip(headers, values)))\n",
"# zip() stops at the shorter sequence, so a short row leaves keys missing\n",
"# and pandas fills them with NaN; turn those into '' before the str methods.\n",
"bionumbers = pd.DataFrame(data).fillna('').applymap(str.lower)\n",
"\n",
"bionumbers = bionumbers.applymap(str.strip)\n",
"# Normalise the unit strings. regex is spelled out on every call because\n",
"# the pandas default changed from True to False in pandas 2.0; without\n",
"# regex=True the micro-sign alternation would be matched literally.\n",
"bionumbers['Units'] = (\n",
"    bionumbers['Units']\n",
"    .str.replace('n/a', '', regex=False)\n",
"    .str.replace(' per ', '/', regex=False)\n",
"    .str.replace('µ|μ', 'u', regex=True)\n",
"    .str.replace('%', 'percent', regex=False)\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Unit tokens that mark a unit string as time-like (whole-token matches only).\n",
"timeunits = ['sec','min','hour','day','cycle','hz','hertz','flux']\n",
"def istimelikeunit(s):\n",
"    '''Return True if any token of the unit string s is listed in timeunits.\n",
"\n",
"    s is split on runs of non-word characters and each token is lower-cased\n",
"    before the lookup, so 'um^2/sec' matches through its 'sec' token while\n",
"    'seconds' does not match.\n",
"    '''\n",
"    # Raw string: '\\W' in a plain string literal is an invalid escape\n",
"    # sequence, which Python warns about.\n",
"    return any(token.lower() in timeunits for token in re.split(r'\\W+', s))\n",
"\n",
"# Keywords that make ishumanlike report a match (whole-token matches only).\n",
"humankeywords = ['human','homo','sapiens','hela']\n",
"\n",
"def ishumanlike(s):\n",
"    '''Return True if any token of s is listed in humankeywords.\n",
"\n",
"    s is split on runs of non-word characters and each token is lower-cased\n",
"    before the lookup.\n",
"    '''\n",
"    # Raw string: '\\W' in a plain string literal is an invalid escape\n",
"    # sequence, which Python warns about.\n",
"    return any(token.lower() in humankeywords for token in re.split(r'\\W+', s))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"ureg = pint.UnitRegistry()\n",
"# Extra unit definitions, in pint definition syntax, registered on ureg below.\n",
"# Keep a comma after every entry: Python concatenates adjacent string\n",
"# literals, which would fuse two definitions into one malformed string.\n",
"# NOTE(review): aliases containing a space ('base pair', 'base pairs') are\n",
"# probably not usable as unit names by pint -- confirm.\n",
"bionumbersunits = [\n",
"    'copy = [] = copies',\n",
"    'fraction = [] = frac',\n",
"    'percent = 1e-2 frac = pct = %',\n",
"    'unitless = dimensionless',\n",
"    'molecule = [] = molecules',\n",
"    'gene = []',\n",
"    'mutation = [] = mutations',\n",
"    'nucleotide = []',\n",
"    'chromosome = []',\n",
"    'ribosome = []',\n",
"    'protein = []',\n",
"    'polypeptide = []',\n",
"    'site = []',\n",
"    'basepair = [] = basepairs = bp = base pair = base pairs',\n",
"    'generation = []',\n",
"    'cell = [] = cells',\n",
"]\n",
"for unit in bionumbersunits:\n",
"    ureg.define(unit)\n",
" \n",
"def tryunitparse(s):\n",
"    '''Parse s with ureg.parse_units; return s unchanged if parsing fails.'''\n",
"    try:\n",
"        return ureg.parse_units(s)\n",
"    except Exception:\n",
"        # Deliberately best-effort, but Exception (not a bare except) lets\n",
"        # KeyboardInterrupt and SystemExit propagate.\n",
"        return s\n",
" \n",
"# Rows whose Organism text contains one of the human keywords.\n",
"humanbionumbers = bionumbers[\n",
"    bionumbers['Organism'].apply(ishumanlike)\n",
"]\n",
"# Of those, rows whose Units contain a time-like token.\n",
"timelikehumanbionumbers = humanbionumbers[\n",
"    humanbionumbers['Units'].apply(istimelikeunit)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): the same two statements also end the unit-registry cell\n",
"# above; one of the two copies can likely be removed.\n",
"# Select human rows by Organism, then keep those with time-like Units.\n",
"humanbionumbers = bionumbers[\n",
"    bionumbers['Organism'].apply(ishumanlike)\n",
"]\n",
"timelikehumanbionumbers = humanbionumbers[\n",
"    humanbionumbers['Units'].apply(istimelikeunit)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Comments</th>\n",
" <th>Entered By</th>\n",
" <th>Keywords</th>\n",
" <th>Measurement Method</th>\n",
" <th>Organism</th>\n",
" <th>Primary Source</th>\n",
" <th>Primary Source PubMed ID</th>\n",
" <th>Properties</th>\n",
" <th>Range</th>\n",
" <th>Reference</th>\n",
" <th>Reference PubMed ID</th>\n",
" <th>Units</th>\n",
" <th>Value</th>\n",
" <th>bion_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>354</th>\n",
" <td>p.483 sidebar 12.4 1st paragraph: \"oxidation p...</td>\n",
" <td>paul jorgensen</td>\n",
" <td>oxidative hit</td>\n",
" <td></td>\n",
" <td>human homo sapiens</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>number of dna damage events caused by oxidativ...</td>\n",
" <td>'≤1000</td>\n",
" <td>r. weinberg, the biology of cancer, garland sc...</td>\n",
" <td></td>\n",
" <td>bases damaged/cell/day</td>\n",
" <td></td>\n",
" <td>100383</td>\n",
" </tr>\n",
" <tr>\n",
" <th>355</th>\n",
" <td>p.168 left column 4th paragraph: \"the mean val...</td>\n",
" <td>ron milo - admin</td>\n",
" <td>bence, biophysical parameters, diffusion, diff...</td>\n",
" <td>translational diffusion coefficient at 20degc ...</td>\n",
" <td>human homo sapiens</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>diffusion rate of bence jones rei</td>\n",
" <td>'table link - http://tinyurl.com/y76wo8ld</td>\n",
" <td>squire pg, himmel me. hydrodynamics and protei...</td>\n",
" <td>507801</td>\n",
" <td>um^2/sec</td>\n",
" <td>100</td>\n",
" <td>100384</td>\n",
" </tr>\n",
" <tr>\n",
" <th>436</th>\n",
" <td>p.168 left column 4th paragraph: \"the mean val...</td>\n",
" <td>ron milo - admin</td>\n",
" <td>diffusion coefficient, carbonate dehydratase</td>\n",
" <td>translational diffusion coefficient at 20degc ...</td>\n",
" <td>human homo sapiens</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>diffusion rate of carbonic anhydrase</td>\n",
" <td>'table link - http://tinyurl.com/y76wo8ld</td>\n",
" <td>squire pg, himmel me. hydrodynamics and protei...</td>\n",
" <td>507801</td>\n",
" <td>um^2/sec</td>\n",
" <td>107</td>\n",
" <td>100469</td>\n",
" </tr>\n",
" <tr>\n",
" <th>659</th>\n",
" <td>many useful brain facts and references at the ...</td>\n",
" <td>daniel ramot</td>\n",
" <td>brain&lt;img src=\"/wf_sql_xsrf.html\"&gt;&lt;img src=\"/w...</td>\n",
" <td></td>\n",
" <td>human homo sapiens</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>time until unconsciousness after loss of blood...</td>\n",
" <td>'8-10</td>\n",
" <td>http://faculty.washington.edu/chudler/facts.html</td>\n",
" <td></td>\n",
" <td>sec</td>\n",
" <td></td>\n",
" <td>100694</td>\n",
" </tr>\n",
" <tr>\n",
" <th>660</th>\n",
" <td>many useful brain facts and references at the ...</td>\n",
" <td>daniel ramot</td>\n",
" <td>brain&lt;img src=\"/wf_sql_xsrf.html\"&gt;&lt;img src=\"/w...</td>\n",
" <td></td>\n",
" <td>human homo sapiens</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>time until reflex loss after loss of blood sup...</td>\n",
" <td>'40-110</td>\n",
" <td>http://faculty.washington.edu/chudler/facts.html</td>\n",
" <td></td>\n",
" <td>sec</td>\n",
" <td></td>\n",
" <td>100695</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Comments Entered By \\\n",
"354 p.483 sidebar 12.4 1st paragraph: \"oxidation p... paul jorgensen \n",
"355 p.168 left column 4th paragraph: \"the mean val... ron milo - admin \n",
"436 p.168 left column 4th paragraph: \"the mean val... ron milo - admin \n",
"659 many useful brain facts and references at the ... daniel ramot \n",
"660 many useful brain facts and references at the ... daniel ramot \n",
"\n",
" Keywords \\\n",
"354 oxidative hit \n",
"355 bence, biophysical parameters, diffusion, diff... \n",
"436 diffusion coefficient, carbonate dehydratase \n",
"659 brain<img src=\"/wf_sql_xsrf.html\"><img src=\"/w... \n",
"660 brain<img src=\"/wf_sql_xsrf.html\"><img src=\"/w... \n",
"\n",
" Measurement Method Organism \\\n",
"354 human homo sapiens \n",
"355 translational diffusion coefficient at 20degc ... human homo sapiens \n",
"436 translational diffusion coefficient at 20degc ... human homo sapiens \n",
"659 human homo sapiens \n",
"660 human homo sapiens \n",
"\n",
" Primary Source Primary Source PubMed ID \\\n",
"354 \n",
"355 \n",
"436 \n",
"659 \n",
"660 \n",
"\n",
" Properties \\\n",
"354 number of dna damage events caused by oxidativ... \n",
"355 diffusion rate of bence jones rei \n",
"436 diffusion rate of carbonic anhydrase \n",
"659 time until unconsciousness after loss of blood... \n",
"660 time until reflex loss after loss of blood sup... \n",
"\n",
" Range \\\n",
"354 '≤1000 \n",
"355 'table link - http://tinyurl.com/y76wo8ld \n",
"436 'table link - http://tinyurl.com/y76wo8ld \n",
"659 '8-10 \n",
"660 '40-110 \n",
"\n",
" Reference Reference PubMed ID \\\n",
"354 r. weinberg, the biology of cancer, garland sc... \n",
"355 squire pg, himmel me. hydrodynamics and protei... 507801 \n",
"436 squire pg, himmel me. hydrodynamics and protei... 507801 \n",
"659 http://faculty.washington.edu/chudler/facts.html \n",
"660 http://faculty.washington.edu/chudler/facts.html \n",
"\n",
" Units Value bion_id \n",
"354 bases damaged/cell/day 100383 \n",
"355 um^2/sec 100 100384 \n",
"436 um^2/sec 107 100469 \n",
"659 sec 100694 \n",
"660 sec 100695 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the human, time-like subset.\n",
"timelikehumanbionumbers.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment