Created
March 16, 2016 20:27
-
-
Save ahwagner/a47419b6c5e96ecb301c to your computer and use it in GitHub Desktop.
MLL Status.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "import requests\nimport bs4\nimport csv", | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "url = 'http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE11877'\nres = requests.get(url)\nres.raise_for_status()", | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "soup = bs4.BeautifulSoup(res.text, \"html.parser\")\nelement = soup.find(text='Samples (207)')", | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "rows = element.parent.findNextSibling().findAll('td')", | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "with open('mll_status.tsv', 'w') as f:\n csv_writer = csv.writer(f, delimiter='\\t')\n i = 0\n for row in rows:\n try:\n url = 'http://www.ncbi.nlm.nih.gov' + row.find('a').attrs['href']\n except AttributeError:\n continue\n i += 1\n res = requests.get(url)\n res.raise_for_status()\n rowSoup = bs4.BeautifulSoup(res.text, \"html.parser\")\n characteristics = [x.next for x in rowSoup.find(text='Characteristics').parent.findNextSibling().findAll('br')]\n patient_id = rowSoup.find(text='Title').parent.findNextSibling().text\n mll_status = None\n for characteristic in characteristics:\n trait, status = characteristic.split(' : ', 1)\n if trait == 'MLL':\n mll_status = status\n break\n if mll_status is None:\n raise ValueError('MLL status not found in {0}!'.format(row.text))\n else:\n csv_writer.writerow((patient_id, mll_status))\n print('{0} samples processed'.format(i))", | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"text": "207 samples processed\n", | |
"output_type": "stream", | |
"name": "stdout" | |
} | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"mimetype": "text/x-python", | |
"nbconvert_exporter": "python", | |
"codemirror_mode": { | |
"version": 3, | |
"name": "ipython" | |
}, | |
"file_extension": ".py", | |
"name": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "MLL Status.ipynb", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment