Created
July 18, 2013 06:04
-
-
Save ccwang002/6027023 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "Analyze_3UTR_entries" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "## This is Header ##\nThis is description" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "%%bash\nls *.csv", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "hg19_3UTR_raw.csv\nhg19_3UTR_refgene.csv\nhg19_3UTR_UCSC_filtered_201307.csv\nonly_r_NM_transcripts.csv\nucsc_gene_table.csv\nucsc_names.csv\n" | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "%%bash\nhead ucsc_gene_table.csv", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\"REFSEQ\",\"SYMBOL\",\"ENTREZID\"\n\"NM_032291\",\"SGIP1\",\"84251\"\n\"NM_032785\",\"AGBL4\",\"84871\"\n\"NM_018090\",\"NECAP2\",\"55707\"\n\"NM_052998\",\"ADC\",\"113451\"\n\"NM_001145278\",\"NECAP2\",\"55707\"\n\"NM_001080397\",\"SLC45A1\",\"50651\"\n\"NM_013943\",\"CLIC4\",\"25932\"\n\"NM_001145277\",\"NECAP2\",\"55707\"\n\"NM_001195684\",\"TGFBR3\",\"7049\"\n" | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from collections import namedtuple\nGene = namedtuple(\"Gene\", [\"refseq\", \"symbol\", \"id\"])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "Gene._fields", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": "('refseq', 'symbol', 'id')" | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "gene_list = []", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import csv\nwith open('ucsc_gene_table.csv') as csv_f:\n reader = csv.reader(csv_f)\n next(reader) # skip the csv header\n for record in reader:\n gene_list.append(Gene(*record)) # unpack the readed list in order", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 20 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "len(gene_list)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 21, | |
"text": "40046" | |
} | |
], | |
"prompt_number": 21 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from collections import Counter", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 22 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "symbol_counter = Counter()\nfor gene in gene_list:\n # we put the gene symbol only in to counter\n # put the symbol in list to keep it from upacking\n symbol_counter.update([gene.symbol])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 26 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "symbol_counter.most_common(10)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 28, | |
"text": "[('NA', 1302),\n ('UTY', 77),\n ('MOG', 72),\n ('AGER', 70),\n ('NRM', 56),\n ('LST1', 48),\n ('BAG6', 42),\n ('C6orf25', 42),\n ('GPANK1', 35),\n ('LOC100507547', 32)]" | |
} | |
], | |
"prompt_number": 28 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "tx_counter = Counter()\nfor tx in gene_list:\n # this time count by Refseq ID\n tx_counter.update([tx.refseq])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 29 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "tx_counter.most_common(10)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 30, | |
"text": "[('NM_001127389', 19),\n ('NM_001127388', 19),\n ('NM_001127386', 19),\n ('NM_001164467', 19),\n ('NM_033178', 19),\n ('NM_001278056', 13),\n ('NM_001177376', 12),\n ('NM_001128590', 10),\n ('NM_032454', 10),\n ('NM_032470', 10)]" | |
} | |
], | |
"prompt_number": 30 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# store the top frequent transcripts\nfreq_tx_list = [tup[0] for tup in tx_counter.most_common(10)]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 32 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "freq_tx_list", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 33, | |
"text": "['NM_001127389',\n 'NM_001127388',\n 'NM_001127386',\n 'NM_001164467',\n 'NM_033178',\n 'NM_001278056',\n 'NM_001177376',\n 'NM_001128590',\n 'NM_032454',\n 'NM_032470']" | |
} | |
], | |
"prompt_number": 33 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# obtain back the full gene entry by creating a set,\n# which drops the duplicated entries automatically\n{gene for gene in gene_list if gene.refseq in freq_tx_list}", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 34, | |
"text": "set([Gene(refseq='NM_001127389', symbol='DUX4L5', id='653545'),\n Gene(refseq='NM_001164467', symbol='DUX4L3', id='653548'),\n Gene(refseq='NM_001177376', symbol='DUX4L4', id='441056'),\n Gene(refseq='NM_032454', symbol='STK19', id='8859'),\n Gene(refseq='NM_001278056', symbol='NA', id='NA'),\n Gene(refseq='NM_032470', symbol='TNXB', id='7148'),\n Gene(refseq='NM_001128590', symbol='CYP21A2', id='1589'),\n Gene(refseq='NM_001127386', symbol='DUX4L2', id='728410'),\n Gene(refseq='NM_001127388', symbol='DUX4L6', id='653544'),\n Gene(refseq='NM_033178', symbol='DUX4', id='22947')])" | |
} | |
], | |
"prompt_number": 34 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment