Skip to content

Instantly share code, notes, and snippets.

@ccwang002
Created July 18, 2013 06:04
Show Gist options
  • Save ccwang002/6027023 to your computer and use it in GitHub Desktop.
Save ccwang002/6027023 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "Analyze_3UTR_entries"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
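"source": "## Analyze 3'UTR entries ##\nCount how often each gene symbol and each RefSeq transcript ID occurs in `ucsc_gene_table.csv`, then look up the full records of the most frequent transcripts."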
},
{
"cell_type": "code",
"collapsed": false,
"input": "%%bash\nls *.csv",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "hg19_3UTR_raw.csv\nhg19_3UTR_refgene.csv\nhg19_3UTR_UCSC_filtered_201307.csv\nonly_r_NM_transcripts.csv\nucsc_gene_table.csv\nucsc_names.csv\n"
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": "%%bash\nhead ucsc_gene_table.csv",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "\"REFSEQ\",\"SYMBOL\",\"ENTREZID\"\n\"NM_032291\",\"SGIP1\",\"84251\"\n\"NM_032785\",\"AGBL4\",\"84871\"\n\"NM_018090\",\"NECAP2\",\"55707\"\n\"NM_052998\",\"ADC\",\"113451\"\n\"NM_001145278\",\"NECAP2\",\"55707\"\n\"NM_001080397\",\"SLC45A1\",\"50651\"\n\"NM_013943\",\"CLIC4\",\"25932\"\n\"NM_001145277\",\"NECAP2\",\"55707\"\n\"NM_001195684\",\"TGFBR3\",\"7049\"\n"
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": "from collections import namedtuple\n\n# One record per row of ucsc_gene_table.csv; the fields follow the CSV\n# column order REFSEQ, SYMBOL, ENTREZID.\nGene = namedtuple(\"Gene\", \"refseq symbol id\")",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": "Gene._fields",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 11,
"text": "('refseq', 'symbol', 'id')"
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": "gene_list = []",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": "import csv\n\n# Build gene_list from scratch inside this cell so that running the cell\n# again cannot append the same records a second time and inflate the counts.\nwith open('ucsc_gene_table.csv') as csv_f:\n    reader = csv.reader(csv_f)\n    next(reader)  # skip the CSV header row\n    # unpack each row (REFSEQ, SYMBOL, ENTREZID) into a Gene, in column order\n    gene_list = [Gene(*record) for record in reader]",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": "len(gene_list)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 21,
"text": "40046"
}
],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": "from collections import Counter",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 22
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Tally how many records carry each gene symbol.\n# The generator yields whole symbol strings, so each symbol is counted as\n# one item rather than being split into characters.\nsymbol_counter = Counter(gene.symbol for gene in gene_list)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 26
},
{
"cell_type": "code",
"collapsed": false,
"input": "symbol_counter.most_common(10)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 28,
"text": "[('NA', 1302),\n ('UTY', 77),\n ('MOG', 72),\n ('AGER', 70),\n ('NRM', 56),\n ('LST1', 48),\n ('BAG6', 42),\n ('C6orf25', 42),\n ('GPANK1', 35),\n ('LOC100507547', 32)]"
}
],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Same tally as for the symbols, keyed by RefSeq ID this time.\ntx_counter = Counter(tx.refseq for tx in gene_list)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 29
},
{
"cell_type": "code",
"collapsed": false,
"input": "tx_counter.most_common(10)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 30,
"text": "[('NM_001127389', 19),\n ('NM_001127388', 19),\n ('NM_001127386', 19),\n ('NM_001164467', 19),\n ('NM_033178', 19),\n ('NM_001278056', 13),\n ('NM_001177376', 12),\n ('NM_001128590', 10),\n ('NM_032454', 10),\n ('NM_032470', 10)]"
}
],
"prompt_number": 30
},
{
"cell_type": "code",
"collapsed": false,
"input": "# RefSeq IDs of the ten most frequent transcripts, most frequent first\nfreq_tx_list = [tx_id for tx_id, tx_count in tx_counter.most_common(10)]",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 32
},
{
"cell_type": "code",
"collapsed": false,
"input": "freq_tx_list",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 33,
"text": "['NM_001127389',\n 'NM_001127388',\n 'NM_001127386',\n 'NM_001164467',\n 'NM_033178',\n 'NM_001278056',\n 'NM_001177376',\n 'NM_001128590',\n 'NM_032454',\n 'NM_032470']"
}
],
"prompt_number": 33
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Look up the complete Gene record behind each frequent transcript ID.\n# Collecting the records in a set keeps identical records only once.\nset(entry for entry in gene_list if entry.refseq in freq_tx_list)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 34,
"text": "set([Gene(refseq='NM_001127389', symbol='DUX4L5', id='653545'),\n Gene(refseq='NM_001164467', symbol='DUX4L3', id='653548'),\n Gene(refseq='NM_001177376', symbol='DUX4L4', id='441056'),\n Gene(refseq='NM_032454', symbol='STK19', id='8859'),\n Gene(refseq='NM_001278056', symbol='NA', id='NA'),\n Gene(refseq='NM_032470', symbol='TNXB', id='7148'),\n Gene(refseq='NM_001128590', symbol='CYP21A2', id='1589'),\n Gene(refseq='NM_001127386', symbol='DUX4L2', id='728410'),\n Gene(refseq='NM_001127388', symbol='DUX4L6', id='653544'),\n Gene(refseq='NM_033178', symbol='DUX4', id='22947')])"
}
],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment