Created
February 9, 2016 18:07
-
-
Save jbarnoud/37a524330f29b5b7b096 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from __future__ import print_function, division\n", | |
"import gzip\n", | |
"import os\n", | |
"import collections" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## What is in the PDB?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"Atom = collections.namedtuple('Atom', 'resname name')\n", | |
"seen = collections.defaultdict(int)\n", | |
"seen_per_file = collections.defaultdict(int)\n", | |
"nfiles = 0\n", | |
"for root, dirs, files in os.walk('/coarse/jon/pdb'):\n", | |
" for entgz in (f for f in files if f.endswith('.ent.gz')):\n", | |
" nfiles += 1\n", | |
" seen_here = set()\n", | |
" path = os.path.join(root, entgz)\n", | |
" with gzip.open(path) as infile:\n", | |
" for line in infile:\n", | |
" record_type = line[:6]\n", | |
" if record_type in ('ATOM ', 'HETATM'):\n", | |
" atom_name = line[12:16]\n", | |
" res_name = line[17:20]\n", | |
" seen[Atom(res_name, atom_name)] += 1\n", | |
" seen_here.add(Atom(res_name, atom_name))\n", | |
" for pair in seen_here:\n", | |
" seen_per_file[pair] += 1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of files: 114344\n", | |
"Number of pairs: 535430\n", | |
"Number of records: 904561253\n" | |
] | |
} | |
], | |
"source": [ | |
"print('Number of files:', nfiles)\n", | |
"print('Number of pairs:', len(seen))\n", | |
"print('Number of records:', sum(seen.itervalues()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"nfiles_threshold = 5\n", | |
"left_flush_all = {key: value for key, value in seen_per_file.items() if len(key[1].strip()) < 4 and key[1][0] != ' ' }\n", | |
"left_flush = {key: value for key, value in left_flush_all.items() if value > nfiles_threshold}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of problematic pairs: 3499 (0.65349345386%)\n", | |
"Number of problematic pairs in more than 5 files: 264\n", | |
"Number of problematic records: 78894\n" | |
] | |
} | |
], | |
"source": [ | |
"print('Number of problematic pairs:', len(left_flush_all),\n", | |
" '({}%)'.format(100 * len(left_flush_all) / len(seen)))\n", | |
"print('Number of problematic pairs in more than {} files:'.format(nfiles_threshold),\n", | |
" len(left_flush))\n", | |
"print('Number of problematic records:', sum(left_flush_all.itervalues()))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## How to guess the correct atom name?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"as_parsed = {atom: Atom(atom.resname.strip(), atom.name.strip()) for atom in seen}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"Validation = collections.namedtuple('Validation', ('false_negative', 'false_positive'))\n", | |
"\n", | |
"\n", | |
"def validate_guesser(guesser, atoms):\n", | |
" false_positive = []\n", | |
" false_negative = []\n", | |
" for expected, read in atoms.iteritems():\n", | |
" guessed_name = guesser(read)\n", | |
" if guessed_name != expected.name:\n", | |
" if expected.name[0] == ' ':\n", | |
" false_negative.append((expected, guessed_name, read))\n", | |
" else:\n", | |
" false_positive.append((expected, guessed_name, read))\n", | |
" return Validation(false_negative, false_positive)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def guess_atom_name(atom):\n", | |
" ions = ('FE', 'AS', 'ZN', 'MG', 'MN', 'CO', 'BR',\n", | |
" 'CU', 'TA', 'MO', 'AL', 'BE', 'SE', 'PT',\n", | |
" 'EU', 'NI', 'IR', 'RH', 'AU', 'GD', 'RU')\n", | |
" special_hg = ('CMH', 'EMC', 'MBO', 'MMC', 'HGB', 'BE7', 'PMB')\n", | |
" special_cl = ('0QE', 'CPT', 'DCE', 'EAA', 'IMN', 'OCZ', 'OMY', 'OMZ',\n", | |
" 'UN9', '1N1', '2T8', '393', '3MY', 'BMU', 'CLM', 'CP6',\n", | |
" 'DB8', 'DIF', 'EFZ', 'LUR', 'RDC', 'UCL', 'XMM', 'HLT',\n", | |
" 'IRE', 'LCP', 'PCI', 'VGH')\n", | |
" include_pairs = (Atom('OEC', 'CA1'), Atom('PLL', 'PD'), Atom('OEX', 'CA1'))\n", | |
" exclude_pairs = (Atom('C14', 'C14'), Atom('C15', 'C15'),\n", | |
" Atom('F9F', 'F9F'), Atom('OAN', 'OAN'),\n", | |
" Atom('BLM', 'NI'), Atom('BZG', 'CO'), Atom('BZG', 'NI'),\n", | |
" Atom('VNL', 'CO1'), Atom('VNL', 'CO2'),\n", | |
" Atom('PF5', 'FE1'), Atom('PF5', 'FE2'),\n", | |
" Atom('UNL', 'UNL'))\n", | |
" if len(atom.name) >= 4:\n", | |
" return atom.name[:4]\n", | |
" if len(atom.name) == 1:\n", | |
" return ' {} '.format(atom.name)\n", | |
" if ((atom.resname == atom.name\n", | |
" or atom.name[:2] in ions\n", | |
" or atom.name == 'UNK'\n", | |
" or (atom.resname in special_hg and atom.name[:2] == 'HG')\n", | |
" or (atom.resname in special_cl and atom.name[:2] == 'CL')\n", | |
" or Atom(atom.resname, atom.name) in include_pairs)\n", | |
" and Atom(atom.resname, atom.name) not in exclude_pairs):\n", | |
" return '{:<4}'.format(atom.name)\n", | |
" return ' {:<3}'.format(atom.name)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Here we run the validation for the `guess_atom_name` function defined above. The function returns a list of false positives (atom names we guessed were preceded by a space, but are not), and a list of false negative (atom names we guessed were not preceded by a space, but actually are). We run the validation on the whole PDB archive." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"validation = validate_guesser(guess_atom_name, as_parsed)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The guesser generates about 2000 false positives. These error would also be done with the current way MDAnalysis write PDB files as it precedes all atom names shorter than 4 characters with a space." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"2225" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(validation.false_positive)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"If we exclude the (residue name, atom name) pairs that appear in 5 files or less over the whole PDB, then there is no false positive left." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[]" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"[val for val in validation.false_positive if seen_per_file[val[0]] > nfiles_threshold]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The guesser generates 117 false negatives. The current version of MDAnalysis writes these pairs correctly." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"117" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(validation.false_negative)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"However, these 116 of these pairs appear each in 5 files or less over the whole PDB. The only pair that is more frequent is the ('UNX', 'UNK') pair that appears in 75 files over the PDB archive." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[((Atom(resname='UNX', name=' UNK'), 'UNK ', Atom(resname='UNX', name='UNK')),\n", | |
" 75)]" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"[(val, seen_per_file[val[0]]) for val in validation.false_negative if seen_per_file[val[0]] > nfiles_threshold]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"However, this ('UNX', 'UNK') pair appears without the initial space in 424 files (about 5 times more than the version with the initial space)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"424" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"seen_per_file[Atom('UNX', 'UNK ')]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Interesting. Looks like a basis for a decent atom type guesser.