jbarnoud/PDB atom names.ipynb

## PDB atom names.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function, division\n",
    "import gzip\n",
    "import os\n",
    "import collections"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## What is in the PDB?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "Atom = collections.namedtuple('Atom', 'resname name')\n",
    "seen = collections.defaultdict(int)\n",
    "seen_per_file = collections.defaultdict(int)\n",
    "nfiles = 0\n",
    "for root, dirs, files in os.walk('/coarse/jon/pdb'):\n",
    "    for entgz in (f for f in files if f.endswith('.ent.gz')):\n",
    "        nfiles += 1\n",
    "        seen_here = set()\n",
    "        path = os.path.join(root, entgz)\n",
    "        with gzip.open(path) as infile:\n",
    "            for line in infile:\n",
    "                record_type = line[:6]\n",
    "                if record_type in ('ATOM  ', 'HETATM'):\n",
    "                    atom_name = line[12:16]\n",
    "                    res_name = line[17:20]\n",
    "                    seen[Atom(res_name, atom_name)] += 1\n",
    "                    seen_here.add(Atom(res_name, atom_name))\n",
    "        for pair in seen_here:\n",
    "            seen_per_file[pair] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of files: 114344\n",
      "Number of pairs: 535430\n",
      "Number of records: 904561253\n"
     ]
    }
   ],
   "source": [
    "print('Number of files:', nfiles)\n",
    "print('Number of pairs:', len(seen))\n",
    "print('Number of records:', sum(seen.itervalues()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "nfiles_threshold = 5\n",
    "left_flush_all = {key: value for key, value in seen_per_file.items() if len(key[1].strip()) < 4 and key[1][0] != ' ' }\n",
    "left_flush = {key: value for key, value in left_flush_all.items() if value > nfiles_threshold}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of problematic pairs: 3499 (0.65349345386%)\n",
      "Number of problematic pairs in more than 5 files: 264\n",
      "Number of problematic records: 78894\n"
     ]
    }
   ],
   "source": [
    "print('Number of problematic pairs:', len(left_flush_all),\n",
    "      '({}%)'.format(100 * len(left_flush_all) / len(seen)))\n",
    "print('Number of problematic pairs in more than {} files:'.format(nfiles_threshold),\n",
    "      len(left_flush))\n",
    "print('Number of problematic records:', sum(left_flush_all.itervalues()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## How to guess the correct atom name?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "as_parsed = {atom: Atom(atom.resname.strip(), atom.name.strip()) for atom in seen}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Validation = collections.namedtuple('Validation', ('false_negative', 'false_positive'))\n",
    "\n",
    "\n",
    "def validate_guesser(guesser, atoms):\n",
    "    false_positive = []\n",
    "    false_negative = []\n",
    "    for expected, read in atoms.iteritems():\n",
    "        guessed_name = guesser(read)\n",
    "        if guessed_name != expected.name:\n",
    "            if expected.name[0] == ' ':\n",
    "                false_negative.append((expected, guessed_name, read))\n",
    "            else:\n",
    "                false_positive.append((expected, guessed_name, read))\n",
    "    return Validation(false_negative, false_positive)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def guess_atom_name(atom):\n",
    "    ions = ('FE', 'AS', 'ZN', 'MG', 'MN', 'CO', 'BR',\n",
    "            'CU', 'TA', 'MO', 'AL', 'BE', 'SE', 'PT',\n",
    "            'EU', 'NI', 'IR', 'RH', 'AU', 'GD', 'RU')\n",
    "    special_hg = ('CMH', 'EMC', 'MBO', 'MMC', 'HGB', 'BE7', 'PMB')\n",
    "    special_cl = ('0QE', 'CPT', 'DCE', 'EAA', 'IMN', 'OCZ', 'OMY', 'OMZ',\n",
    "                  'UN9', '1N1', '2T8', '393', '3MY', 'BMU', 'CLM', 'CP6',\n",
    "                  'DB8', 'DIF', 'EFZ', 'LUR', 'RDC', 'UCL', 'XMM', 'HLT',\n",
    "                  'IRE', 'LCP', 'PCI', 'VGH')\n",
    "    include_pairs = (Atom('OEC', 'CA1'), Atom('PLL', 'PD'), Atom('OEX', 'CA1'))\n",
    "    exclude_pairs = (Atom('C14', 'C14'), Atom('C15', 'C15'),\n",
    "                     Atom('F9F', 'F9F'), Atom('OAN', 'OAN'),\n",
    "                     Atom('BLM', 'NI'), Atom('BZG', 'CO'), Atom('BZG', 'NI'),\n",
    "                     Atom('VNL', 'CO1'), Atom('VNL', 'CO2'),\n",
    "                     Atom('PF5', 'FE1'), Atom('PF5', 'FE2'),\n",
    "                     Atom('UNL', 'UNL'))\n",
    "    if len(atom.name) >= 4:\n",
    "        return atom.name[:4]\n",
    "    if len(atom.name) == 1:\n",
    "        return ' {}  '.format(atom.name)\n",
    "    if ((atom.resname == atom.name\n",
    "             or atom.name[:2] in ions\n",
    "             or atom.name == 'UNK'\n",
    "             or (atom.resname in special_hg and atom.name[:2] == 'HG')\n",
    "             or (atom.resname in special_cl and atom.name[:2] == 'CL')\n",
    "             or Atom(atom.resname, atom.name) in include_pairs)\n",
    "            and Atom(atom.resname, atom.name) not in exclude_pairs):\n",
    "        return '{:<4}'.format(atom.name)\n",
    "    return ' {:<3}'.format(atom.name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we run the validation for the `guess_atom_name` function defined above. The function returns a list of false positives (atom names we guessed were preceded by a space, but are not), and a list of false negative (atom names we guessed were not preceded by a space, but actually are). We run the validation on the whole PDB archive."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "validation = validate_guesser(guess_atom_name, as_parsed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The guesser generates about 2000 false positives. These error would also be done with the current way MDAnalysis write PDB files as it precedes all atom names shorter than 4 characters with a space."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2225"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(validation.false_positive)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we exclude the (residue name, atom name) pairs that appear in 5 files or less over the whole PDB, then there is no false positive left."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[val for val in validation.false_positive if seen_per_file[val[0]] > nfiles_threshold]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The guesser generates 117 false negatives. The current version of MDAnalysis writes these pairs correctly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "117"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(validation.false_negative)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "However, these 116 of these pairs appear each in 5 files or less over the whole PDB. The only pair that is more frequent is the ('UNX', 'UNK') pair that appears in 75 files over the PDB archive."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[((Atom(resname='UNX', name=' UNK'), 'UNK ', Atom(resname='UNX', name='UNK')),\n",
       "  75)]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[(val, seen_per_file[val[0]]) for val in validation.false_negative if seen_per_file[val[0]] > nfiles_threshold]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "However, this ('UNX', 'UNK') pair appears without the initial space in 424 files (about 5 times more than the version with the initial space)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "424"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "seen_per_file[Atom('UNX', 'UNK ')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from __future__ import print_function, division\n",
	"import gzip\n",
	"import os\n",
	"import collections"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## What is in the PDB?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"Atom = collections.namedtuple('Atom', 'resname name')\n",
	"seen = collections.defaultdict(int)\n",
	"seen_per_file = collections.defaultdict(int)\n",
	"nfiles = 0\n",
	"for root, dirs, files in os.walk('/coarse/jon/pdb'):\n",
	" for entgz in (f for f in files if f.endswith('.ent.gz')):\n",
	" nfiles += 1\n",
	" seen_here = set()\n",
	" path = os.path.join(root, entgz)\n",
	" with gzip.open(path) as infile:\n",
	" for line in infile:\n",
	" record_type = line[:6]\n",
	" if record_type in ('ATOM ', 'HETATM'):\n",
	" atom_name = line[12:16]\n",
	" res_name = line[17:20]\n",
	" seen[Atom(res_name, atom_name)] += 1\n",
	" seen_here.add(Atom(res_name, atom_name))\n",
	" for pair in seen_here:\n",
	" seen_per_file[pair] += 1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Number of files: 114344\n",
	"Number of pairs: 535430\n",
	"Number of records: 904561253\n"
	]
	}
	],
	"source": [
	"print('Number of files:', nfiles)\n",
	"print('Number of pairs:', len(seen))\n",
	"print('Number of records:', sum(seen.itervalues()))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"nfiles_threshold = 5\n",
	"left_flush_all = {key: value for key, value in seen_per_file.items() if len(key[1].strip()) < 4 and key[1][0] != ' ' }\n",
	"left_flush = {key: value for key, value in left_flush_all.items() if value > nfiles_threshold}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Number of problematic pairs: 3499 (0.65349345386%)\n",
	"Number of problematic pairs in more than 5 files: 264\n",
	"Number of problematic records: 78894\n"
	]
	}
	],
	"source": [
	"print('Number of problematic pairs:', len(left_flush_all),\n",
	" '({}%)'.format(100 * len(left_flush_all) / len(seen)))\n",
	"print('Number of problematic pairs in more than {} files:'.format(nfiles_threshold),\n",
	" len(left_flush))\n",
	"print('Number of problematic records:', sum(left_flush_all.itervalues()))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## How to guess the correct atom name?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"as_parsed = {atom: Atom(atom.resname.strip(), atom.name.strip()) for atom in seen}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"Validation = collections.namedtuple('Validation', ('false_negative', 'false_positive'))\n",
	"\n",
	"\n",
	"def validate_guesser(guesser, atoms):\n",
	" false_positive = []\n",
	" false_negative = []\n",
	" for expected, read in atoms.iteritems():\n",
	" guessed_name = guesser(read)\n",
	" if guessed_name != expected.name:\n",
	" if expected.name[0] == ' ':\n",
	" false_negative.append((expected, guessed_name, read))\n",
	" else:\n",
	" false_positive.append((expected, guessed_name, read))\n",
	" return Validation(false_negative, false_positive)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def guess_atom_name(atom):\n",
	" ions = ('FE', 'AS', 'ZN', 'MG', 'MN', 'CO', 'BR',\n",
	" 'CU', 'TA', 'MO', 'AL', 'BE', 'SE', 'PT',\n",
	" 'EU', 'NI', 'IR', 'RH', 'AU', 'GD', 'RU')\n",
	" special_hg = ('CMH', 'EMC', 'MBO', 'MMC', 'HGB', 'BE7', 'PMB')\n",
	" special_cl = ('0QE', 'CPT', 'DCE', 'EAA', 'IMN', 'OCZ', 'OMY', 'OMZ',\n",
	" 'UN9', '1N1', '2T8', '393', '3MY', 'BMU', 'CLM', 'CP6',\n",
	" 'DB8', 'DIF', 'EFZ', 'LUR', 'RDC', 'UCL', 'XMM', 'HLT',\n",
	" 'IRE', 'LCP', 'PCI', 'VGH')\n",
	" include_pairs = (Atom('OEC', 'CA1'), Atom('PLL', 'PD'), Atom('OEX', 'CA1'))\n",
	" exclude_pairs = (Atom('C14', 'C14'), Atom('C15', 'C15'),\n",
	" Atom('F9F', 'F9F'), Atom('OAN', 'OAN'),\n",
	" Atom('BLM', 'NI'), Atom('BZG', 'CO'), Atom('BZG', 'NI'),\n",
	" Atom('VNL', 'CO1'), Atom('VNL', 'CO2'),\n",
	" Atom('PF5', 'FE1'), Atom('PF5', 'FE2'),\n",
	" Atom('UNL', 'UNL'))\n",
	" if len(atom.name) >= 4:\n",
	" return atom.name[:4]\n",
	" if len(atom.name) == 1:\n",
	" return ' {} '.format(atom.name)\n",
	" if ((atom.resname == atom.name\n",
	" or atom.name[:2] in ions\n",
	" or atom.name == 'UNK'\n",
	" or (atom.resname in special_hg and atom.name[:2] == 'HG')\n",
	" or (atom.resname in special_cl and atom.name[:2] == 'CL')\n",
	" or Atom(atom.resname, atom.name) in include_pairs)\n",
	" and Atom(atom.resname, atom.name) not in exclude_pairs):\n",
	" return '{:<4}'.format(atom.name)\n",
	" return ' {:<3}'.format(atom.name)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Here we run the validation for the `guess_atom_name` function defined above. The function returns a list of false positives (atom names we guessed were preceded by a space, but are not), and a list of false negative (atom names we guessed were not preceded by a space, but actually are). We run the validation on the whole PDB archive."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"validation = validate_guesser(guess_atom_name, as_parsed)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The guesser generates about 2000 false positives. These error would also be done with the current way MDAnalysis write PDB files as it precedes all atom names shorter than 4 characters with a space."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"2225"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(validation.false_positive)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"If we exclude the (residue name, atom name) pairs that appear in 5 files or less over the whole PDB, then there is no false positive left."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[]"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"[val for val in validation.false_positive if seen_per_file[val[0]] > nfiles_threshold]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The guesser generates 117 false negatives. The current version of MDAnalysis writes these pairs correctly."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"117"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(validation.false_negative)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"However, these 116 of these pairs appear each in 5 files or less over the whole PDB. The only pair that is more frequent is the ('UNX', 'UNK') pair that appears in 75 files over the PDB archive."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[((Atom(resname='UNX', name=' UNK'), 'UNK ', Atom(resname='UNX', name='UNK')),\n",
	" 75)]"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"[(val, seen_per_file[val[0]]) for val in validation.false_negative if seen_per_file[val[0]] > nfiles_threshold]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"However, this ('UNX', 'UNK') pair appears without the initial space in 424 files (about 5 times more than the version with the initial space)."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"424"
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"seen_per_file[Atom('UNX', 'UNK ')]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}