Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Convert mmCIF files to PDB format using biopython
#!/usr/bin/env python
Script to convert mmCIF files to PDB format.
usage: python ciffile [pdbfile]
Requires python BioPython (`pip install biopython`). It should work with recent version of python 2 or 3.
@author Spencer Bliven <>
import sys
import argparse
import logging
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB import PDBIO
def int_to_chain(i,base=62):
int_to_chain(int,int) -> str
Converts a positive integer to a chain ID. Chain IDs include uppercase
characters, numbers, and optionally lowercase letters.
i = a positive integer to convert
base = the alphabet size to include. Typically 36 or 62.
if i < 0:
raise ValueError("positive integers only")
if base < 0 or 62 < base:
raise ValueError("Invalid base")
quot = int(i)//base
rem = i%base
if rem < 26:
letter = chr( ord("A") + rem)
elif rem < 36:
letter = str( rem-26)
letter = chr( ord("a") + rem - 36)
if quot == 0:
return letter
return int_to_chain(quot-1,base) + letter
class OutOfChainsError(Exception): pass
def rename_chains(structure):
"""Renames chains to be one-letter chains
Existing one-letter chains will be kept. Multi-letter chains will be truncated
or renamed to the next available letter of the alphabet.
If more than 62 chains are present in the structure, raises an OutOfChainsError
Returns a map between new and old chain IDs, as well as modifying the input structure
next_chain = 0 #
# single-letters stay the same
chainmap = { for c in structure.get_chains() if len( == 1}
for o in structure.get_chains():
if len( != 1:
if[0] not in chainmap:
chainmap[[0]] = =[0]
c = int_to_chain(next_chain)
while c in chainmap:
next_chain += 1
c = int_to_chain(next_chain)
if next_chain >= 62:
raise OutOfChainsError()
chainmap[c] = = c
return chainmap
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Convert mmCIF to PDB format')
parser.add_argument("ciffile",help="mmCIF input file")
parser.add_argument("pdbfile",nargs="?",help="PDB output file. Default based on CIF filename")
parser.add_argument("-v","--verbose", help="Long messages",
dest="verbose",default=False, action="store_true")
args = parser.parse_args()
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG if args.verbose else logging.WARN)
ciffile = args.ciffile
pdbfile = args.pdbfile or ciffile+".pdb"
#Not sure why biopython needs this to read a cif file
strucid = ciffile[:4] if len(ciffile)>4 else "1xxx"
# Read file
parser = MMCIFParser()
structure = parser.get_structure(strucid, ciffile)
# rename long chains
chainmap = rename_chains(structure)
except OutOfChainsError:
logging.error("Too many chains to represent in PDB format")
if args.verbose:
for new,old in chainmap.items():
if new != old:"Renaming chain {0} to {1}".format(old,new))
#Write PDB
io = PDBIO()
#TODO What happens with large structures?
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment