Skip to content

Instantly share code, notes, and snippets.

@baoilleach
Created April 30, 2009 08:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save baoilleach/104351 to your computer and use it in GitHub Desktop.
Save baoilleach/104351 to your computer and use it in GitHub Desktop.
import urllib
import random
import pdb
import pybel
def getfromPubChem(N=100, filenames=None):
    """Download N random PubChem molecules as 2D and 3D SD records.

    Random compound IDs between 1 and 24000000 are tried until N
    molecules have been accepted. A compound is accepted only when:

    * its 3D response does not contain "<html>" (presumably an error
      page rather than an SD record),
    * its PUBCHEM_COMPONENT_COUNT data field equals '1', and
    * its SMILES string contains "@" (the "must have stereo" filter).

    The accepted records are joined with newlines and written to the two
    output files, each ending with a single trailing newline.

    Note: the network calls use urllib.urlopen, i.e. the Python 2 urllib
    API.

    Parameters:
        N: number of molecules to collect.
        filenames: sequence of two output paths, 2D file first and 3D
            file second. Defaults to ["2Ddataset.sdf", "3Ddataset.sdf"].

    Raises:
        AssertionError: if filenames does not contain exactly two entries.
    """
    if filenames is None:
        filenames = ["2Ddataset.sdf", "3Ddataset.sdf"]
    assert len(filenames) == 2
    baseurl = ("http://pubchem.ncbi.nlm.nih.gov/summary/"
               "summary.cgi?cid=%d&disopt=%sDisplaySDF")
    tot = 0
    # ans[0] collects the 2D records, ans[1] the 3D records, matching the
    # order of `filenames`.
    ans = [[], []]
    while tot < N:
        cid = random.randint(1, 24000000)
        try:
            data3d = urllib.urlopen(baseurl % (cid, "3D")).read().rstrip()
            if "<html>" in data3d:
                continue
            mol = pybel.readstring("sdf", data3d)
            if mol.data['PUBCHEM_COMPONENT_COUNT'] != '1':
                continue
            # Must have stereo
            smi = mol.write("smi").rstrip()
            if "@" not in smi:
                continue
            # An empty display option selects the 2D record.
            data2d = urllib.urlopen(baseurl % (cid, "")).read().rstrip()
        except IOError:
            # Best effort: an IOError for this cid just means we draw
            # another random cid on the next iteration.
            pass
        else:
            ans[1].append(data3d)
            ans[0].append(data2d)
            tot += 1
    for filename, records in zip(filenames, ans):
        # Close each file deterministically instead of relying on garbage
        # collection to flush an anonymous file object.
        with open(filename, "w") as handle:
            handle.write("\n".join(records) + "\n")
if __name__ == "__main__":
    # Script entry point: collect 100 molecules and write them to the
    # default output files chosen by getfromPubChem.
    N = 100
    getfromPubChem(N=N)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment