Skip to content

Instantly share code, notes, and snippets.

@PatWalters
Created March 22, 2021 12:48
Show Gist options
  • Save PatWalters/ca41289a6990ebf7af1e5c44e188fccd to your computer and use it in GitHub Desktop.
Save PatWalters/ca41289a6990ebf7af1e5c44e188fccd to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sys
import dask.dataframe as dd
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Descriptors import MolWt
from rdkit.Chem.rdMolDescriptors import BCUT2D
import time
# -- molecular weight functions
def calc_mw(smi):
    """Return the molecular weight of the molecule encoded by a SMILES string.

    Args:
        smi: SMILES string to parse with RDKit.

    Returns:
        The value of ``MolWt`` for the parsed molecule, or NaN when RDKit
        cannot parse ``smi``.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; handing
        # that to MolWt raises and would abort the whole partition, so mark
        # the row as missing instead.
        return float("nan")
    return MolWt(mol)
def mw_df(df):
    """Apply ``calc_mw`` to every entry of the ``SMILES`` column of ``df``.

    Args:
        df: DataFrame (here, one partition) that has a ``SMILES`` column.

    Returns:
        The Series produced by applying ``calc_mw`` element-wise.
    """
    smiles_column = df.SMILES
    return smiles_column.apply(calc_mw)
# -- bcut functions
def bcut_df(df):
    """Apply ``calc_bcut`` to every entry of the ``SMILES`` column of ``df``.

    Args:
        df: DataFrame (here, one partition) that has a ``SMILES`` column.

    Returns:
        The Series produced by applying ``calc_bcut`` element-wise.
    """
    smiles_column = df.SMILES
    return smiles_column.apply(calc_bcut)
def calc_bcut(smi):
    """Return the BCUT2D descriptors of a SMILES string as a built-in list.

    Args:
        smi: SMILES string to parse with RDKit.

    Returns:
        A plain Python list holding the values produced by ``BCUT2D`` for the
        parsed molecule, or None when RDKit cannot parse ``smi``.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; BCUT2D
        # cannot accept that, so flag the row as missing instead of raising.
        return None
    # Copy the result into a built-in list. Results must be pickled to cross
    # process boundaries under dask's scheduler='processes'; an RDKit-wrapped
    # vector object is not guaranteed to pickle, whereas a list of floats is.
    # NOTE(review): presumed cause of the process-scheduler failure - confirm.
    return list(BCUT2D(mol))
def main():
    """Time MW and BCUT2D calculation over a SMILES file using dask.

    Reads the space-separated file named by the first command-line argument
    into a pandas DataFrame with columns ``SMILES`` and ``Name``, splits it
    into 16 dask partitions, computes the ``MW`` and ``BCUT`` columns with the
    process-based scheduler, then prints the elapsed seconds and the head of
    the resulting frame.

    Exits with a usage message when no input file is given.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message rather than an IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} smiles_file")

    # perf_counter is the clock intended for measuring elapsed intervals.
    start = time.perf_counter()
    df = pd.read_csv(sys.argv[1], sep=" ", names=["SMILES", "Name"])
    ddf = dd.from_pandas(df, npartitions=16)

    # meta uses the (name, dtype) form that describes a Series result.
    mw_lazy = ddf.map_partitions(mw_df, meta=("MW", "float64"))
    ddf["MW"] = mw_lazy.compute(scheduler="processes")

    # The original author noted that this call works with the default
    # scheduler, i.e. ``.compute()``, but not with scheduler='processes'.
    # Each BCUT row holds several values, so the column dtype is object
    # rather than float.
    bcut_lazy = ddf.map_partitions(bcut_df, meta=("BCUT", "object"))
    ddf["BCUT"] = bcut_lazy.compute(scheduler="processes")

    print(time.perf_counter() - start)
    print(ddf.head())
# Run the benchmark only when this file is executed as a script. The guard
# also keeps main() from running again if the module is imported elsewhere.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment