Created
June 25, 2018 05:19
-
-
Save sshojiro/abec8d513f4816a94e06acc49d7bb29e to your computer and use it in GitHub Desktop.
Pipeline for Chemoinformatics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Study of Pipeline for Chemoinformatics\n", | |
"\n", | |
"- Feature Union [example](http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html#sphx-glr-auto-examples-hetero-feature-union-py)\n", | |
"- datasource [url](http://modem.ucsd.edu/adme/databases/databases_logS.htm)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from rdkit import Chem\n", | |
"\n", | |
"from sklearn.base import BaseEstimator, TransformerMixin\n", | |
"\n", | |
"from sklearn.decomposition import TruncatedSVD\n", | |
"from sklearn.feature_extraction import DictVectorizer\n", | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"from sklearn.metrics import classification_report\n", | |
"from sklearn.pipeline import FeatureUnion\n", | |
"from sklearn.pipeline import Pipeline\n", | |
"from sklearn.svm import SVC, SVR\n", | |
"from rdkit.Chem import MACCSkeys\n", | |
"import numpy as np\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"DATA_FILE = '../data/Hou1290/data_set.sdf'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"index_sdf = [idx for idx, mol in enumerate(Chem.SDMolSupplier(DATA_FILE)) if mol is not None]\n", | |
"\n", | |
"mol_array = [mol for mol in Chem.SDMolSupplier(DATA_FILE) if mol is not None]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class ItemSelector(BaseEstimator, TransformerMixin):\n", | |
" def __init__(self, key='objective'):\n", | |
" self.key = key\n", | |
"\n", | |
" def fit(self, x, y=None):\n", | |
" return self\n", | |
"\n", | |
" def transform(self, mol_array):\n", | |
" if self.key == 'objective':\n", | |
" return [m.GetProp('logS') for m in mol_array if m is not None]\n", | |
" else:\n", | |
" return data_dict[self.key]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(1289, 167)" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"class Descriptor(BaseEstimator, TransformerMixin):\n", | |
" def __init__(self, key='maccs'):\n", | |
" self.key = key\n", | |
" def fit(self, x, y=None):\n", | |
" return self\n", | |
" \n", | |
" def transform(self, mol_array):\n", | |
" if self.key == 'maccs':\n", | |
" return np.matrix([list(MACCSkeys.GenMACCSKeys(m)) for m in mol_array])\n", | |
"\n", | |
"ret = Descriptor().fit_transform(mol_array)\n", | |
"ret.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"iselec = ItemSelector()\n", | |
"yobs = iselec.fit_transform(mol_array)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Pipeline(memory=None,\n", | |
" steps=[('gen_fp', Descriptor(key='maccs')), ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',\n", | |
" kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"model = Pipeline([('gen_fp', Descriptor()),\n", | |
" ('svr', SVR())])\n", | |
"\n", | |
"model.fit(mol_array, yobs)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment