Skip to content

Instantly share code, notes, and snippets.

@sshojiro
Created June 25, 2018 05:19
Show Gist options
  • Save sshojiro/abec8d513f4816a94e06acc49d7bb29e to your computer and use it in GitHub Desktop.
Save sshojiro/abec8d513f4816a94e06acc49d7bb29e to your computer and use it in GitHub Desktop.
Pipeline for Chemoinformatics
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Study of Pipeline for Chemoinformatics\n",
"\n",
"- Feature union: [scikit-learn example](http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html#sphx-glr-auto-examples-hetero-feature-union-py)\n",
"- Data source: [logS database](http://modem.ucsd.edu/adme/databases/databases_logS.htm)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from rdkit import Chem\n",
"\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.feature_extraction import DictVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.pipeline import FeatureUnion\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.svm import SVC, SVR\n",
"from rdkit.Chem import MACCSkeys\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Location of the SD file that is handed to Chem.SDMolSupplier.\n",
"DATA_FILE = '../data/Hou1290/data_set.sdf'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Read the SD file a single time and keep only the entries the supplier\n",
"# did not return as None, remembering each entry's position in the file.\n",
"parsed_records = [(idx, mol)\n",
"                  for idx, mol in enumerate(Chem.SDMolSupplier(DATA_FILE))\n",
"                  if mol is not None]\n",
"\n",
"# Positions (within the SD file) of the molecules that were kept.\n",
"index_sdf = [idx for idx, _ in parsed_records]\n",
"\n",
"# The kept molecules themselves, in file order.\n",
"mol_array = [mol for _, mol in parsed_records]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"class ItemSelector(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Pick one item out of the input handed to `transform`.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    key : str, default='objective'\n",
"        With 'objective', the 'logS' property of every molecule is returned.\n",
"        Any other value is used to index the object passed to `transform`.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, key='objective'):\n",
"        self.key = key\n",
"\n",
"    def fit(self, x, y=None):\n",
"        \"\"\"Nothing is learned; return the selector itself.\"\"\"\n",
"        return self\n",
"\n",
"    def transform(self, mol_array):\n",
"        \"\"\"Return the item selected by `self.key`.\n",
"\n",
"        For key == 'objective' the result is a list of floats, one per\n",
"        non-None molecule in `mol_array`. Otherwise the result is\n",
"        `mol_array[self.key]`.\n",
"        \"\"\"\n",
"        if self.key == 'objective':\n",
"            # GetProp hands back text, so cast to float to give the\n",
"            # regressor (and any metric) numeric targets.\n",
"            return [float(m.GetProp('logS')) for m in mol_array if m is not None]\n",
"        # Index the object that was passed in; no notebook-level global\n",
"        # named `data_dict` exists.\n",
"        return mol_array[self.key]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1289, 167)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class Descriptor(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Convert a sequence of molecules into a fingerprint array.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    key : str, default='maccs'\n",
"        Name of the descriptor to generate. Only 'maccs' is implemented.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, key='maccs'):\n",
"        self.key = key\n",
"\n",
"    def fit(self, x, y=None):\n",
"        \"\"\"Nothing is learned; return the transformer itself.\"\"\"\n",
"        return self\n",
"\n",
"    def transform(self, mol_array):\n",
"        \"\"\"Return one fingerprint row per molecule in `mol_array`.\n",
"\n",
"        Raises\n",
"        ------\n",
"        ValueError\n",
"            If `self.key` is not a supported descriptor name.\n",
"        \"\"\"\n",
"        if self.key == 'maccs':\n",
"            # A plain ndarray is used instead of np.matrix, which NumPy\n",
"            # discourages and recent scikit-learn releases refuse as input.\n",
"            return np.array([list(MACCSkeys.GenMACCSKeys(m)) for m in mol_array])\n",
"        # Fail loudly rather than silently returning None for an unknown key.\n",
"        raise ValueError('Unsupported descriptor key: {!r}'.format(self.key))\n",
"\n",
"# Fingerprint every loaded molecule and show the resulting dimensions.\n",
"ret = Descriptor(key='maccs').fit_transform(mol_array)\n",
"ret.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Pull the observed target values out of the molecules.\n",
"iselec = ItemSelector(key='objective')\n",
"yobs = iselec.fit_transform(mol_array)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(memory=None,\n",
" steps=[('gen_fp', Descriptor(key='maccs')), ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',\n",
" kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fingerprint generation feeds directly into a support-vector regressor.\n",
"pipeline_steps = [\n",
"    ('gen_fp', Descriptor()),\n",
"    ('svr', SVR()),\n",
"]\n",
"model = Pipeline(pipeline_steps)\n",
"\n",
"model.fit(mol_array, yobs)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment