Skip to content

Instantly share code, notes, and snippets.

@matteoferla
Created November 13, 2023 13:38
Show Gist options
  • Save matteoferla/45b98a4de5849ff1f413f5f14e5a4292 to your computer and use it in GitHub Desktop.
Save matteoferla/45b98a4de5849ff1f413f5f14e5a4292 to your computer and use it in GitHub Desktop.
Testing cset-upload
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "b00a8c4f-11fa-4127-84ee-d57e2a6982ca",
"metadata": {},
"outputs": [],
"source": [
"from rdkit import Chem, rdBase\n",
"from rdkit.Chem import AllChem, PandasTools, Draw\n",
"from rdkit.Chem.Draw import IPythonConsole\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3f2dada0-8d63-4718-86cb-81538b03d2b8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20638\n",
"100\n",
"100\n"
]
}
],
"source": [
"# Wikipedia compounds view counts as a test set\n",
"\n",
"compounds = pd.read_csv('https://raw.githubusercontent.com/matteoferla/Wikipedian-compounds/main/compounds.csv').rename(columns=dict(combined_SMILES='SMILES'))\n",
"print(len(compounds))\n",
"# drop rows without a usable SMILES ('}}' is presumably leftover template markup) and keep the first 100\n",
"compounds = compounds.loc[(~compounds.SMILES.isna()) & (compounds.SMILES != '}}')].head(100).copy()\n",
"print(len(compounds))\n",
"with rdBase.BlockLogs() as blocker:\n",
"    PandasTools.AddMoleculeColumnToFrame(compounds, 'SMILES', 'mol')\n",
"    # unparsable SMILES give a falsy mol entry: remove those rows\n",
"    compounds = compounds.loc[compounds.mol.astype(bool)].copy()\n",
"    # EmbedMolecule returns the id of the new conformer, or -1 when embedding fails.\n",
"    # Fix the seed so reruns give the same coordinates, and drop the failures,\n",
"    # which would otherwise stay in the table without any conformer.\n",
"    conformer_ids = compounds.mol.apply(lambda mol: AllChem.EmbedMolecule(mol, randomSeed=42))\n",
"    compounds = compounds.loc[conformer_ids != -1].copy()\n",
"print(len(compounds))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "8e148cd7-c31e-44d8-b8e9-1fb1a43af82e",
"metadata": {},
"outputs": [],
"source": [
"from gist_import import GistImporter\n",
"\n",
"# fmodule: the prep_fragalysis helpers (Fragalysis upload), fetched from GitHub at run time\n",
"prep_fragalysis_url = 'https://raw.githubusercontent.com/matteoferla/Fragment-hit-follow-up-chemistry/main/followup/prep_fragalysis.py'\n",
"fmodule = GistImporter.from_github(prep_fragalysis_url)\n",
"# the two callables used by the tests below\n",
"generate_header = fmodule['generate_header']\n",
"prep = fmodule['prep']"
]
},
{
"cell_type": "markdown",
"id": "69a07ce7-8ce9-4fc9-9962-b21fb26da93c",
"metadata": {},
"source": [
"# Test 1: NUDT7A\n",
"\n",
"Killian's NUDT7A was chosen as the test case."
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "88e212ba-d3d9-4c13-af49-cc758a9fe6d4",
"metadata": {},
"outputs": [],
"source": [
"# keep only the compounds whose SMILES string is shorter than 200 characters\n",
"has_short_smiles = compounds.SMILES.str.len() < 200\n",
"compounds = compounds.loc[has_short_smiles].copy()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "18e56cc1-7f4b-4562-92af-8d6aaff19f79",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['title', 'is_element', 'monthly_views', 'SMILES', 'mol'], dtype='object')"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"compounds.columns"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "0745ef5e-ce4f-4d81-b7c7-073cc2412a9c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[12:22:37] Molecule does not have explicit Hs. Consider calling AddHs()\n"
]
}
],
"source": [
"test_name = 'MF-test231113_base'\n",
"\n",
"# Sanitise the compound names before exporting, so this cell also works in a\n",
"# top-to-bottom run: every non-word character becomes '_' and names are capped\n",
"# at 20 characters. The transform is idempotent, so repeating it is harmless.\n",
"compounds['title'] = compounds.title.str.replace(r'\\W', '_', regex=True).str.slice(stop=20)\n",
"\n",
"# header molecule carrying the submission metadata and the description of each extra field\n",
"header: Chem.Mol = generate_header(method=test_name,\n",
"                                   ref_url='https://www.example.com',\n",
"                                   submitter_name='matteo',\n",
"                                   submitter_email='matteo.ferla@stats.ox.ac.uk',\n",
"                                   submitter_institution='Ox',\n",
"                                   extras={'monthly_views': 'monthly views in Wikipedia'}\n",
"                                   )\n",
"\n",
"prep(compounds,\n",
"     header, mol_col='mol',\n",
"     name_col='title',\n",
"     outfile=f'{test_name}.sdf',\n",
"     ref_mol_names='x0140_1',  # unless already present as ref_mols column\n",
"     ref_pdb_name='x0140_1',  # ditto\n",
"     extras=['monthly_views']  # extras\n",
"     )"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "07aa2289-2ccf-4d71-aa07-d1c387886c13",
"metadata": {},
"outputs": [],
"source": [
"import operator\n",
"\n",
"# slicing helper: keeps at most the first 20 characters of a name\n",
"first_20_chars = operator.itemgetter(slice(None, 20))\n",
"# every non-word character in the title becomes an underscore\n",
"underscored_titles = compounds.title.str.replace(r'\\W', '_', regex=True)\n",
"compounds['title'] = underscored_titles.apply(first_20_chars)"
]
},
{
"cell_type": "markdown",
"id": "778981d5-9146-479a-90a6-86b3bdc3baf0",
"metadata": {},
"source": [
"# Test 2: Words\n",
"\n",
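"Same export as Test 1, with an additional free-text extra field (`wordy_views`, e.g. 'Viewed 123 times a month') alongside `monthly_views`."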
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "0432c94b-b605-45de-8cb9-275b4955f3e8",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[12:45:46] Molecule does not have explicit Hs. Consider calling AddHs()\n"
]
}
],
"source": [
"test_name = 'MF-test231113_words'\n",
"\n",
"# text version of the view counts, used as the free-text extra field\n",
"compounds['wordy_views'] = compounds.monthly_views.apply('Viewed {} times a month'.format)\n",
"\n",
"# extra fields and their descriptions, declared once; the key order is the order given to prep\n",
"extra_descriptions = {'wordy_views': 'monthly views in Wikipedia',\n",
"                      'monthly_views': 'monthly views in Wikipedia'}\n",
"extra_names = list(extra_descriptions)\n",
"\n",
"header: Chem.Mol = generate_header(method=test_name,\n",
"                                   ref_url='https://www.example.com',\n",
"                                   submitter_name='matteo',\n",
"                                   submitter_email='matteo.ferla@stats.ox.ac.uk',\n",
"                                   submitter_institution='Ox',\n",
"                                   extras=extra_descriptions)\n",
"\n",
"prep(compounds, header,\n",
"     mol_col='mol',\n",
"     name_col='title',\n",
"     outfile=f'{test_name}.sdf',\n",
"     ref_mol_names='x0140_1',  # unless already present as ref_mols column\n",
"     ref_pdb_name='x0140_1',  # ditto\n",
"     extras=extra_names,  # extras\n",
"     )"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "f5013201-a0e4-4f01-bb39-3d4a1acfac6d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"254"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(compounds.iloc[34].SMILES)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bffc3db8-cda5-4e6c-8e18-4b2f6ebc84b3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:py310]",
"language": "python",
"name": "conda-env-py310-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment