Rajarshi Guha rajarshi

## gsva-crash-test-data.csv

          
            C01
            C02
            C03
            C04
            C05
            C06

            
              4193
              58.52690209
              19.79238528
              105.52763504
              94.95405256
              31.71595781
              118.67890655

            
              1644
              97.88520438
              32.16221987
              80.88268546
              99.14777684
              29.55725639
              90.61186379

            
              598
              2.87791536
              1.25763403
              49.85402356
              2.28738267
              2.77951966
              65.14421558

            
              7153
              10.14441154
              13.59621158
              57.16808301
              7.37447958
              2.33377616
              77.48738463

            
              4790
              22.75508944
              42.99521409
              109.28704983
              36.73099591
              34.4355488
              89.54434147

            
              6774
              4.09136397
              7.24804356
              108.45800573
              40.2471083
              4.58565529
              106.56955414

            
              3156
              78.35995401
              54.8259949
              114.3780661
              87.60616517
              37.04524797
              104.79968629

            
              5916
              23.89803961
              9.43186629
              84.96080843
              13.23830152
              22.095928
              68.91517787

            
              2932
              27.36320561
              1.68586442
              112.55169248
              21.23236686
              3.56942032
              77.29558595

## rndsmi.py
import selfies, random, sys

N = 1000
M = 500
selfies.set_semantic_constraints("octet_rule")
alphabet= list(selfies.get_semantic_robust_alphabet())
alphabet = list(filter(lambda x: '+' not in x and '-' not in x, alphabet))

rnd_smi = set()
while len(rnd_smi) < N:

## simsearch.R
library(rcdk)
library(fingerprint)
# Query structure, given as a SMILES string.
query_smi <- "CCCCCCCC"

# Load the compound collection and copy each record's DATABASE_ID property
# into its title. The lapply() result is kept only so nothing is printed.
hmdb <- load.molecules("smpdb_structures/compounds-1.sdf")
junk <- lapply(hmdb, function(x) {
  set.title(x, get.property(x, "DATABASE_ID"))
})

# Fingerprint the query and every loaded molecule with the default settings.
query_mol <- parse.smiles(query_smi)[[1]]
query_fp <- get.fingerprint(query_mol)
hmdb_fp <- lapply(hmdb, get.fingerprint)

# One row per loaded molecule, one column for the single query fingerprint;
# rows are labelled with the DATABASE_ID property.
sm <- fp.sim.matrix(hmdb_fp, list(query_fp))
rownames(sm) <- sapply(hmdb, get.property, "DATABASE_ID")

# Show the row holding the largest value in the first column.
sm[which.max(sm[, 1]), ]

## moliter.R
library(rcdk)
# Compute the Murcko fragments of a single molecule and print a progress line.
#
# Args:
#   mol: a molecule object accepted by get.murcko.fragments() and
#        get.property() (assumed to be an rcdk molecule -- confirm with caller).
#
# Returns:
#   The first element of the get.murcko.fragments() result; its `rings` and
#   `frameworks` components are the ones counted in the progress message.
f <- function(mol) {
  tmp <- get.murcko.fragments(list(mol), min.frag.size = 3)[[1]]
  # The leading "\r" is a carriage return, so on a terminal each progress
  # message is written over the previous one instead of scrolling.
  cat("\rProcessed molregno", get.property(mol, "cdk:Title"), "which had",
      length(tmp$rings), "rings and", length(tmp$frameworks), "frameworks")
  tmp
}
# Open the SMILES file with iload.molecules() and apply f() to every molecule
# the foreach loop draws from it, collecting the results in `frags`.
# NOTE(review): foreach() and %do% are not provided by the only library()
# call visible here (rcdk) -- confirm the foreach package is attached before
# this line runs.
miter <- iload.molecules('/Users/guha/chembl28_10k.smi', type='smi')
frags <- foreach(mol=miter) %do% f(mol)

## foo.py
import pandas as pd

if __name__ == '__main__':
    # Small demo frame: an integer column and a column of brace-wrapped text.
    df = pd.DataFrame({
        'col1': [1, 2, 3],
        'col2': ['{abc defg', '{xyz pqr}', '{nan, foo}'],
    })
    print(df)

    # str.strip() treats its argument as a set of characters, so any leading
    # or trailing '{', '|' or '}' is removed from each value.
    stripped = df['col2'].str.strip("{|}")
    df['col3'] = stripped.astype('str')

## gist:4ab8c1024a420ecccaa6777ef31983d0
# Keep every row but drop the first column (positional slice from index 1).
df = df.iloc[:, 1:]

## zoom
https://mit.zoom.us/j/94364555006?pwd=bFBGeGlvY0cxV0lTRzZPMWZDdDJNQT09#success

## type_hint.py
from typing import List, Union


class Unit:
    """Empty class: its constructor takes no arguments and stores no state."""

    def __init__(self) -> None:
        return None


class Result:
    def __init__(self, value: float, unit: Unit):

## gist:2fa1fbe32077cbf326851a04ef2d5c23
https://zoom.us/w/94280441863?tk=FwYQPHrVQzCxdXDQjONgSnw7TnOH2IfrzqdF7Oae7Cg.DQIAAAAV841gBxZsakpEWVQtSFR4Q3NXb01CTzhNV2lRAAAAAAAAAAAAAAAAAAAAAAAAAAAA&pwd=dGVxZ2g3c01PNis5RGxxVVlRQks0Zz09

## fda_drug_cores.smi
C1=C2CCCCC2C2CCC3C(c4cccnc4)=CCC3C2C1
C1=C[C@H](N[C@H]2CC[C@@H](O[C@H]3CC[C@@H](O[C@H]4CCCOC4)OC3)OC2)CCC1
c1ccccc1
O=c1oc2ccccc2cc1Cc1ccccc1
c1ccc2c(c1)Nc1ccccc1S2
c1ccc2c(c1)Nc1ccccc1S2
c1ccccc1
c1nncs1
O=C(NC1CCCCC1)NS(=O)(=O)c1ccccc1
c1ccc2c(c1)Sc1ccccc1N2CCCN1CCNCC1
	C01	C02	C03	C04	C05	C06
4193	58.52690209	19.79238528	105.52763504	94.95405256	31.71595781	118.67890655
1644	97.88520438	32.16221987	80.88268546	99.14777684	29.55725639	90.61186379
598	2.87791536	1.25763403	49.85402356	2.28738267	2.77951966	65.14421558
7153	10.14441154	13.59621158	57.16808301	7.37447958	2.33377616	77.48738463
4790	22.75508944	42.99521409	109.28704983	36.73099591	34.4355488	89.54434147
6774	4.09136397	7.24804356	108.45800573	40.2471083	4.58565529	106.56955414
3156	78.35995401	54.8259949	114.3780661	87.60616517	37.04524797	104.79968629
5916	23.89803961	9.43186629	84.96080843	13.23830152	22.095928	68.91517787
2932	27.36320561	1.68586442	112.55169248	21.23236686	3.56942032	77.29558595
	import selfies, random, sys

	N = 1000
	M = 500
	selfies.set_semantic_constraints("octet_rule")
	alphabet= list(selfies.get_semantic_robust_alphabet())
	alphabet = list(filter(lambda x: '+' not in x and '-' not in x, alphabet))

	rnd_smi = set()
	while len(rnd_smi) < N:
	library(rcdk)
	library(fingerprint)
	query_smi <- "CCCCCCCC"
	hmdb <- load.molecules("smpdb_structures/compounds-1.sdf")
	junk <- lapply(hmdb, function(x) set.title(x, get.property(x, "DATABASE_ID")))
	query_fp <- get.fingerprint(parse.smiles(query_smi)[[1]])
	hmdb_fp <- lapply(hmdb, get.fingerprint)
	sm <- fp.sim.matrix(hmdb_fp, list(query_fp))
	rownames(sm) <- sapply(hmdb, get.property, "DATABASE_ID")
	sm[ which.max(sm[,1]), ]
	library(rcdk)
	# Compute the Murcko fragments of a single molecule and print a progress
	# line. Returns the first element of the get.murcko.fragments() result.
	f <- function(mol) {
	  tmp <- get.murcko.fragments(list(mol), min.frag.size = 3)[[1]]
	  # The leading "\r" is a carriage return, so on a terminal each progress
	  # message is written over the previous one instead of scrolling.
	  cat("\rProcessed molregno", get.property(mol, "cdk:Title"), "which had",
	      length(tmp$rings), "rings and", length(tmp$frameworks), "frameworks")
	  tmp
	}
	miter <- iload.molecules('/Users/guha/chembl28_10k.smi', type='smi')
	frags <- foreach(mol=miter) %do% f(mol)
	import pandas as pd

	if __name__ == '__main__':
	d = {'col1': [1, 2, 3],
	'col2': ['{abc defg', '{xyz pqr}', '{nan, foo}']}
	df = pd.DataFrame(d)
	print(df)

	df['col3'] = df['col2'].str.strip("{\|}").astype('str')
	from typing import List, Union


	class Unit:
	def __init__(self):
	pass


	class Result:
	def __init__(self, value: float, unit: Unit):
	C1=C2CCCCC2C2CCC3C(c4cccnc4)=CCC3C2C1
	C1=C[C@H](N[C@H]2CC[C@@H](O[C@H]3CC[C@@H](O[C@H]4CCCOC4)OC3)OC2)CCC1
	c1ccccc1
	O=c1oc2ccccc2cc1Cc1ccccc1
	c1ccc2c(c1)Nc1ccccc1S2
	c1ccc2c(c1)Nc1ccccc1S2
	c1ccccc1
	c1nncs1
	O=C(NC1CCCCC1)NS(=O)(=O)c1ccccc1
	c1ccc2c(c1)Sc1ccccc1N2CCCN1CCNCC1