Skip to content

Instantly share code, notes, and snippets.

@aschreyer
Created November 1, 2011 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aschreyer/1330580 to your computer and use it in GitHub Desktop.
Save aschreyer/1330580 to your computer and use it in GitHub Desktop.
GIST index for OpenEye fingerprints
-- tanimoto_sim_query(smiles): Tanimoto similarity search against table fps.
--
-- Parameters:
--   smiles  SMILES string of the query structure; it is turned into a
--           fingerprint with make_circular_fp().
-- Returns:
--   At most 10 rows of (molregno, similarity), restricted to rows passing
--   the %? limit test and ordered by the <%> KNN-GiST operator.
--
-- Volatility: STABLE, not IMMUTABLE. The body reads the fps table and the
-- %? operator depends on a user-defined similarity limit, so the result for
-- the same argument can change between statements; IMMUTABLE would allow
-- the planner to pre-evaluate and reuse a stale result.
-- STRICT: the function is not executed when the argument is NULL.
CREATE OR REPLACE FUNCTION tanimoto_sim_query(smiles TEXT)
RETURNS TABLE (molregno INTEGER, similarity REAL) AS
$BODY$
SELECT molregno,
       -- Tanimoto similarity
       fp % make_circular_fp($1)
FROM fps
-- Boolean operator returning true if the Tanimoto similarity
-- is above the user-defined limit
WHERE fp %? make_circular_fp($1)
-- KNN-GIST order by operator for Tanimoto similarity
ORDER BY fp <%> make_circular_fp($1)
LIMIT 10;
$BODY$
LANGUAGE sql STABLE STRICT;
-- Example call: list both result columns of tanimoto_sim_query() for one
-- query structure given as SMILES.
SELECT
    molregno,
    similarity
FROM tanimoto_sim_query('CC1([C@@H](N2[C@H](S1(=O)=O)[C@@H](C2=O)NC(=O)C(F)(F)F)C(=O)N3CCC[C@@H]3C(=O)[O-])C');
molregno | similarity
----------+------------
118466 | 1
118261 | 1
119013 | 0.543478
117667 | 0.543478
(4 rows)
Time: 580.303 ms
cryst=# SELECT * FROM tanimoto_sim_query('c1ccc(cc1)c2cccc(c2)O');
molregno | similarity
----------+------------
203542 | 1
513094 | 0.8125
107248 | 0.75
513095 | 0.722222
513093 | 0.684211
625818 | 0.666667
625897 | 0.666667
203018 | 0.65
625712 | 0.636364
512953 | 0.619048
(10 rows)
Time: 2617.169 ms
cryst=# SELECT * FROM tanimoto_sim_query('CCOC(=O)N1CCN(CC1)C(=O)c2ccc3c(c2)ncn3C4CCCC4');
molregno | similarity
----------+------------
822588 | 1
745437 | 0.695652
980747 | 0.536585
802951 | 0.530612
864727 | 0.530612
(5 rows)
Time: 480.698 ms
cryst=# SELECT * FROM tanimoto_sim_query('c1cc(ccc1S(=O)(=O)N2C3Cc4c(cn[nH]4)C2CC(C3)/C=CC(F)(F)F)Cl');
molregno | similarity
----------+------------
581431 | 1
581422 | 1
581393 | 0.813953
581392 | 0.813953
581308 | 0.767442
581342 | 0.638298
581309 | 0.638298
581301 | 0.638298
581380 | 0.625
581389 | 0.612245
(10 rows)
Time: 82.379 ms
cryst=# SELECT * FROM tanimoto_sim_query('CN(C)C(=O)c1ccc(cc1)CCC(COc2ccc(cc2)c3cccc(c3)[N+](=O)[O-])N4C(=O)CCC4=O');
molregno | similarity
----------+------------
215985 | 1
215699 | 0.846154
215392 | 0.811321
216366 | 0.796296
215950 | 0.773585
215597 | 0.730769
215252 | 0.689655
215859 | 0.655172
215909 | 0.65
215353 | 0.627119
(10 rows)
Time: 142.657 ms
-- Set the 'tanimoto' similarity limit to 0.5.
-- NOTE(review): presumably this is the user-defined limit that the %?
-- operator tests against -- confirm against the extension.
SELECT set_oefp_similarity_limit(0.5, 'tanimoto');
set_oefp_similarity_limit
---------------------------
0.5
(1 row)
-- TVERSKY PARAMETER
-- Set the 'tversky_alpha' parameter to 0.5 through the same setter that is
-- used for the similarity limits.
SELECT set_oefp_similarity_limit(0.5, 'tversky_alpha');
set_oefp_similarity_limit
---------------------------
0.5
(1 row)
-- Show the current Tversky parameters side by side. Each column reads the
-- key that matches its alias (the alpha column previously read
-- 'tversky_beta', so it reported beta twice).
SELECT show_oefp_similarity_limit('tversky_alpha') AS tversky_alpha,
       show_oefp_similarity_limit('tversky_beta') AS tversky_beta;
tversky_alpha | tversky_beta
---------------+--------------
1 | 1
(1 row)
-- Top-10 Tversky search in fps for one query structure.
-- NOTE(review): %^, %^? and <%^> look like the Tversky counterparts of the
-- Tanimoto operators %, %? and <%> (value, limit test, KNN ordering) --
-- confirm against the extension.
SELECT
    molregno,
    fp %^ make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3') AS tversky
FROM fps
WHERE fp %^? make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
ORDER BY fp <%^> make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
LIMIT 10;
molregno | tversky
----------+----------
1 | 1
54 | 0.756098
79 | 0.725
7 | 0.714286
6 | 0.707317
5 | 0.651163
81 | 0.627907
8 | 0.619048
2 | 0.613636
29 | 0.590909
(10 rows)
Time: 119.633 ms
-- Top-10 Tanimoto search in fps for the same query structure.
-- Gives exactly the same result as the Tversky search with alpha=1 and beta=1.
SELECT
    molregno,
    fp % make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3') AS tanimoto
FROM fps
WHERE fp %? make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
ORDER BY fp <%> make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
LIMIT 10;
molregno | tanimoto
----------+----------
1 | 1
54 | 0.756098
79 | 0.725
7 | 0.714286
6 | 0.707317
5 | 0.651163
81 | 0.627907
8 | 0.619048
2 | 0.613636
29 | 0.590909
(10 rows)
Time: 119.422 ms
-- Has to be done in two steps!
-- Step 1: set 'tversky_alpha' to 0.5; 'tversky_beta' is set by its own,
-- separate call.
SELECT set_oefp_similarity_limit(0.5,'tversky_alpha') as tversky_alpha;
tversky_alpha
---------------
0.5
(1 row)
-- Step 2: set 'tversky_beta' to 0.5 (alpha and beta are set by separate
-- calls of the same setter).
SELECT set_oefp_similarity_limit(0.5,'tversky_beta') as tversky_beta;
tversky_beta
--------------
0.5
(1 row)
-- Top-10 Tversky search in fps, run after tversky_alpha and tversky_beta
-- were both set to 0.5.
SELECT
    molregno,
    fp %^ make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3') AS tversky
FROM fps
WHERE fp %^? make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
ORDER BY fp <%^> make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
LIMIT 10;
molregno | tversky
----------+----------
1 | 1
54 | 0.861111
79 | 0.84058
7 | 0.833333
6 | 0.828571
5 | 0.788732
81 | 0.771429
8 | 0.764706
2 | 0.760563
29 | 0.742857
(10 rows)
Time: 628.929 ms
-- Top-10 Dice search in fps for the same query structure.
-- Gives the same result as the Tversky search with alpha=0.5 and beta=0.5.
SELECT
    molregno,
    fp # make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3') AS dice
FROM fps
WHERE fp #? make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
ORDER BY fp <#> make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
LIMIT 10;
molregno | dice
----------+----------
1 | 1
54 | 0.861111
79 | 0.84058
7 | 0.833333
6 | 0.828571
5 | 0.788732
81 | 0.771429
8 | 0.764706
2 | 0.760563
29 | 0.742857
(10 rows)
Time: 138.255 ms
-- Fetch the SMILES of a pseudo-randomly chosen molecule: one molregno is
-- drawn from 1..1000000 and looked up in chembl. Returns no row when the
-- drawn molregno does not exist in the table.
-- trunc() of a double precision value is still double precision, which
-- would force the comparison to be done in float8; the draw is cast to
-- integer so the column is compared in its own type.
-- NOTE(review): assumes chembl.molregno is an integer-typed column with an
-- index -- confirm.
SELECT smiles
FROM chembl
WHERE molregno = (SELECT trunc(random() * 1000000 + 1)::integer);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment