Created
November 1, 2011 14:10
-
-
Save aschreyer/1330580 to your computer and use it in GitHub Desktop.
GiST index for OpenEye fingerprints
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Return up to 10 rows of fps whose fingerprint passes the Tanimoto
-- similarity-limit test against the circular fingerprint of the query
-- structure, ordered by the <%> operator (most similar first in the sample
-- output).
--
-- Parameters:
--   smiles  query structure as SMILES text; passed to make_circular_fp().
-- Returns:
--   (molregno, similarity), where similarity is the value of fp % query_fp.
--
-- Declared STABLE rather than IMMUTABLE: the body reads the fps table and the
-- %? predicate depends on the user-defined similarity limit, so the result can
-- differ between calls with the same argument. STRICT is kept, so a NULL
-- argument yields no call of the body.
CREATE OR REPLACE FUNCTION tanimoto_sim_query(smiles TEXT)
RETURNS TABLE (molregno INTEGER, similarity REAL) AS
$BODY$
    SELECT molregno,
           -- Tanimoto similarity
           fp % make_circular_fp($1)
    FROM fps
    -- Boolean operator returning true if the Tanimoto similarity
    -- is above the user-defined limit
    WHERE fp %? make_circular_fp($1)
    -- KNN-GiST order by operator for Tanimoto similarity
    ORDER BY fp <%> make_circular_fp($1)
    LIMIT 10;
$BODY$
LANGUAGE sql STABLE STRICT;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SELECT * FROM tanimoto_sim_query('CC1([C@@H](N2[C@H](S1(=O)=O)[C@@H](C2=O)NC(=O)C(F)(F)F)C(=O)N3CCC[C@@H]3C(=O)[O-])C'); | |
molregno | similarity | |
----------+------------ | |
118466 | 1 | |
118261 | 1 | |
119013 | 0.543478 | |
117667 | 0.543478 | |
(4 rows) | |
Time: 580.303 ms | |
cryst=# SELECT * FROM tanimoto_sim_query('c1ccc(cc1)c2cccc(c2)O'); | |
molregno | similarity | |
----------+------------ | |
203542 | 1 | |
513094 | 0.8125 | |
107248 | 0.75 | |
513095 | 0.722222 | |
513093 | 0.684211 | |
625818 | 0.666667 | |
625897 | 0.666667 | |
203018 | 0.65 | |
625712 | 0.636364 | |
512953 | 0.619048 | |
(10 rows) | |
Time: 2617.169 ms | |
cryst=# SELECT * FROM tanimoto_sim_query('CCOC(=O)N1CCN(CC1)C(=O)c2ccc3c(c2)ncn3C4CCCC4'); | |
molregno | similarity | |
----------+------------ | |
822588 | 1 | |
745437 | 0.695652 | |
980747 | 0.536585 | |
802951 | 0.530612 | |
864727 | 0.530612 | |
(5 rows) | |
Time: 480.698 ms | |
cryst=# SELECT * FROM tanimoto_sim_query('c1cc(ccc1S(=O)(=O)N2C3Cc4c(cn[nH]4)C2CC(C3)/C=CC(F)(F)F)Cl'); | |
molregno | similarity | |
----------+------------ | |
581431 | 1 | |
581422 | 1 | |
581393 | 0.813953 | |
581392 | 0.813953 | |
581308 | 0.767442 | |
581342 | 0.638298 | |
581309 | 0.638298 | |
581301 | 0.638298 | |
581380 | 0.625 | |
581389 | 0.612245 | |
(10 rows) | |
Time: 82.379 ms | |
cryst=# SELECT * FROM tanimoto_sim_query('CN(C)C(=O)c1ccc(cc1)CCC(COc2ccc(cc2)c3cccc(c3)[N+](=O)[O-])N4C(=O)CCC4=O'); | |
molregno | similarity | |
----------+------------ | |
215985 | 1 | |
215699 | 0.846154 | |
215392 | 0.811321 | |
216366 | 0.796296 | |
215950 | 0.773585 | |
215597 | 0.730769 | |
215252 | 0.689655 | |
215859 | 0.655172 | |
215909 | 0.65 | |
215353 | 0.627119 | |
(10 rows) | |
Time: 142.657 ms |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SELECT set_oefp_similarity_limit(0.5, 'tanimoto'); | |
set_oefp_similarity_limit | |
--------------------------- | |
0.5 | |
(1 row) | |
-- TVERSKY PARAMETER | |
SELECT set_oefp_similarity_limit(0.5, 'tversky_alpha'); | |
set_oefp_similarity_limit | |
--------------------------- | |
0.5 | |
(1 row) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Show both current Tversky weights. Each column reads the parameter it is
-- named after (the tversky_alpha column previously queried 'tversky_beta').
SELECT show_oefp_similarity_limit('tversky_alpha') AS tversky_alpha,
       show_oefp_similarity_limit('tversky_beta')  AS tversky_beta;
tversky_alpha | tversky_beta | |
---------------+-------------- | |
1 | 1 | |
(1 row) | |
-- Tversky search over fps: %^ yields the score, %^? filters and <%^> orders,
-- mirroring the Tanimoto operators %, %? and <%>. At most 10 rows are returned.
SELECT
    molregno,
    fp %^ make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3') AS tversky
FROM fps
WHERE fp %^? make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
ORDER BY fp <%^> make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
LIMIT 10;
molregno | tversky | |
----------+---------- | |
1 | 1 | |
54 | 0.756098 | |
79 | 0.725 | |
7 | 0.714286 | |
6 | 0.707317 | |
5 | 0.651163 | |
81 | 0.627907 | |
8 | 0.619048 | |
2 | 0.613636 | |
29 | 0.590909 | |
(10 rows) | |
Time: 119.633 ms | |
-- Tanimoto search over fps for the same query structure. The result is
-- exactly the same as Tversky with alpha=1 and beta=1.
SELECT
    molregno,
    fp % make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3') AS tanimoto
FROM fps
WHERE fp %? make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
ORDER BY fp <%> make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
LIMIT 10;
molregno | tanimoto | |
----------+---------- | |
1 | 1 | |
54 | 0.756098 | |
79 | 0.725 | |
7 | 0.714286 | |
6 | 0.707317 | |
5 | 0.651163 | |
81 | 0.627907 | |
8 | 0.619048 | |
2 | 0.613636 | |
29 | 0.590909 | |
(10 rows) | |
Time: 119.422 ms | |
-- Alpha and beta have to be set in two separate steps!
SELECT set_oefp_similarity_limit(0.5,'tversky_alpha') as tversky_alpha; | |
tversky_alpha | |
--------------- | |
0.5 | |
(1 row) | |
SELECT set_oefp_similarity_limit(0.5,'tversky_beta') as tversky_beta; | |
tversky_beta | |
-------------- | |
0.5 | |
(1 row) | |
-- Tversky search re-run after tversky_alpha and tversky_beta were both set
-- to 0.5 by the preceding statements. At most 10 rows are returned.
SELECT
    molregno,
    fp %^ make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3') AS tversky
FROM fps
WHERE fp %^? make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
ORDER BY fp <%^> make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
LIMIT 10;
molregno | tversky | |
----------+---------- | |
1 | 1 | |
54 | 0.861111 | |
79 | 0.84058 | |
7 | 0.833333 | |
6 | 0.828571 | |
5 | 0.788732 | |
81 | 0.771429 | |
8 | 0.764706 | |
2 | 0.760563 | |
29 | 0.742857 | |
(10 rows) | |
Time: 628.929 ms | |
-- Dice search over fps for the same query structure. The result is the same
-- as Tversky with alpha=0.5 and beta=0.5.
SELECT
    molregno,
    fp # make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3') AS dice
FROM fps
WHERE fp #? make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
ORDER BY fp <#> make_circular_fp('Cc1cc(ccc1C(=O)c2ccccc2Cl)n3c(=O)[nH]c(=O)cn3')
LIMIT 10;
molregno | dice | |
----------+---------- | |
1 | 1 | |
54 | 0.861111 | |
79 | 0.84058 | |
7 | 0.833333 | |
6 | 0.828571 | |
5 | 0.788732 | |
81 | 0.771429 | |
8 | 0.764706 | |
2 | 0.760563 | |
29 | 0.742857 | |
(10 rows) | |
Time: 138.255 ms |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Fetch the SMILES of the chembl row whose molregno equals a randomly drawn
-- whole number in 1..1000000. Returns no row when that molregno is absent.
-- NOTE(review): the draw is wrapped in a scalar subquery, presumably so that
-- random() is evaluated once for the statement rather than per row -- confirm.
SELECT smiles
FROM chembl
WHERE molregno = (
    SELECT trunc(random() * 1000000 + 1)
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment