Last active
April 20, 2022 04:37
-
-
Save adelenelai/2b5d23dcf2c40f157075a97e57fcb320 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "f214cd96", | |
"metadata": {}, | |
"source": [ | |
"## Gist 2 of 3 - Fingerprints\n", | |
"\n", | |
"\n", | |
"https://adelenel.ai/sugarfreecoconut\n", | |
"\n", | |
"20/04/2022" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "45895cf6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pymongo import MongoClient\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "e82a3ecc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"client = MongoClient('localhost',27017)\n", | |
"db = client.COCONUT_2021_11" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e098cc23", | |
"metadata": {}, | |
"source": [ | |
"### Example data from one document" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "fba9b6de", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'_id': ObjectId('61a4de12c52bda1e67b75964'),\n", | |
" 'coconut_id': 'CNP0293916',\n", | |
" 'contains_sugar': 0,\n", | |
" 'heavy_atom_number': 31,\n", | |
" 'inchi': 'InChI=1S/C25H20O6/c1-30-22(29)12-17(15-8-4-2-5-9-15)23-18(26)13-19(27)24-20(28)14-21(31-25(23)24)16-10-6-3-7-11-16/h2-11,13-14,17,26-27H,12H2,1H3',\n", | |
" 'inchikey': 'AAAAWQOPBUPWEV-UHFFFAOYSA-N',\n", | |
" 'smiles': '[H]OC1=C2C(OC(C=3C([H])=C([H])C([H])=C([H])C3[H])=C([H])C2=O)=C(C(O[H])=C1[H])C([H])(C=4C([H])=C([H])C([H])=C([H])C4[H])C([H])([H])C(=O)OC([H])([H])[H]',\n", | |
" 'unique_smiles': 'O=C1C=C(OC=2C1=C(O)C=C(O)C2C(C=3C=CC=CC3)CC(=O)OC)C=4C=CC=CC4',\n", | |
" 'clean_smiles': 'O=C1C=C(OC=2C1=C(O)C=C(O)C2C(C=3C=CC=CC3)CC(=O)OC)C=4C=CC=CC4',\n", | |
" 'sugar_free_smiles': 'O=C1C=C(OC=2C1=C(O)C=C(O)C2C(C=3C=CC=CC3)CC(=O)OC)C=4C=CC=CC4',\n", | |
" 'deep_smiles': '',\n", | |
" 'name': 'methyl 3-(5,7-dihydroxy-4-oxo-2-phenyl-4H-chromen-8-yl)-3-phenylpropanoate',\n", | |
" 'nameTrustLevel': 0,\n", | |
" 'annotationLevel': 1,\n", | |
" 'synonyms': [],\n", | |
" 'cas': '1574471-61-6',\n", | |
" 'iupac_name': 'methyl 3-(5,7-dihydroxy-4-oxo-2-phenyl-4H-chromen-8-yl)-3-phenylpropanoate',\n", | |
" 'contains_ring_sugars': False,\n", | |
" 'contains_linear_sugars': False,\n", | |
" 'collection': [],\n", | |
" 'molecular_formula': 'C25H20O6',\n", | |
" 'molecular_weight': 416.42364206163205,\n", | |
" 'geoLocation': ['nogeo'],\n", | |
" 'npl_noh_score': 1.2130963257944447,\n", | |
" 'npl_score': 0.8972662491004283,\n", | |
" 'npl_sugar_score': 0.9320314771692526,\n", | |
" 'number_of_carbons': 25,\n", | |
" 'number_of_nitrogens': 0,\n", | |
" 'number_of_oxygens': 6,\n", | |
" 'max_number_of_rings': 5,\n", | |
" 'min_number_of_rings': 4,\n", | |
" 'sugar_free_heavy_atom_number': 31,\n", | |
" 'sugar_free_total_atom_number': 51,\n", | |
" 'total_atom_number': 51,\n", | |
" 'bond_count': 34,\n", | |
" 'found_in_databases': ['ibs2019mar_nc', 'pubchem'],\n", | |
" 'xrefs': [['pubchem_tested_np',\n", | |
" '71827163',\n", | |
" 'https://pubchem.ncbi.nlm.nih.gov/compound/']],\n", | |
" 'fragments': {'[H]([O]([C]))': 2,\n", | |
" '[C](=[C]([C][C])[C](=[C][H])[H])': 2,\n", | |
" '[O]([C]([C]=[C])[C]([C]=[C]))': 1,\n", | |
" '[H]([C]([C][C][C]))': 1,\n", | |
" '[O]([C]([C]=[C])[H])': 2,\n", | |
" '[C]([C]([O]=[O])[C]([C][C][H])[H][H])': 1,\n", | |
" '[C](=[C]([C][H])[C](=[C][H])[H])': 6,\n", | |
" '[C](=[C]([C][O])[C](=[C][O])[H])': 1,\n", | |
" '[C]([H][H][H][O]([C]))': 1,\n", | |
" '[C](=[C]([C][H])[C](=[C][H])[C](=[C][O]))': 1,\n", | |
" '[C](=[C]([C][O])[C](=[C][O])[C]([C][C][H]))': 1,\n", | |
" '[C]([C]([C]=[C])[C]([C]=[C])[C]([C][H][H])[H])': 1,\n", | |
" '[O](=[C]([C][O]))': 1,\n", | |
" '[C]([C]([C][H][H])=[O][O]([C]))': 1,\n", | |
" '[C]([C]([C]=[C])=[C]([C][H])[O]([H]))': 1,\n", | |
" '[H]([C]([H][H][O]))': 3,\n", | |
" '[C](=[C]([C][O])[C]([C]=[O])[H])': 1,\n", | |
" '[C](=[C]([C][H])[C](=[C][H])[C]([C][C][H]))': 1,\n", | |
" '[C](=[C]([C][O])[C]([C]=[O])[C](=[C][O]))': 1,\n", | |
" '[O](=[C]([C][C]))': 1,\n", | |
" '[H]([C]([C]=[C]))': 12,\n", | |
" '[C]([C]([C]=[C])[C](=[C][H])=[O])': 1,\n", | |
" '[C](=[C]([C][C])[C](=[C][H])[O]([H]))': 1,\n", | |
" '[C]([C]([C]=[C])=[C]([C][H])[O]([C]))': 1,\n", | |
" '[H]([C]([C][C][H]))': 2,\n", | |
" '[C]([C]([C]=[C])=[C]([C][H])[H])': 2,\n", | |
" '[O]([C]([C]=[O])[C]([H][H][H]))': 1,\n", | |
" '[C](=[C]([C][C])[C]([C]=[C])[O]([C]))': 1},\n", | |
" 'fragmentsWithSugar': {'[H]([O]([C]))': 2,\n", | |
" '[C](=[C]([C][C])[C](=[C][H])[H])': 2,\n", | |
" '[O]([C]([C]=[C])[C]([C]=[C]))': 1,\n", | |
" '[H]([C]([C][C][C]))': 1,\n", | |
" '[O]([C]([C]=[C])[H])': 2,\n", | |
" '[C]([C]([O]=[O])[C]([C][C][H])[H][H])': 1,\n", | |
" '[C](=[C]([C][H])[C](=[C][H])[H])': 6,\n", | |
" '[C](=[C]([C][O])[C](=[C][O])[H])': 1,\n", | |
" '[C]([H][H][H][O]([C]))': 1,\n", | |
" '[C](=[C]([C][H])[C](=[C][H])[C](=[C][O]))': 1,\n", | |
" '[C](=[C]([C][O])[C](=[C][O])[C]([C][C][H]))': 1,\n", | |
" '[C]([C]([C]=[C])[C]([C]=[C])[C]([C][H][H])[H])': 1,\n", | |
" '[O](=[C]([C][O]))': 1,\n", | |
" '[C]([C]([C][H][H])=[O][O]([C]))': 1,\n", | |
" '[C]([C]([C]=[C])=[C]([C][H])[O]([H]))': 1,\n", | |
" '[H]([C]([H][H][O]))': 3,\n", | |
" '[C](=[C]([C][O])[C]([C]=[O])[H])': 1,\n", | |
" '[C](=[C]([C][H])[C](=[C][H])[C]([C][C][H]))': 1,\n", | |
" '[C](=[C]([C][O])[C]([C]=[O])[C](=[C][O]))': 1,\n", | |
" '[O](=[C]([C][C]))': 1,\n", | |
" '[H]([C]([C]=[C]))': 12,\n", | |
" '[C]([C]([C]=[C])[C](=[C][H])=[O])': 1,\n", | |
" '[C](=[C]([C][C])[C](=[C][H])[O]([H]))': 1,\n", | |
" '[C]([C]([C]=[C])=[C]([C][H])[O]([C]))': 1,\n", | |
" '[H]([C]([C][C][H]))': 2,\n", | |
" '[C]([C]([C]=[C])=[C]([C][H])[H])': 2,\n", | |
" '[O]([C]([C]=[O])[C]([H][H][H]))': 1,\n", | |
" '[C](=[C]([C][C])[C]([C]=[C])[O]([C]))': 1},\n", | |
" 'murko_framework': 'O1C(=CCc2cccc(c12)Cc3ccccc3)c4ccccc4',\n", | |
" 'ertlFunctionalFragments': {'*o*': 1,\n", | |
" '*OC(*)=O': 1,\n", | |
" '[H]O[c]': 2,\n", | |
" '[c]=O': 1},\n", | |
" 'ertlFunctionalFragmentsPseudoSmiles': {'[C*]=O': 1,\n", | |
" 'RO*R': 1,\n", | |
" '[H]O[C*]': 2,\n", | |
" 'ROC(R)=O': 1},\n", | |
" 'pubchemFingerprint': [9,\n", | |
" 10,\n", | |
" 11,\n", | |
" 12,\n", | |
" 18,\n", | |
" 19,\n", | |
" 20,\n", | |
" 178,\n", | |
" 179,\n", | |
" 181,\n", | |
" 185,\n", | |
" 186,\n", | |
" 192,\n", | |
" 193,\n", | |
" 199,\n", | |
" 255,\n", | |
" 256,\n", | |
" 257,\n", | |
" 259,\n", | |
" 261,\n", | |
" 283,\n", | |
" 284,\n", | |
" 286,\n", | |
" 308,\n", | |
" 332,\n", | |
" 333,\n", | |
" 335,\n", | |
" 341,\n", | |
" 344,\n", | |
" 352,\n", | |
" 355,\n", | |
" 356,\n", | |
" 366,\n", | |
" 370,\n", | |
" 371,\n", | |
" 374,\n", | |
" 380,\n", | |
" 381,\n", | |
" 382,\n", | |
" 384,\n", | |
" 385,\n", | |
" 405,\n", | |
" 406,\n", | |
" 409,\n", | |
" 416,\n", | |
" 420,\n", | |
" 430,\n", | |
" 432,\n", | |
" 434,\n", | |
" 440,\n", | |
" 441,\n", | |
" 443,\n", | |
" 446,\n", | |
" 452,\n", | |
" 470,\n", | |
" 476,\n", | |
" 490,\n", | |
" 493,\n", | |
" 498,\n", | |
" 516,\n", | |
" 520,\n", | |
" 524,\n", | |
" 535,\n", | |
" 541,\n", | |
" 542,\n", | |
" 548,\n", | |
" 552,\n", | |
" 553,\n", | |
" 556,\n", | |
" 564,\n", | |
" 565,\n", | |
" 570,\n", | |
" 573,\n", | |
" 574,\n", | |
" 575,\n", | |
" 578,\n", | |
" 579,\n", | |
" 581,\n", | |
" 582,\n", | |
" 584,\n", | |
" 588,\n", | |
" 589,\n", | |
" 590,\n", | |
" 594,\n", | |
" 595,\n", | |
" 597,\n", | |
" 599,\n", | |
" 603,\n", | |
" 604,\n", | |
" 606,\n", | |
" 608,\n", | |
" 614,\n", | |
" 617,\n", | |
" 618,\n", | |
" 619,\n", | |
" 620,\n", | |
" 623,\n", | |
" 625,\n", | |
" 626,\n", | |
" 632,\n", | |
" 634,\n", | |
" 637,\n", | |
" 639,\n", | |
" 640,\n", | |
" 641,\n", | |
" 642,\n", | |
" 651,\n", | |
" 653,\n", | |
" 655,\n", | |
" 660,\n", | |
" 664,\n", | |
" 666,\n", | |
" 667,\n", | |
" 668,\n", | |
" 671,\n", | |
" 672,\n", | |
" 677,\n", | |
" 678,\n", | |
" 679,\n", | |
" 680,\n", | |
" 684,\n", | |
" 688,\n", | |
" 689,\n", | |
" 690,\n", | |
" 692,\n", | |
" 693,\n", | |
" 694,\n", | |
" 696,\n", | |
" 697,\n", | |
" 698,\n", | |
" 699,\n", | |
" 700,\n", | |
" 701,\n", | |
" 704,\n", | |
" 705,\n", | |
" 706,\n", | |
" 708,\n", | |
" 709,\n", | |
" 710,\n", | |
" 712,\n", | |
" 714,\n", | |
" 734,\n", | |
" 740,\n", | |
" 756,\n", | |
" 777,\n", | |
" 797,\n", | |
" 803,\n", | |
" 819],\n", | |
" 'pfCounts': {'count': 148,\n", | |
" 'bits': [9,\n", | |
" 10,\n", | |
" 11,\n", | |
" 12,\n", | |
" 18,\n", | |
" 19,\n", | |
" 20,\n", | |
" 178,\n", | |
" 179,\n", | |
" 181,\n", | |
" 185,\n", | |
" 186,\n", | |
" 192,\n", | |
" 193,\n", | |
" 199,\n", | |
" 255,\n", | |
" 256,\n", | |
" 257,\n", | |
" 259,\n", | |
" 261,\n", | |
" 283,\n", | |
" 284,\n", | |
" 286,\n", | |
" 308,\n", | |
" 332,\n", | |
" 333,\n", | |
" 335,\n", | |
" 341,\n", | |
" 344,\n", | |
" 352,\n", | |
" 355,\n", | |
" 356,\n", | |
" 366,\n", | |
" 370,\n", | |
" 371,\n", | |
" 374,\n", | |
" 380,\n", | |
" 381,\n", | |
" 382,\n", | |
" 384,\n", | |
" 385,\n", | |
" 405,\n", | |
" 406,\n", | |
" 409,\n", | |
" 416,\n", | |
" 420,\n", | |
" 430,\n", | |
" 432,\n", | |
" 434,\n", | |
" 440,\n", | |
" 441,\n", | |
" 443,\n", | |
" 446,\n", | |
" 452,\n", | |
" 470,\n", | |
" 476,\n", | |
" 490,\n", | |
" 493,\n", | |
" 498,\n", | |
" 516,\n", | |
" 520,\n", | |
" 524,\n", | |
" 535,\n", | |
" 541,\n", | |
" 542,\n", | |
" 548,\n", | |
" 552,\n", | |
" 553,\n", | |
" 556,\n", | |
" 564,\n", | |
" 565,\n", | |
" 570,\n", | |
" 573,\n", | |
" 574,\n", | |
" 575,\n", | |
" 578,\n", | |
" 579,\n", | |
" 581,\n", | |
" 582,\n", | |
" 584,\n", | |
" 588,\n", | |
" 589,\n", | |
" 590,\n", | |
" 594,\n", | |
" 595,\n", | |
" 597,\n", | |
" 599,\n", | |
" 603,\n", | |
" 604,\n", | |
" 606,\n", | |
" 608,\n", | |
" 614,\n", | |
" 617,\n", | |
" 618,\n", | |
" 619,\n", | |
" 620,\n", | |
" 623,\n", | |
" 625,\n", | |
" 626,\n", | |
" 632,\n", | |
" 634,\n", | |
" 637,\n", | |
" 639,\n", | |
" 640,\n", | |
" 641,\n", | |
" 642,\n", | |
" 651,\n", | |
" 653,\n", | |
" 655,\n", | |
" 660,\n", | |
" 664,\n", | |
" 666,\n", | |
" 667,\n", | |
" 668,\n", | |
" 671,\n", | |
" 672,\n", | |
" 677,\n", | |
" 678,\n", | |
" 679,\n", | |
" 680,\n", | |
" 684,\n", | |
" 688,\n", | |
" 689,\n", | |
" 690,\n", | |
" 692,\n", | |
" 693,\n", | |
" 694,\n", | |
" 696,\n", | |
" 697,\n", | |
" 698,\n", | |
" 699,\n", | |
" 700,\n", | |
" 701,\n", | |
" 704,\n", | |
" 705,\n", | |
" 706,\n", | |
" 708,\n", | |
" 709,\n", | |
" 710,\n", | |
" 712,\n", | |
" 714,\n", | |
" 734,\n", | |
" 740,\n", | |
" 756,\n", | |
" 777,\n", | |
" 797,\n", | |
" 803,\n", | |
" 819]},\n", | |
" 'circularFingerprint': [19,\n", | |
" 41,\n", | |
" 43,\n", | |
" 76,\n", | |
" 95,\n", | |
" 133,\n", | |
" 142,\n", | |
" 152,\n", | |
" 166,\n", | |
" 206,\n", | |
" 222,\n", | |
" 244,\n", | |
" 250,\n", | |
" 255,\n", | |
" 299,\n", | |
" 334,\n", | |
" 346,\n", | |
" 352,\n", | |
" 354,\n", | |
" 378,\n", | |
" 420,\n", | |
" 445,\n", | |
" 452,\n", | |
" 460,\n", | |
" 486,\n", | |
" 494,\n", | |
" 507,\n", | |
" 510,\n", | |
" 512,\n", | |
" 514,\n", | |
" 548,\n", | |
" 549,\n", | |
" 569,\n", | |
" 572,\n", | |
" 585,\n", | |
" 618,\n", | |
" 625,\n", | |
" 639,\n", | |
" 646,\n", | |
" 660,\n", | |
" 682,\n", | |
" 726,\n", | |
" 747,\n", | |
" 822,\n", | |
" 898,\n", | |
" 947,\n", | |
" 967,\n", | |
" 986],\n", | |
" 'extendedFingerprint': [0,\n", | |
" 4,\n", | |
" 9,\n", | |
" 11,\n", | |
" 13,\n", | |
" 24,\n", | |
" 28,\n", | |
" 40,\n", | |
" 41,\n", | |
" 52,\n", | |
" 53,\n", | |
" 61,\n", | |
" 64,\n", | |
" 65,\n", | |
" 68,\n", | |
" 74,\n", | |
" 75,\n", | |
" 78,\n", | |
" 83,\n", | |
" 85,\n", | |
" 88,\n", | |
" 92,\n", | |
" 96,\n", | |
" 105,\n", | |
" 106,\n", | |
" 110,\n", | |
" 112,\n", | |
" 115,\n", | |
" 118,\n", | |
" 122,\n", | |
" 123,\n", | |
" 124,\n", | |
" 125,\n", | |
" 133,\n", | |
" 136,\n", | |
" 140,\n", | |
" 142,\n", | |
" 146,\n", | |
" 150,\n", | |
" 152,\n", | |
" 159,\n", | |
" 161,\n", | |
" 163,\n", | |
" 164,\n", | |
" 166,\n", | |
" 167,\n", | |
" 170,\n", | |
" 173,\n", | |
" 175,\n", | |
" 181,\n", | |
" 183,\n", | |
" 187,\n", | |
" 191,\n", | |
" 192,\n", | |
" 197,\n", | |
" 212,\n", | |
" 231,\n", | |
" 233,\n", | |
" 235,\n", | |
" 236,\n", | |
" 240,\n", | |
" 243,\n", | |
" 246,\n", | |
" 249,\n", | |
" 250,\n", | |
" 251,\n", | |
" 252,\n", | |
" 259,\n", | |
" 266,\n", | |
" 271,\n", | |
" 280,\n", | |
" 282,\n", | |
" 286,\n", | |
" 288,\n", | |
" 291,\n", | |
" 298,\n", | |
" 299,\n", | |
" 301,\n", | |
" 304,\n", | |
" 313,\n", | |
" 319,\n", | |
" 320,\n", | |
" 325,\n", | |
" 330,\n", | |
" 337,\n", | |
" 344,\n", | |
" 350,\n", | |
" 351,\n", | |
" 358,\n", | |
" 374,\n", | |
" 377,\n", | |
" 386,\n", | |
" 388,\n", | |
" 398,\n", | |
" 399,\n", | |
" 402,\n", | |
" 404,\n", | |
" 414,\n", | |
" 416,\n", | |
" 423,\n", | |
" 440,\n", | |
" 446,\n", | |
" 450,\n", | |
" 453,\n", | |
" 455,\n", | |
" 467,\n", | |
" 476,\n", | |
" 481,\n", | |
" 489,\n", | |
" 494,\n", | |
" 507,\n", | |
" 541,\n", | |
" 543,\n", | |
" 544,\n", | |
" 547,\n", | |
" 549,\n", | |
" 550,\n", | |
" 553,\n", | |
" 557,\n", | |
" 573,\n", | |
" 574,\n", | |
" 578,\n", | |
" 579,\n", | |
" 596,\n", | |
" 597,\n", | |
" 605,\n", | |
" 606,\n", | |
" 610,\n", | |
" 611,\n", | |
" 612,\n", | |
" 618,\n", | |
" 627,\n", | |
" 638,\n", | |
" 644,\n", | |
" 650,\n", | |
" 651,\n", | |
" 656,\n", | |
" 660,\n", | |
" 661,\n", | |
" 666,\n", | |
" 676,\n", | |
" 678,\n", | |
" 688,\n", | |
" 703,\n", | |
" 711,\n", | |
" 712,\n", | |
" 718,\n", | |
" 723,\n", | |
" 730,\n", | |
" 741,\n", | |
" 745,\n", | |
" 747,\n", | |
" 749,\n", | |
" 751,\n", | |
" 752,\n", | |
" 759,\n", | |
" 761,\n", | |
" 765,\n", | |
" 781,\n", | |
" 785,\n", | |
" 786,\n", | |
" 793,\n", | |
" 796,\n", | |
" 810,\n", | |
" 812,\n", | |
" 815,\n", | |
" 818,\n", | |
" 819,\n", | |
" 826,\n", | |
" 833,\n", | |
" 840,\n", | |
" 842,\n", | |
" 846,\n", | |
" 848,\n", | |
" 852,\n", | |
" 854,\n", | |
" 857,\n", | |
" 868,\n", | |
" 869,\n", | |
" 901,\n", | |
" 907,\n", | |
" 926,\n", | |
" 930,\n", | |
" 938,\n", | |
" 939,\n", | |
" 940,\n", | |
" 949,\n", | |
" 952,\n", | |
" 967,\n", | |
" 971,\n", | |
" 976,\n", | |
" 989,\n", | |
" 995,\n", | |
" 1009,\n", | |
" 1010,\n", | |
" 1011,\n", | |
" 1012,\n", | |
" 1013,\n", | |
" 1014],\n", | |
" 'pubchemBits': b'\\x00\\x1e\\x1c\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00,\\x06\\x83\\x00\\x00\\x00\\x00\\x00\\x00\\x80+\\x00\\x00X\\x00\\x00\\x10\\x00\\x00\\xb0 \\x01\\x19@Lp\\x03\\x00`\\x02\\x11@\\x05K\\x10\\x00@\\x10\\x00$\\x04\\x00\\x10\\x11\\x80`\\x10\\x130\\xe4lq\\xacXA\\x9e\\x06\\xa5\\x07\\xa8\\x10\\x9d\\xe1\\x11w?w\\x05\\x00@\\x10\\x00\\x10\\x00\\x00\\x02\\x00 \\x08\\x00\\x08',\n", | |
" 'pubchemBitsString': '00000000011110000011100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001101000110000011000001000000000000000000000000000000000000000000000000000000011101010000000000000000000001101000000000000000000000100000000000000000000000110100000100100000001001100000000010001100100000111011000000000000000000011001000000100010000000001010100000110100100000100000000000000000100000100000000000001001000010000000000000000010001000100000000001000001100000100011001000000011000010011100110110100011100011010100011010100000100111100101100000101001011110000000010101000010001011100110000111100010001110111011111100111011101010000000000000000000100000100000000000000010000000000000000000010000000000000000000100000100000000000000010',\n", | |
" 'citationDOI': [],\n", | |
" 'taxid': [],\n", | |
" 'textTaxa': ['notax'],\n", | |
" 'chemicalSuperClass': 'Phenylpropanoids and polyketides',\n", | |
" 'chemicalClass': 'Diarylheptanoids',\n", | |
" 'chemicalSubClass': 'Linear diarylheptanoids',\n", | |
" 'directParentClassification': 'Linear diarylheptanoids',\n", | |
" 'allChemClassifications': [],\n", | |
" 'taxonomyReferenceObjects': [],\n", | |
" 'allTaxa': [],\n", | |
" 'absolute_smiles_sources': {'nostereo': ['ibs2019mar_nc']},\n", | |
" 'absolute_smiles': {},\n", | |
" 'allWikidataIds': [],\n", | |
" 'alogp': 1.5698999999999994,\n", | |
" 'alogp2': 2.4645860099999983,\n", | |
" 'amralogp': 127.32220000000001,\n", | |
" 'apol': 62.147859999999966,\n", | |
" 'bcutDescriptor': [11.850000000000007,\n", | |
" 15.999952464292491,\n", | |
" -0.36071906962798755,\n", | |
" 0.2622629421493053,\n", | |
" 4.61844183790179,\n", | |
" 12.027281211337542],\n", | |
" 'bpol': 27.612139999999993,\n", | |
" 'eccentricConnectivityIndexDescriptor': 620,\n", | |
" 'fmfDescriptor': 0.7419354838709677,\n", | |
" 'fsp3': 0.12,\n", | |
" 'fragmentComplexityDescriptor': 1986.06,\n", | |
" 'gravitationalIndexHeavyAtoms': nan,\n", | |
" 'hBondAcceptorCount': 6,\n", | |
" 'hBondDonorCount': 2,\n", | |
" 'hybridizationRatioDescriptor': 0.12,\n", | |
" 'kappaShapeIndex1': 24.134948096885815,\n", | |
" 'kappaShapeIndex2': 10.950520833333334,\n", | |
" 'kappaShapeIndex3': 5.3994490358126725,\n", | |
" 'manholdlogp': 3.55,\n", | |
" 'petitjeanNumber': 0.5,\n", | |
" 'petitjeanShapeTopo': 1.0,\n", | |
" 'petitjeanShapeGeom': nan,\n", | |
" 'lipinskiRuleOf5Failures': 0,\n", | |
" 'numberSpiroAtoms': 0,\n", | |
" 'vabcDescriptor': nan,\n", | |
" 'vertexAdjMagnitude': 6.08746284125034,\n", | |
" 'weinerPathNumber': 2424.0,\n", | |
" 'weinerPolarityNumber': 54.0,\n", | |
" 'xlogp': 3.229,\n", | |
" 'zagrebIndex': 164.0,\n", | |
" 'topoPSA': 93.06000000000002,\n", | |
" 'tpsaEfficiency': 0.22363419389156305,\n", | |
" '_class': 'de.unijena.cheminf.npopensourcecollector.mongocollections.UniqueNaturalProduct'}" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"db.uniqueNaturalProduct.find({})[1]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c7917972", | |
"metadata": {}, | |
"source": [ | |
"### Natural Product Fingerprints " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "2d5bf4de", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cursor = db.uniqueNaturalProduct.find({},{'coconut_id':1,'circularFingerprint':1, 'extendedFingerprint':1 ,\n", | |
" 'pubchemFingerprint':1, 'pubchemBits':1, 'pubchemBitsString':1,'_id':0})\n", | |
"list_cur = list(cursor)\n", | |
"df = pd.DataFrame(list_cur)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "00a6e36b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>coconut_id</th>\n", | |
" <th>pubchemFingerprint</th>\n", | |
" <th>circularFingerprint</th>\n", | |
" <th>extendedFingerprint</th>\n", | |
" <th>pubchemBits</th>\n", | |
" <th>pubchemBitsString</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>CNP0220816</td>\n", | |
" <td>[9, 10, 11, 12, 18, 19, 20, 21, 143, 147, 178,...</td>\n", | |
" <td>[14, 88, 133, 152, 169, 219, 222, 236, 244, 31...</td>\n", | |
" <td>[8, 9, 13, 14, 18, 28, 32, 37, 41, 53, 58, 61,...</td>\n", | |
" <td>b'\\x00\\x1e<\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x0...</td>\n", | |
" <td>0000000001111000001111000000000000000000000000...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>CNP0293916</td>\n", | |
" <td>[9, 10, 11, 12, 18, 19, 20, 178, 179, 181, 185...</td>\n", | |
" <td>[19, 41, 43, 76, 95, 133, 142, 152, 166, 206, ...</td>\n", | |
" <td>[0, 4, 9, 11, 13, 24, 28, 40, 41, 52, 53, 61, ...</td>\n", | |
" <td>b'\\x00\\x1e\\x1c\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00...</td>\n", | |
" <td>0000000001111000001110000000000000000000000000...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>CNP0297651</td>\n", | |
" <td>[9, 10, 11, 12, 14, 18, 19, 20, 21, 178, 181, ...</td>\n", | |
" <td>[11, 14, 19, 25, 34, 57, 70, 82, 88, 133, 152,...</td>\n", | |
" <td>[8, 14, 18, 26, 32, 37, 53, 55, 62, 65, 68, 78...</td>\n", | |
" <td>b'\\x00^<\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x...</td>\n", | |
" <td>0000000001111010001111000000000000000000000000...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>CNP0330764</td>\n", | |
" <td>[9, 10, 11, 12, 18, 19, 20, 143, 144, 178, 179...</td>\n", | |
" <td>[10, 14, 19, 25, 57, 88, 93, 133, 192, 220, 22...</td>\n", | |
" <td>[8, 14, 58, 65, 68, 74, 82, 87, 92, 117, 118, ...</td>\n", | |
" <td>b'\\x00\\x1e\\x1c\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00...</td>\n", | |
" <td>0000000001111000001110000000000000000000000000...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>CNP0125332</td>\n", | |
" <td>[9, 10, 18, 19, 20, 21, 30, 31, 143, 146, 283,...</td>\n", | |
" <td>[23, 53, 56, 88, 152, 169, 265, 338, 346, 368,...</td>\n", | |
" <td>[5, 8, 18, 20, 22, 25, 36, 65, 85, 115, 118, 1...</td>\n", | |
" <td>b'\\x00\\x06<\\xc0\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x0...</td>\n", | |
" <td>0000000001100000001111000000001100000000000000...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" coconut_id pubchemFingerprint \\\n", | |
"0 CNP0220816 [9, 10, 11, 12, 18, 19, 20, 21, 143, 147, 178,... \n", | |
"1 CNP0293916 [9, 10, 11, 12, 18, 19, 20, 178, 179, 181, 185... \n", | |
"2 CNP0297651 [9, 10, 11, 12, 14, 18, 19, 20, 21, 178, 181, ... \n", | |
"3 CNP0330764 [9, 10, 11, 12, 18, 19, 20, 143, 144, 178, 179... \n", | |
"4 CNP0125332 [9, 10, 18, 19, 20, 21, 30, 31, 143, 146, 283,... \n", | |
"\n", | |
" circularFingerprint \\\n", | |
"0 [14, 88, 133, 152, 169, 219, 222, 236, 244, 31... \n", | |
"1 [19, 41, 43, 76, 95, 133, 142, 152, 166, 206, ... \n", | |
"2 [11, 14, 19, 25, 34, 57, 70, 82, 88, 133, 152,... \n", | |
"3 [10, 14, 19, 25, 57, 88, 93, 133, 192, 220, 22... \n", | |
"4 [23, 53, 56, 88, 152, 169, 265, 338, 346, 368,... \n", | |
"\n", | |
" extendedFingerprint \\\n", | |
"0 [8, 9, 13, 14, 18, 28, 32, 37, 41, 53, 58, 61,... \n", | |
"1 [0, 4, 9, 11, 13, 24, 28, 40, 41, 52, 53, 61, ... \n", | |
"2 [8, 14, 18, 26, 32, 37, 53, 55, 62, 65, 68, 78... \n", | |
"3 [8, 14, 58, 65, 68, 74, 82, 87, 92, 117, 118, ... \n", | |
"4 [5, 8, 18, 20, 22, 25, 36, 65, 85, 115, 118, 1... \n", | |
"\n", | |
" pubchemBits \\\n", | |
"0 b'\\x00\\x1e<\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x0... \n", | |
"1 b'\\x00\\x1e\\x1c\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00... \n", | |
"2 b'\\x00^<\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x... \n", | |
"3 b'\\x00\\x1e\\x1c\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00... \n", | |
"4 b'\\x00\\x06<\\xc0\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x0... \n", | |
"\n", | |
" pubchemBitsString \n", | |
"0 0000000001111000001111000000000000000000000000... \n", | |
"1 0000000001111000001110000000000000000000000000... \n", | |
"2 0000000001111010001111000000000000000000000000... \n", | |
"3 0000000001111000001110000000000000000000000000... \n", | |
"4 0000000001100000001111000000001100000000000000... " | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "0c423866", | |
"metadata": {}, | |
"source": [ | |
"### Download Fingerprints as CSV" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "7de3a635", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Run this to write all fingerprints to coconut_fingerprints.csv in the current working directory (row index omitted)\n", | |
"df.to_csv('coconut_fingerprints.csv',index=False)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment