Skip to content

Instantly share code, notes, and snippets.

@adelenelai
Last active April 20, 2022 04:37
Show Gist options
  • Save adelenelai/2b5d23dcf2c40f157075a97e57fcb320 to your computer and use it in GitHub Desktop.
Save adelenelai/2b5d23dcf2c40f157075a97e57fcb320 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "f214cd96",
"metadata": {},
"source": [
"## Gist 2 of 3 - Fingerprints\n",
"\n",
"\n",
"https://adelenel.ai/sugarfreecoconut\n",
"\n",
"20/04/2022"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "45895cf6",
"metadata": {},
"outputs": [],
"source": [
"from pymongo import MongoClient\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e82a3ecc",
"metadata": {},
"outputs": [],
"source": [
"# Connection settings for the MongoDB server and the database queried below\n",
"MONGO_HOST = 'localhost'\n",
"MONGO_PORT = 27017\n",
"DB_NAME = 'COCONUT_2021_11'\n",
"\n",
"client = MongoClient(host=MONGO_HOST, port=MONGO_PORT)\n",
"db = client[DB_NAME]"
]
},
{
"cell_type": "markdown",
"id": "e098cc23",
"metadata": {},
"source": [
"### Example data from one document"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fba9b6de",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'_id': ObjectId('61a4de12c52bda1e67b75964'),\n",
" 'coconut_id': 'CNP0293916',\n",
" 'contains_sugar': 0,\n",
" 'heavy_atom_number': 31,\n",
" 'inchi': 'InChI=1S/C25H20O6/c1-30-22(29)12-17(15-8-4-2-5-9-15)23-18(26)13-19(27)24-20(28)14-21(31-25(23)24)16-10-6-3-7-11-16/h2-11,13-14,17,26-27H,12H2,1H3',\n",
" 'inchikey': 'AAAAWQOPBUPWEV-UHFFFAOYSA-N',\n",
" 'smiles': '[H]OC1=C2C(OC(C=3C([H])=C([H])C([H])=C([H])C3[H])=C([H])C2=O)=C(C(O[H])=C1[H])C([H])(C=4C([H])=C([H])C([H])=C([H])C4[H])C([H])([H])C(=O)OC([H])([H])[H]',\n",
" 'unique_smiles': 'O=C1C=C(OC=2C1=C(O)C=C(O)C2C(C=3C=CC=CC3)CC(=O)OC)C=4C=CC=CC4',\n",
" 'clean_smiles': 'O=C1C=C(OC=2C1=C(O)C=C(O)C2C(C=3C=CC=CC3)CC(=O)OC)C=4C=CC=CC4',\n",
" 'sugar_free_smiles': 'O=C1C=C(OC=2C1=C(O)C=C(O)C2C(C=3C=CC=CC3)CC(=O)OC)C=4C=CC=CC4',\n",
" 'deep_smiles': '',\n",
" 'name': 'methyl 3-(5,7-dihydroxy-4-oxo-2-phenyl-4H-chromen-8-yl)-3-phenylpropanoate',\n",
" 'nameTrustLevel': 0,\n",
" 'annotationLevel': 1,\n",
" 'synonyms': [],\n",
" 'cas': '1574471-61-6',\n",
" 'iupac_name': 'methyl 3-(5,7-dihydroxy-4-oxo-2-phenyl-4H-chromen-8-yl)-3-phenylpropanoate',\n",
" 'contains_ring_sugars': False,\n",
" 'contains_linear_sugars': False,\n",
" 'collection': [],\n",
" 'molecular_formula': 'C25H20O6',\n",
" 'molecular_weight': 416.42364206163205,\n",
" 'geoLocation': ['nogeo'],\n",
" 'npl_noh_score': 1.2130963257944447,\n",
" 'npl_score': 0.8972662491004283,\n",
" 'npl_sugar_score': 0.9320314771692526,\n",
" 'number_of_carbons': 25,\n",
" 'number_of_nitrogens': 0,\n",
" 'number_of_oxygens': 6,\n",
" 'max_number_of_rings': 5,\n",
" 'min_number_of_rings': 4,\n",
" 'sugar_free_heavy_atom_number': 31,\n",
" 'sugar_free_total_atom_number': 51,\n",
" 'total_atom_number': 51,\n",
" 'bond_count': 34,\n",
" 'found_in_databases': ['ibs2019mar_nc', 'pubchem'],\n",
" 'xrefs': [['pubchem_tested_np',\n",
" '71827163',\n",
" 'https://pubchem.ncbi.nlm.nih.gov/compound/']],\n",
" 'fragments': {'[H]([O]([C]))': 2,\n",
" '[C](=[C]([C][C])[C](=[C][H])[H])': 2,\n",
" '[O]([C]([C]=[C])[C]([C]=[C]))': 1,\n",
" '[H]([C]([C][C][C]))': 1,\n",
" '[O]([C]([C]=[C])[H])': 2,\n",
" '[C]([C]([O]=[O])[C]([C][C][H])[H][H])': 1,\n",
" '[C](=[C]([C][H])[C](=[C][H])[H])': 6,\n",
" '[C](=[C]([C][O])[C](=[C][O])[H])': 1,\n",
" '[C]([H][H][H][O]([C]))': 1,\n",
" '[C](=[C]([C][H])[C](=[C][H])[C](=[C][O]))': 1,\n",
" '[C](=[C]([C][O])[C](=[C][O])[C]([C][C][H]))': 1,\n",
" '[C]([C]([C]=[C])[C]([C]=[C])[C]([C][H][H])[H])': 1,\n",
" '[O](=[C]([C][O]))': 1,\n",
" '[C]([C]([C][H][H])=[O][O]([C]))': 1,\n",
" '[C]([C]([C]=[C])=[C]([C][H])[O]([H]))': 1,\n",
" '[H]([C]([H][H][O]))': 3,\n",
" '[C](=[C]([C][O])[C]([C]=[O])[H])': 1,\n",
" '[C](=[C]([C][H])[C](=[C][H])[C]([C][C][H]))': 1,\n",
" '[C](=[C]([C][O])[C]([C]=[O])[C](=[C][O]))': 1,\n",
" '[O](=[C]([C][C]))': 1,\n",
" '[H]([C]([C]=[C]))': 12,\n",
" '[C]([C]([C]=[C])[C](=[C][H])=[O])': 1,\n",
" '[C](=[C]([C][C])[C](=[C][H])[O]([H]))': 1,\n",
" '[C]([C]([C]=[C])=[C]([C][H])[O]([C]))': 1,\n",
" '[H]([C]([C][C][H]))': 2,\n",
" '[C]([C]([C]=[C])=[C]([C][H])[H])': 2,\n",
" '[O]([C]([C]=[O])[C]([H][H][H]))': 1,\n",
" '[C](=[C]([C][C])[C]([C]=[C])[O]([C]))': 1},\n",
" 'fragmentsWithSugar': {'[H]([O]([C]))': 2,\n",
" '[C](=[C]([C][C])[C](=[C][H])[H])': 2,\n",
" '[O]([C]([C]=[C])[C]([C]=[C]))': 1,\n",
" '[H]([C]([C][C][C]))': 1,\n",
" '[O]([C]([C]=[C])[H])': 2,\n",
" '[C]([C]([O]=[O])[C]([C][C][H])[H][H])': 1,\n",
" '[C](=[C]([C][H])[C](=[C][H])[H])': 6,\n",
" '[C](=[C]([C][O])[C](=[C][O])[H])': 1,\n",
" '[C]([H][H][H][O]([C]))': 1,\n",
" '[C](=[C]([C][H])[C](=[C][H])[C](=[C][O]))': 1,\n",
" '[C](=[C]([C][O])[C](=[C][O])[C]([C][C][H]))': 1,\n",
" '[C]([C]([C]=[C])[C]([C]=[C])[C]([C][H][H])[H])': 1,\n",
" '[O](=[C]([C][O]))': 1,\n",
" '[C]([C]([C][H][H])=[O][O]([C]))': 1,\n",
" '[C]([C]([C]=[C])=[C]([C][H])[O]([H]))': 1,\n",
" '[H]([C]([H][H][O]))': 3,\n",
" '[C](=[C]([C][O])[C]([C]=[O])[H])': 1,\n",
" '[C](=[C]([C][H])[C](=[C][H])[C]([C][C][H]))': 1,\n",
" '[C](=[C]([C][O])[C]([C]=[O])[C](=[C][O]))': 1,\n",
" '[O](=[C]([C][C]))': 1,\n",
" '[H]([C]([C]=[C]))': 12,\n",
" '[C]([C]([C]=[C])[C](=[C][H])=[O])': 1,\n",
" '[C](=[C]([C][C])[C](=[C][H])[O]([H]))': 1,\n",
" '[C]([C]([C]=[C])=[C]([C][H])[O]([C]))': 1,\n",
" '[H]([C]([C][C][H]))': 2,\n",
" '[C]([C]([C]=[C])=[C]([C][H])[H])': 2,\n",
" '[O]([C]([C]=[O])[C]([H][H][H]))': 1,\n",
" '[C](=[C]([C][C])[C]([C]=[C])[O]([C]))': 1},\n",
" 'murko_framework': 'O1C(=CCc2cccc(c12)Cc3ccccc3)c4ccccc4',\n",
" 'ertlFunctionalFragments': {'*o*': 1,\n",
" '*OC(*)=O': 1,\n",
" '[H]O[c]': 2,\n",
" '[c]=O': 1},\n",
" 'ertlFunctionalFragmentsPseudoSmiles': {'[C*]=O': 1,\n",
" 'RO*R': 1,\n",
" '[H]O[C*]': 2,\n",
" 'ROC(R)=O': 1},\n",
" 'pubchemFingerprint': [9,\n",
" 10,\n",
" 11,\n",
" 12,\n",
" 18,\n",
" 19,\n",
" 20,\n",
" 178,\n",
" 179,\n",
" 181,\n",
" 185,\n",
" 186,\n",
" 192,\n",
" 193,\n",
" 199,\n",
" 255,\n",
" 256,\n",
" 257,\n",
" 259,\n",
" 261,\n",
" 283,\n",
" 284,\n",
" 286,\n",
" 308,\n",
" 332,\n",
" 333,\n",
" 335,\n",
" 341,\n",
" 344,\n",
" 352,\n",
" 355,\n",
" 356,\n",
" 366,\n",
" 370,\n",
" 371,\n",
" 374,\n",
" 380,\n",
" 381,\n",
" 382,\n",
" 384,\n",
" 385,\n",
" 405,\n",
" 406,\n",
" 409,\n",
" 416,\n",
" 420,\n",
" 430,\n",
" 432,\n",
" 434,\n",
" 440,\n",
" 441,\n",
" 443,\n",
" 446,\n",
" 452,\n",
" 470,\n",
" 476,\n",
" 490,\n",
" 493,\n",
" 498,\n",
" 516,\n",
" 520,\n",
" 524,\n",
" 535,\n",
" 541,\n",
" 542,\n",
" 548,\n",
" 552,\n",
" 553,\n",
" 556,\n",
" 564,\n",
" 565,\n",
" 570,\n",
" 573,\n",
" 574,\n",
" 575,\n",
" 578,\n",
" 579,\n",
" 581,\n",
" 582,\n",
" 584,\n",
" 588,\n",
" 589,\n",
" 590,\n",
" 594,\n",
" 595,\n",
" 597,\n",
" 599,\n",
" 603,\n",
" 604,\n",
" 606,\n",
" 608,\n",
" 614,\n",
" 617,\n",
" 618,\n",
" 619,\n",
" 620,\n",
" 623,\n",
" 625,\n",
" 626,\n",
" 632,\n",
" 634,\n",
" 637,\n",
" 639,\n",
" 640,\n",
" 641,\n",
" 642,\n",
" 651,\n",
" 653,\n",
" 655,\n",
" 660,\n",
" 664,\n",
" 666,\n",
" 667,\n",
" 668,\n",
" 671,\n",
" 672,\n",
" 677,\n",
" 678,\n",
" 679,\n",
" 680,\n",
" 684,\n",
" 688,\n",
" 689,\n",
" 690,\n",
" 692,\n",
" 693,\n",
" 694,\n",
" 696,\n",
" 697,\n",
" 698,\n",
" 699,\n",
" 700,\n",
" 701,\n",
" 704,\n",
" 705,\n",
" 706,\n",
" 708,\n",
" 709,\n",
" 710,\n",
" 712,\n",
" 714,\n",
" 734,\n",
" 740,\n",
" 756,\n",
" 777,\n",
" 797,\n",
" 803,\n",
" 819],\n",
" 'pfCounts': {'count': 148,\n",
" 'bits': [9,\n",
" 10,\n",
" 11,\n",
" 12,\n",
" 18,\n",
" 19,\n",
" 20,\n",
" 178,\n",
" 179,\n",
" 181,\n",
" 185,\n",
" 186,\n",
" 192,\n",
" 193,\n",
" 199,\n",
" 255,\n",
" 256,\n",
" 257,\n",
" 259,\n",
" 261,\n",
" 283,\n",
" 284,\n",
" 286,\n",
" 308,\n",
" 332,\n",
" 333,\n",
" 335,\n",
" 341,\n",
" 344,\n",
" 352,\n",
" 355,\n",
" 356,\n",
" 366,\n",
" 370,\n",
" 371,\n",
" 374,\n",
" 380,\n",
" 381,\n",
" 382,\n",
" 384,\n",
" 385,\n",
" 405,\n",
" 406,\n",
" 409,\n",
" 416,\n",
" 420,\n",
" 430,\n",
" 432,\n",
" 434,\n",
" 440,\n",
" 441,\n",
" 443,\n",
" 446,\n",
" 452,\n",
" 470,\n",
" 476,\n",
" 490,\n",
" 493,\n",
" 498,\n",
" 516,\n",
" 520,\n",
" 524,\n",
" 535,\n",
" 541,\n",
" 542,\n",
" 548,\n",
" 552,\n",
" 553,\n",
" 556,\n",
" 564,\n",
" 565,\n",
" 570,\n",
" 573,\n",
" 574,\n",
" 575,\n",
" 578,\n",
" 579,\n",
" 581,\n",
" 582,\n",
" 584,\n",
" 588,\n",
" 589,\n",
" 590,\n",
" 594,\n",
" 595,\n",
" 597,\n",
" 599,\n",
" 603,\n",
" 604,\n",
" 606,\n",
" 608,\n",
" 614,\n",
" 617,\n",
" 618,\n",
" 619,\n",
" 620,\n",
" 623,\n",
" 625,\n",
" 626,\n",
" 632,\n",
" 634,\n",
" 637,\n",
" 639,\n",
" 640,\n",
" 641,\n",
" 642,\n",
" 651,\n",
" 653,\n",
" 655,\n",
" 660,\n",
" 664,\n",
" 666,\n",
" 667,\n",
" 668,\n",
" 671,\n",
" 672,\n",
" 677,\n",
" 678,\n",
" 679,\n",
" 680,\n",
" 684,\n",
" 688,\n",
" 689,\n",
" 690,\n",
" 692,\n",
" 693,\n",
" 694,\n",
" 696,\n",
" 697,\n",
" 698,\n",
" 699,\n",
" 700,\n",
" 701,\n",
" 704,\n",
" 705,\n",
" 706,\n",
" 708,\n",
" 709,\n",
" 710,\n",
" 712,\n",
" 714,\n",
" 734,\n",
" 740,\n",
" 756,\n",
" 777,\n",
" 797,\n",
" 803,\n",
" 819]},\n",
" 'circularFingerprint': [19,\n",
" 41,\n",
" 43,\n",
" 76,\n",
" 95,\n",
" 133,\n",
" 142,\n",
" 152,\n",
" 166,\n",
" 206,\n",
" 222,\n",
" 244,\n",
" 250,\n",
" 255,\n",
" 299,\n",
" 334,\n",
" 346,\n",
" 352,\n",
" 354,\n",
" 378,\n",
" 420,\n",
" 445,\n",
" 452,\n",
" 460,\n",
" 486,\n",
" 494,\n",
" 507,\n",
" 510,\n",
" 512,\n",
" 514,\n",
" 548,\n",
" 549,\n",
" 569,\n",
" 572,\n",
" 585,\n",
" 618,\n",
" 625,\n",
" 639,\n",
" 646,\n",
" 660,\n",
" 682,\n",
" 726,\n",
" 747,\n",
" 822,\n",
" 898,\n",
" 947,\n",
" 967,\n",
" 986],\n",
" 'extendedFingerprint': [0,\n",
" 4,\n",
" 9,\n",
" 11,\n",
" 13,\n",
" 24,\n",
" 28,\n",
" 40,\n",
" 41,\n",
" 52,\n",
" 53,\n",
" 61,\n",
" 64,\n",
" 65,\n",
" 68,\n",
" 74,\n",
" 75,\n",
" 78,\n",
" 83,\n",
" 85,\n",
" 88,\n",
" 92,\n",
" 96,\n",
" 105,\n",
" 106,\n",
" 110,\n",
" 112,\n",
" 115,\n",
" 118,\n",
" 122,\n",
" 123,\n",
" 124,\n",
" 125,\n",
" 133,\n",
" 136,\n",
" 140,\n",
" 142,\n",
" 146,\n",
" 150,\n",
" 152,\n",
" 159,\n",
" 161,\n",
" 163,\n",
" 164,\n",
" 166,\n",
" 167,\n",
" 170,\n",
" 173,\n",
" 175,\n",
" 181,\n",
" 183,\n",
" 187,\n",
" 191,\n",
" 192,\n",
" 197,\n",
" 212,\n",
" 231,\n",
" 233,\n",
" 235,\n",
" 236,\n",
" 240,\n",
" 243,\n",
" 246,\n",
" 249,\n",
" 250,\n",
" 251,\n",
" 252,\n",
" 259,\n",
" 266,\n",
" 271,\n",
" 280,\n",
" 282,\n",
" 286,\n",
" 288,\n",
" 291,\n",
" 298,\n",
" 299,\n",
" 301,\n",
" 304,\n",
" 313,\n",
" 319,\n",
" 320,\n",
" 325,\n",
" 330,\n",
" 337,\n",
" 344,\n",
" 350,\n",
" 351,\n",
" 358,\n",
" 374,\n",
" 377,\n",
" 386,\n",
" 388,\n",
" 398,\n",
" 399,\n",
" 402,\n",
" 404,\n",
" 414,\n",
" 416,\n",
" 423,\n",
" 440,\n",
" 446,\n",
" 450,\n",
" 453,\n",
" 455,\n",
" 467,\n",
" 476,\n",
" 481,\n",
" 489,\n",
" 494,\n",
" 507,\n",
" 541,\n",
" 543,\n",
" 544,\n",
" 547,\n",
" 549,\n",
" 550,\n",
" 553,\n",
" 557,\n",
" 573,\n",
" 574,\n",
" 578,\n",
" 579,\n",
" 596,\n",
" 597,\n",
" 605,\n",
" 606,\n",
" 610,\n",
" 611,\n",
" 612,\n",
" 618,\n",
" 627,\n",
" 638,\n",
" 644,\n",
" 650,\n",
" 651,\n",
" 656,\n",
" 660,\n",
" 661,\n",
" 666,\n",
" 676,\n",
" 678,\n",
" 688,\n",
" 703,\n",
" 711,\n",
" 712,\n",
" 718,\n",
" 723,\n",
" 730,\n",
" 741,\n",
" 745,\n",
" 747,\n",
" 749,\n",
" 751,\n",
" 752,\n",
" 759,\n",
" 761,\n",
" 765,\n",
" 781,\n",
" 785,\n",
" 786,\n",
" 793,\n",
" 796,\n",
" 810,\n",
" 812,\n",
" 815,\n",
" 818,\n",
" 819,\n",
" 826,\n",
" 833,\n",
" 840,\n",
" 842,\n",
" 846,\n",
" 848,\n",
" 852,\n",
" 854,\n",
" 857,\n",
" 868,\n",
" 869,\n",
" 901,\n",
" 907,\n",
" 926,\n",
" 930,\n",
" 938,\n",
" 939,\n",
" 940,\n",
" 949,\n",
" 952,\n",
" 967,\n",
" 971,\n",
" 976,\n",
" 989,\n",
" 995,\n",
" 1009,\n",
" 1010,\n",
" 1011,\n",
" 1012,\n",
" 1013,\n",
" 1014],\n",
" 'pubchemBits': b'\\x00\\x1e\\x1c\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00,\\x06\\x83\\x00\\x00\\x00\\x00\\x00\\x00\\x80+\\x00\\x00X\\x00\\x00\\x10\\x00\\x00\\xb0 \\x01\\x19@Lp\\x03\\x00`\\x02\\x11@\\x05K\\x10\\x00@\\x10\\x00$\\x04\\x00\\x10\\x11\\x80`\\x10\\x130\\xe4lq\\xacXA\\x9e\\x06\\xa5\\x07\\xa8\\x10\\x9d\\xe1\\x11w?w\\x05\\x00@\\x10\\x00\\x10\\x00\\x00\\x02\\x00 \\x08\\x00\\x08',\n",
" 'pubchemBitsString': '00000000011110000011100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001101000110000011000001000000000000000000000000000000000000000000000000000000011101010000000000000000000001101000000000000000000000100000000000000000000000110100000100100000001001100000000010001100100000111011000000000000000000011001000000100010000000001010100000110100100000100000000000000000100000100000000000001001000010000000000000000010001000100000000001000001100000100011001000000011000010011100110110100011100011010100011010100000100111100101100000101001011110000000010101000010001011100110000111100010001110111011111100111011101010000000000000000000100000100000000000000010000000000000000000010000000000000000000100000100000000000000010',\n",
" 'citationDOI': [],\n",
" 'taxid': [],\n",
" 'textTaxa': ['notax'],\n",
" 'chemicalSuperClass': 'Phenylpropanoids and polyketides',\n",
" 'chemicalClass': 'Diarylheptanoids',\n",
" 'chemicalSubClass': 'Linear diarylheptanoids',\n",
" 'directParentClassification': 'Linear diarylheptanoids',\n",
" 'allChemClassifications': [],\n",
" 'taxonomyReferenceObjects': [],\n",
" 'allTaxa': [],\n",
" 'absolute_smiles_sources': {'nostereo': ['ibs2019mar_nc']},\n",
" 'absolute_smiles': {},\n",
" 'allWikidataIds': [],\n",
" 'alogp': 1.5698999999999994,\n",
" 'alogp2': 2.4645860099999983,\n",
" 'amralogp': 127.32220000000001,\n",
" 'apol': 62.147859999999966,\n",
" 'bcutDescriptor': [11.850000000000007,\n",
" 15.999952464292491,\n",
" -0.36071906962798755,\n",
" 0.2622629421493053,\n",
" 4.61844183790179,\n",
" 12.027281211337542],\n",
" 'bpol': 27.612139999999993,\n",
" 'eccentricConnectivityIndexDescriptor': 620,\n",
" 'fmfDescriptor': 0.7419354838709677,\n",
" 'fsp3': 0.12,\n",
" 'fragmentComplexityDescriptor': 1986.06,\n",
" 'gravitationalIndexHeavyAtoms': nan,\n",
" 'hBondAcceptorCount': 6,\n",
" 'hBondDonorCount': 2,\n",
" 'hybridizationRatioDescriptor': 0.12,\n",
" 'kappaShapeIndex1': 24.134948096885815,\n",
" 'kappaShapeIndex2': 10.950520833333334,\n",
" 'kappaShapeIndex3': 5.3994490358126725,\n",
" 'manholdlogp': 3.55,\n",
" 'petitjeanNumber': 0.5,\n",
" 'petitjeanShapeTopo': 1.0,\n",
" 'petitjeanShapeGeom': nan,\n",
" 'lipinskiRuleOf5Failures': 0,\n",
" 'numberSpiroAtoms': 0,\n",
" 'vabcDescriptor': nan,\n",
" 'vertexAdjMagnitude': 6.08746284125034,\n",
" 'weinerPathNumber': 2424.0,\n",
" 'weinerPolarityNumber': 54.0,\n",
" 'xlogp': 3.229,\n",
" 'zagrebIndex': 164.0,\n",
" 'topoPSA': 93.06000000000002,\n",
" 'tpsaEfficiency': 0.22363419389156305,\n",
" '_class': 'de.unijena.cheminf.npopensourcecollector.mongocollections.UniqueNaturalProduct'}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display every stored field of a single document;\n",
"# index 1 picks the second document returned by the cursor.\n",
"products = db['uniqueNaturalProduct']\n",
"products.find({})[1]"
]
},
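{
"cell_type": "markdown",
"id": "c7917972",
"metadata": {},
"source": [
"### Natural Product Fingerprints\n",
"\n",
"Collect `coconut_id` together with the circular, extended and PubChem fingerprint fields of every document into a single DataFrame."
]
},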
{
"cell_type": "code",
"execution_count": 4,
"id": "2d5bf4de",
"metadata": {},
"outputs": [],
"source": [
"# Fields requested for each document: the identifier plus all fingerprint fields.\n",
"fingerprint_fields = [\n",
"    'coconut_id',\n",
"    'circularFingerprint',\n",
"    'extendedFingerprint',\n",
"    'pubchemFingerprint',\n",
"    'pubchemBits',\n",
"    'pubchemBitsString',\n",
"]\n",
"projection = {field: 1 for field in fingerprint_fields}\n",
"projection['_id'] = 0  # leave the _id field out of the result\n",
"\n",
"# The empty filter {} places no restriction on which documents are returned.\n",
"cursor = db.uniqueNaturalProduct.find({}, projection)\n",
"list_cur = list(cursor)\n",
"df = pd.DataFrame(list_cur)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "00a6e36b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>coconut_id</th>\n",
" <th>pubchemFingerprint</th>\n",
" <th>circularFingerprint</th>\n",
" <th>extendedFingerprint</th>\n",
" <th>pubchemBits</th>\n",
" <th>pubchemBitsString</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CNP0220816</td>\n",
" <td>[9, 10, 11, 12, 18, 19, 20, 21, 143, 147, 178,...</td>\n",
" <td>[14, 88, 133, 152, 169, 219, 222, 236, 244, 31...</td>\n",
" <td>[8, 9, 13, 14, 18, 28, 32, 37, 41, 53, 58, 61,...</td>\n",
" <td>b'\\x00\\x1e&lt;\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x0...</td>\n",
" <td>0000000001111000001111000000000000000000000000...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CNP0293916</td>\n",
" <td>[9, 10, 11, 12, 18, 19, 20, 178, 179, 181, 185...</td>\n",
" <td>[19, 41, 43, 76, 95, 133, 142, 152, 166, 206, ...</td>\n",
" <td>[0, 4, 9, 11, 13, 24, 28, 40, 41, 52, 53, 61, ...</td>\n",
" <td>b'\\x00\\x1e\\x1c\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00...</td>\n",
" <td>0000000001111000001110000000000000000000000000...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CNP0297651</td>\n",
" <td>[9, 10, 11, 12, 14, 18, 19, 20, 21, 178, 181, ...</td>\n",
" <td>[11, 14, 19, 25, 34, 57, 70, 82, 88, 133, 152,...</td>\n",
" <td>[8, 14, 18, 26, 32, 37, 53, 55, 62, 65, 68, 78...</td>\n",
" <td>b'\\x00^&lt;\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x...</td>\n",
" <td>0000000001111010001111000000000000000000000000...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CNP0330764</td>\n",
" <td>[9, 10, 11, 12, 18, 19, 20, 143, 144, 178, 179...</td>\n",
" <td>[10, 14, 19, 25, 57, 88, 93, 133, 192, 220, 22...</td>\n",
" <td>[8, 14, 58, 65, 68, 74, 82, 87, 92, 117, 118, ...</td>\n",
" <td>b'\\x00\\x1e\\x1c\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00...</td>\n",
" <td>0000000001111000001110000000000000000000000000...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CNP0125332</td>\n",
" <td>[9, 10, 18, 19, 20, 21, 30, 31, 143, 146, 283,...</td>\n",
" <td>[23, 53, 56, 88, 152, 169, 265, 338, 346, 368,...</td>\n",
" <td>[5, 8, 18, 20, 22, 25, 36, 65, 85, 115, 118, 1...</td>\n",
" <td>b'\\x00\\x06&lt;\\xc0\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x0...</td>\n",
" <td>0000000001100000001111000000001100000000000000...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" coconut_id pubchemFingerprint \\\n",
"0 CNP0220816 [9, 10, 11, 12, 18, 19, 20, 21, 143, 147, 178,... \n",
"1 CNP0293916 [9, 10, 11, 12, 18, 19, 20, 178, 179, 181, 185... \n",
"2 CNP0297651 [9, 10, 11, 12, 14, 18, 19, 20, 21, 178, 181, ... \n",
"3 CNP0330764 [9, 10, 11, 12, 18, 19, 20, 143, 144, 178, 179... \n",
"4 CNP0125332 [9, 10, 18, 19, 20, 21, 30, 31, 143, 146, 283,... \n",
"\n",
" circularFingerprint \\\n",
"0 [14, 88, 133, 152, 169, 219, 222, 236, 244, 31... \n",
"1 [19, 41, 43, 76, 95, 133, 142, 152, 166, 206, ... \n",
"2 [11, 14, 19, 25, 34, 57, 70, 82, 88, 133, 152,... \n",
"3 [10, 14, 19, 25, 57, 88, 93, 133, 192, 220, 22... \n",
"4 [23, 53, 56, 88, 152, 169, 265, 338, 346, 368,... \n",
"\n",
" extendedFingerprint \\\n",
"0 [8, 9, 13, 14, 18, 28, 32, 37, 41, 53, 58, 61,... \n",
"1 [0, 4, 9, 11, 13, 24, 28, 40, 41, 52, 53, 61, ... \n",
"2 [8, 14, 18, 26, 32, 37, 53, 55, 62, 65, 68, 78... \n",
"3 [8, 14, 58, 65, 68, 74, 82, 87, 92, 117, 118, ... \n",
"4 [5, 8, 18, 20, 22, 25, 36, 65, 85, 115, 118, 1... \n",
"\n",
" pubchemBits \\\n",
"0 b'\\x00\\x1e<\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x0... \n",
"1 b'\\x00\\x1e\\x1c\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00... \n",
"2 b'\\x00^<\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x... \n",
"3 b'\\x00\\x1e\\x1c\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00... \n",
"4 b'\\x00\\x06<\\xc0\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x0... \n",
"\n",
" pubchemBitsString \n",
"0 0000000001111000001111000000000000000000000000... \n",
"1 0000000001111000001110000000000000000000000000... \n",
"2 0000000001111010001111000000000000000000000000... \n",
"3 0000000001111000001110000000000000000000000000... \n",
"4 0000000001100000001111000000001100000000000000... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the fingerprint table\n",
"df.head(n=5)"
]
},
{
"cell_type": "markdown",
"id": "0c423866",
"metadata": {},
"source": [
"### Download Fingerprints as CSV"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7de3a635",
"metadata": {},
"outputs": [],
"source": [
"# Run this cell to write all fingerprints to a CSV file (row index omitted)\n",
"output_path = 'coconut_fingerprints.csv'\n",
"df.to_csv(output_path, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment