Created
August 24, 2018 19:51
-
-
Save phenders/e784d5735624368bc86e41bcd2ab54de to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"580" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import requests\n", | |
"\n", | |
"# Count the datasets matching `phrase` using the raw HTTP API.\n", | |
"# Headers stored on the session are sent with every request made through it.\n", | |
"session = requests.Session()\n", | |
"headers = {'authorization': 'Bearer ' + '<YOUR_API_KEY>'}\n", | |
"session.headers.update(headers)\n", | |
"\n", | |
"url = 'https://public.enigma.com/api/datasets/'\n", | |
"phrase = 'michael cohen'\n", | |
"params = {\n", | |
"    'query': phrase,\n", | |
"    'row_limit': 1000,\n", | |
"    'match_metadata': 'false',\n", | |
"    'include_serialids': 'false',\n", | |
"}\n", | |
"\n", | |
"# HEAD is enough here: only the response headers are read, never a body.\n", | |
"response = session.head(url, headers=headers, params=params)\n", | |
"# Surface HTTP errors (such as a rejected API key) as an HTTPError carrying the status code,\n", | |
"# instead of failing later while parsing a header that is not there.\n", | |
"response.raise_for_status()\n", | |
"\n", | |
"# The dataset count is taken from the part of Content-Range after the '/'.\n", | |
"content_range = response.headers.get('content-range')\n", | |
"if content_range is None or '/' not in content_range:\n", | |
"    raise ValueError('Expected a Content-Range header of the form <range>/<total>, got: {!r}'.format(content_range))\n", | |
"ds_count = int(content_range.split('/')[1])\n", | |
"ds_count" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"580" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import enigma\n", | |
"\n", | |
"# Same search as the requests-based cell, done through the enigma SDK client.\n", | |
"public = enigma.Public()\n", | |
"public.set_auth(apikey='YOUR-API-KEY')\n", | |
"\n", | |
"phrase = 'michael cohen'\n", | |
"\n", | |
"# Search options, spelled out once and passed through to datasets.list().\n", | |
"search_options = {\n", | |
"    'query': phrase,\n", | |
"    'row_limit': 1000,\n", | |
"    'match_metadata': False,\n", | |
"    'include_serialids': False,\n", | |
"}\n", | |
"# .all() presumably resolves the listing into the full collection of matches - confirm in the SDK docs.\n", | |
"datasets = public.datasets.list(**search_options).all()\n", | |
"len(datasets)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"24432" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Page through every matching dataset with the raw HTTP API and flatten the results into\n", | |
"# `results`: per dataset one title row, one column-header row, then its data rows.\n", | |
"\n", | |
"# Number of datasets requested per call through the Range header.\n", | |
"page_size = 10\n", | |
"\n", | |
"results = []\n", | |
"for start in range(0, ds_count, page_size):\n", | |
"    # Pass Range for this request only, so neither `headers` nor the session\n", | |
"    # is left carrying a stale range after the loop finishes.\n", | |
"    page_headers = {'Range': 'resources={}-{}'.format(start, start + page_size - 1)}\n", | |
"    response = session.get(url, headers=page_headers, params=params)\n", | |
"    # Stop on an HTTP error instead of iterating over an error payload.\n", | |
"    response.raise_for_status()\n", | |
"    for dataset in response.json():\n", | |
"        table_rows = dataset['current_snapshot']['table_rows']\n", | |
"        # Title row padded with blanks, matching the SDK-based version of this loop.\n", | |
"        results.append([dataset['display_name']] + [''] * 10)\n", | |
"        # Header row of column display names rather than raw field objects.\n", | |
"        # Assumes each field is a dict with a 'display_name' key; anything else is kept as-is.\n", | |
"        results.append([\n", | |
"            field.get('display_name', field) if isinstance(field, dict) else field\n", | |
"            for field in table_rows['fields']\n", | |
"        ])\n", | |
"        for row in table_rows['rows']:\n", | |
"            # Keep strings (truncated to 100 characters); any non-string value becomes ''.\n", | |
"            results.append([value[:100] if isinstance(value, str) else '' for value in row])\n", | |
"len(results)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"24432" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Flatten every dataset from the SDK listing into `results`:\n", | |
"# a padded title row, a column-header row, then the dataset's rows.\n", | |
"results = []\n", | |
"for dataset in datasets:\n", | |
"    snapshot_rows = dataset.current_snapshot.table_rows\n", | |
"    results.append([dataset.display_name, *([''] * 10)])\n", | |
"    results.append([column.display_name for column in snapshot_rows.fields])\n", | |
"    for record in snapshot_rows:\n", | |
"        # Strings are cut to 100 characters; every non-string cell becomes ''.\n", | |
"        results.append([cell[:100] if isinstance(cell, str) else '' for cell in record])\n", | |
"len(results)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import string\n", | |
"\n", | |
"# Keep only rows where every word of `phrase` appears within `distance` adjacent columns.\n", | |
"\n", | |
"# Translation table that deletes every ASCII punctuation character.\n", | |
"table = str.maketrans('', '', string.punctuation)\n", | |
"\n", | |
"results = []\n", | |
"# Number of adjacent columns searched together as one window.\n", | |
"distance = 2\n", | |
"\n", | |
"# Normalise the phrase exactly like the cell text below (lower-cased, punctuation removed);\n", | |
"# otherwise a phrase containing punctuation could never match. Computed once, not per row.\n", | |
"phrase_words = set(phrase.lower().translate(table).split())\n", | |
"\n", | |
"for dataset in datasets:\n", | |
"    tableview = dataset.current_snapshot.table_rows\n", | |
"    first_match = True\n", | |
"    for row in tableview:\n", | |
"        row = [str(value) if isinstance(value, bool) else value for value in row]\n", | |
"        # Slide the window across the row. max(1, ...) makes sure a row with fewer than\n", | |
"        # `distance` columns is still checked once instead of being skipped entirely.\n", | |
"        window_count = max(1, len(row) - distance + 1)\n", | |
"        match = False\n", | |
"        for j in range(window_count):\n", | |
"            window = row[j: j + distance]\n", | |
"            # Empty/None cells are skipped; str() keeps a non-string cell from breaking the join.\n", | |
"            words = ' '.join(str(value) for value in window if value).lower().translate(table)\n", | |
"            if phrase_words <= set(words.split()):\n", | |
"                match = True\n", | |
"                break\n", | |
"        if match:\n", | |
"            if first_match:\n", | |
"                # Emit the dataset's title and column-header rows once, before its first matching row.\n", | |
"                results.append([dataset.display_name] + [''] * 10)\n", | |
"                results.append([field.display_name for field in tableview.fields])\n", | |
"                first_match = False\n", | |
"            # Keep strings (truncated to 100 characters); any non-string value becomes ''.\n", | |
"            results.append([value[:100] if isinstance(value, str) else '' for value in row])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"\n", | |
"# Load the collected title, header and data rows into a single DataFrame.\n", | |
"df = pd.DataFrame(data=results)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment