Skip to content

Instantly share code, notes, and snippets.

@ermakovpetr
Created May 20, 2016 07:33
Show Gist options
  • Save ermakovpetr/e71af09554f39ec8f5276a122ffaa019 to your computer and use it in GitHub Desktop.
Save ermakovpetr/e71af09554f39ec8f5276a122ffaa019 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from utils.database_utils import *"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>employer_id</th>\n",
" <th>manager_id</th>\n",
" <th>creation_time</th>\n",
" <th>area_id</th>\n",
" <th>name</th>\n",
" <th>agreement_id</th>\n",
" <th>category</th>\n",
" <th>url</th>\n",
" <th>group_name</th>\n",
" <th>on_firstpage</th>\n",
" <th>...</th>\n",
" <th>vacancy_page_template</th>\n",
" <th>country_id</th>\n",
" <th>employees_number</th>\n",
" <th>registration_completed</th>\n",
" <th>dublicated_employer_b</th>\n",
" <th>corporate_gift_b</th>\n",
" <th>hide_type_b</th>\n",
" <th>view_similar_vacancy_b</th>\n",
" <th>must_discard_by_robot_b</th>\n",
" <th>hidden</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>77453</td>\n",
" <td>NaN</td>\n",
" <td>2007-03-23 09:52:06.000</td>\n",
" <td>1</td>\n",
" <td>СС</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>http://</td>\n",
" <td>77453</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1654338</td>\n",
" <td>NaN</td>\n",
" <td>2014-09-29 18:08:55.776</td>\n",
" <td>2</td>\n",
" <td>бургер кинг</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td></td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 50 columns</p>\n",
"</div>"
],
"text/plain": [
" employer_id manager_id creation_time area_id name \\\n",
"0 77453 NaN 2007-03-23 09:52:06.000 1 СС \n",
"1 1654338 NaN 2014-09-29 18:08:55.776 2 бургер кинг \n",
"\n",
" agreement_id category url group_name on_firstpage ... \\\n",
"0 0 http:// 77453 0 ... \n",
"1 0 None 0 ... \n",
"\n",
" vacancy_page_template country_id employees_number registration_completed \\\n",
"0 None 1 NaN None \n",
"1 None 1 1.0 False \n",
"\n",
" dublicated_employer_b corporate_gift_b hide_type_b \\\n",
"0 False False False \n",
"1 False False False \n",
"\n",
" view_similar_vacancy_b must_discard_by_robot_b hidden \n",
"0 True False False \n",
"1 True False False \n",
"\n",
"[2 rows x 50 columns]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# сделать обычный запрос, можно делать запросы\n",
"# load_pssql, load_pssql_data, load_mssql, load_hive\n",
"\n",
"employer = load_pssql('SELECT * FROM employer LIMIT 1000')\n",
"employer.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1000 entries, 0 to 999\n",
"Data columns (total 50 columns):\n",
"employer_id 1000 non-null int64\n",
"manager_id 77 non-null float64\n",
"creation_time 1000 non-null datetime64[ns]\n",
"area_id 1000 non-null int64\n",
"name 1000 non-null object\n",
"agreement_id 1000 non-null object\n",
"category 1000 non-null int64\n",
"url 999 non-null object\n",
"group_name 162 non-null object\n",
"on_firstpage 1000 non-null int64\n",
"in_best_list 1000 non-null int64\n",
"delivery_address 2 non-null object\n",
"discard_status 1000 non-null int64\n",
"size 1000 non-null int64\n",
"pay_readiness 1000 non-null int64\n",
"own_policy 1000 non-null int64\n",
"employer_branch_head_company_action_type 1000 non-null int64\n",
"employer_branch_type 1000 non-null int64\n",
"owning_employer 953 non-null float64\n",
"email_for_vacancy_responces 0 non-null object\n",
"resume_views 1000 non-null int64\n",
"bank_detail_id 28 non-null float64\n",
"delivery_person 2 non-null object\n",
"main_insider_id 1000 non-null int64\n",
"vacancy_response_replay 1000 non-null bool\n",
"open_vacancies_number 1000 non-null int64\n",
"how_did_you_hear_about_us_id 488 non-null float64\n",
"resume_views_limit 1000 non-null int64\n",
"registration_platform_id 1000 non-null int64\n",
"parent_id 0 non-null object\n",
"structure_name 0 non-null object\n",
"discard_status_employer_id 306 non-null float64\n",
"small_logo_url 0 non-null object\n",
"talentlink_client 0 non-null object\n",
"page_template 0 non-null object\n",
"organization_form 406 non-null float64\n",
"state 1000 non-null int64\n",
"last_change_time 825 non-null datetime64[ns]\n",
"confirmed_anonymous 1000 non-null bool\n",
"site_id 712 non-null float64\n",
"vacancy_page_template 0 non-null object\n",
"country_id 1000 non-null int64\n",
"employees_number 361 non-null float64\n",
"registration_completed 377 non-null object\n",
"dublicated_employer_b 1000 non-null bool\n",
"corporate_gift_b 1000 non-null bool\n",
"hide_type_b 1000 non-null bool\n",
"view_similar_vacancy_b 1000 non-null bool\n",
"must_discard_by_robot_b 1000 non-null bool\n",
"hidden 1000 non-null bool\n",
"dtypes: bool(8), datetime64[ns](2), float64(8), int64(18), object(14)\n",
"memory usage: 336.0+ KB\n"
]
}
],
"source": [
"employer.info()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"log: write cache\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>resume.datetime</th>\n",
" <th>resume.value</th>\n",
" <th>resume.year</th>\n",
" <th>resume.month</th>\n",
" <th>resume.day</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2015-12-31 00:23:04,191</td>\n",
" <td>{\"id\":24406648,\"userId\":2530514,\"platform\":\"HE...</td>\n",
" <td>2016</td>\n",
" <td>01</td>\n",
" <td>01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2015-12-31 00:23:04,201</td>\n",
" <td>{\"id\":47975617,\"userId\":2530514,\"platform\":\"HE...</td>\n",
" <td>2016</td>\n",
" <td>01</td>\n",
" <td>01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2015-12-31 00:23:04,214</td>\n",
" <td>{\"id\":48062731,\"userId\":2530514,\"platform\":\"HE...</td>\n",
" <td>2016</td>\n",
" <td>01</td>\n",
" <td>01</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" resume.datetime resume.value \\\n",
"0 2015-12-31 00:23:04,191 {\"id\":24406648,\"userId\":2530514,\"platform\":\"HE... \n",
"1 2015-12-31 00:23:04,201 {\"id\":47975617,\"userId\":2530514,\"platform\":\"HE... \n",
"2 2015-12-31 00:23:04,214 {\"id\":48062731,\"userId\":2530514,\"platform\":\"HE... \n",
"\n",
" resume.year resume.month resume.day \n",
"0 2016 01 01 \n",
"1 2016 01 01 \n",
"2 2016 01 01 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# можно закэшировать результат запроса\n",
"\n",
"load_hive(\"SELECT * FROM moderation.resume WHERE year = '2016' LIMIT 3\", write_cache=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"log: read cache\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>resume.datetime</th>\n",
" <th>resume.value</th>\n",
" <th>resume.year</th>\n",
" <th>resume.month</th>\n",
" <th>resume.day</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2015-12-31 00:23:04,191</td>\n",
" <td>{\"id\":24406648,\"userId\":2530514,\"platform\":\"HE...</td>\n",
" <td>2016</td>\n",
" <td>01</td>\n",
" <td>01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2015-12-31 00:23:04,201</td>\n",
" <td>{\"id\":47975617,\"userId\":2530514,\"platform\":\"HE...</td>\n",
" <td>2016</td>\n",
" <td>01</td>\n",
" <td>01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2015-12-31 00:23:04,214</td>\n",
" <td>{\"id\":48062731,\"userId\":2530514,\"platform\":\"HE...</td>\n",
" <td>2016</td>\n",
" <td>01</td>\n",
" <td>01</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" resume.datetime resume.value \\\n",
"0 2015-12-31 00:23:04,191 {\"id\":24406648,\"userId\":2530514,\"platform\":\"HE... \n",
"1 2015-12-31 00:23:04,201 {\"id\":47975617,\"userId\":2530514,\"platform\":\"HE... \n",
"2 2015-12-31 00:23:04,214 {\"id\":48062731,\"userId\":2530514,\"platform\":\"HE... \n",
"\n",
" resume.year resume.month resume.day \n",
"0 2016 01 01 \n",
"1 2016 01 01 \n",
"2 2016 01 01 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# можно прочитать результат запроса из кэша, если он есть (выше мы записали)\n",
"\n",
"load_hive(\"SELECT * FROM moderation.resume WHERE year = '2016' LIMIT 3\", read_cache=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>resume.datetime</th>\n",
" <th>resume.value</th>\n",
" <th>resume.year</th>\n",
" <th>resume.month</th>\n",
" <th>resume.day</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2015-10-02 04:35:55,243</td>\n",
" <td>{\"id\":42817070,\"userId\":24395671,\"platform\":\"H...</td>\n",
" <td>2015</td>\n",
" <td>10</td>\n",
" <td>02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2015-10-02 05:24:20,863</td>\n",
" <td>{\"id\":31667687,\"userId\":18829490,\"platform\":\"H...</td>\n",
" <td>2015</td>\n",
" <td>10</td>\n",
" <td>02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2015-10-02 07:36:46,968</td>\n",
" <td>{\"id\":45251573,\"userId\":25269223,\"platform\":\"H...</td>\n",
" <td>2015</td>\n",
" <td>10</td>\n",
" <td>02</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" resume.datetime resume.value \\\n",
"0 2015-10-02 04:35:55,243 {\"id\":42817070,\"userId\":24395671,\"platform\":\"H... \n",
"1 2015-10-02 05:24:20,863 {\"id\":31667687,\"userId\":18829490,\"platform\":\"H... \n",
"2 2015-10-02 07:36:46,968 {\"id\":45251573,\"userId\":25269223,\"platform\":\"H... \n",
"\n",
" resume.year resume.month resume.day \n",
"0 2015 10 02 \n",
"1 2015 10 02 \n",
"2 2015 10 02 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# результат кэшируется исходя из запроса, меняем запрос и пытаемся прочитать из кэша\n",
"# из кэша не прочитанно - ибо для данного запроса кэш не был сохранен\n",
"\n",
"load_hive(\"SELECT * FROM moderation.resume WHERE year = '2015' LIMIT 3\", read_cache=True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"log: write cache\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>resume.datetime</th>\n",
" <th>resume.value</th>\n",
" <th>resume.year</th>\n",
" <th>resume.month</th>\n",
" <th>resume.day</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2015-10-02 04:35:55,243</td>\n",
" <td>{\"id\":42817070,\"userId\":24395671,\"platform\":\"H...</td>\n",
" <td>2015</td>\n",
" <td>10</td>\n",
" <td>02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2015-10-02 05:24:20,863</td>\n",
" <td>{\"id\":31667687,\"userId\":18829490,\"platform\":\"H...</td>\n",
" <td>2015</td>\n",
" <td>10</td>\n",
" <td>02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2015-10-02 07:36:46,968</td>\n",
" <td>{\"id\":45251573,\"userId\":25269223,\"platform\":\"H...</td>\n",
" <td>2015</td>\n",
" <td>10</td>\n",
" <td>02</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" resume.datetime resume.value \\\n",
"0 2015-10-02 04:35:55,243 {\"id\":42817070,\"userId\":24395671,\"platform\":\"H... \n",
"1 2015-10-02 05:24:20,863 {\"id\":31667687,\"userId\":18829490,\"platform\":\"H... \n",
"2 2015-10-02 07:36:46,968 {\"id\":45251573,\"userId\":25269223,\"platform\":\"H... \n",
"\n",
" resume.year resume.month resume.day \n",
"0 2015 10 02 \n",
"1 2015 10 02 \n",
"2 2015 10 02 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# можно пытаться читать из кэша, а если читать нечего, то запрос будет выполнен, а результат сохранен\n",
"\n",
"load_hive(\"SELECT * FROM moderation.resume WHERE year = '2015' LIMIT 3\", read_cache=True, write_cache=True)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"__cache_wrapper_-1479390422386652675.pckl\r\n",
"__cache_wrapper_-4844589160831331624.pckl\r\n",
"__cache_wrapper_-6307272719582419216.pckl\r\n",
"__cache_wrapper_1254068372247203274.pckl\r\n"
]
}
],
"source": [
"# кэш файл кладется в папку вместе с py-файлом из которого вы вызываете функции и выглядит так\n",
"\n",
"!ls __cache_*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2 (upstream libs)",
"language": "python",
"name": "python2-upstream"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment