Skip to content

Instantly share code, notes, and snippets.

@dlchet
Created January 13, 2023 13:31
Show Gist options
  • Save dlchet/b88a88b5e317fa8df9d34b408002db45 to your computer and use it in GitHub Desktop.
Save dlchet/b88a88b5e317fa8df9d34b408002db45 to your computer and use it in GitHub Desktop.
dask-book-chap2.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNNZkNbnrELXwmnamt5NHTb",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/dlchet/b88a88b5e317fa8df9d34b408002db45/dask-book-chap2.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"from google.colab import files\n",
"files.upload()\n",
"print(\"done uploading\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 94
},
"id": "VhAodGxo5pBe",
"outputId": "6ff4d851-7152-4f6a-a464-ba6be58b8fba"
},
"execution_count": 14,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.HTML object>"
],
"text/html": [
"\n",
" <input type=\"file\" id=\"files-a56957bc-2980-441a-8468-13ea28fd71f5\" name=\"files[]\" multiple disabled\n",
" style=\"border:none\" />\n",
" <output id=\"result-a56957bc-2980-441a-8468-13ea28fd71f5\">\n",
" Upload widget is only available when the cell has been executed in the\n",
" current browser session. Please rerun this cell to enable.\n",
" </output>\n",
" <script>// Copyright 2017 Google LLC\n",
"//\n",
"// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"// you may not use this file except in compliance with the License.\n",
"// You may obtain a copy of the License at\n",
"//\n",
"// http://www.apache.org/licenses/LICENSE-2.0\n",
"//\n",
"// Unless required by applicable law or agreed to in writing, software\n",
"// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"// See the License for the specific language governing permissions and\n",
"// limitations under the License.\n",
"\n",
"/**\n",
" * @fileoverview Helpers for google.colab Python module.\n",
" */\n",
"(function(scope) {\n",
"function span(text, styleAttributes = {}) {\n",
" const element = document.createElement('span');\n",
" element.textContent = text;\n",
" for (const key of Object.keys(styleAttributes)) {\n",
" element.style[key] = styleAttributes[key];\n",
" }\n",
" return element;\n",
"}\n",
"\n",
"// Max number of bytes which will be uploaded at a time.\n",
"const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
"\n",
"function _uploadFiles(inputId, outputId) {\n",
" const steps = uploadFilesStep(inputId, outputId);\n",
" const outputElement = document.getElementById(outputId);\n",
" // Cache steps on the outputElement to make it available for the next call\n",
" // to uploadFilesContinue from Python.\n",
" outputElement.steps = steps;\n",
"\n",
" return _uploadFilesContinue(outputId);\n",
"}\n",
"\n",
"// This is roughly an async generator (not supported in the browser yet),\n",
"// where there are multiple asynchronous steps and the Python side is going\n",
"// to poll for completion of each step.\n",
"// This uses a Promise to block the python side on completion of each step,\n",
"// then passes the result of the previous step as the input to the next step.\n",
"function _uploadFilesContinue(outputId) {\n",
" const outputElement = document.getElementById(outputId);\n",
" const steps = outputElement.steps;\n",
"\n",
" const next = steps.next(outputElement.lastPromiseValue);\n",
" return Promise.resolve(next.value.promise).then((value) => {\n",
" // Cache the last promise value to make it available to the next\n",
" // step of the generator.\n",
" outputElement.lastPromiseValue = value;\n",
" return next.value.response;\n",
" });\n",
"}\n",
"\n",
"/**\n",
" * Generator function which is called between each async step of the upload\n",
" * process.\n",
" * @param {string} inputId Element ID of the input file picker element.\n",
" * @param {string} outputId Element ID of the output display.\n",
" * @return {!Iterable<!Object>} Iterable of next steps.\n",
" */\n",
"function* uploadFilesStep(inputId, outputId) {\n",
" const inputElement = document.getElementById(inputId);\n",
" inputElement.disabled = false;\n",
"\n",
" const outputElement = document.getElementById(outputId);\n",
" outputElement.innerHTML = '';\n",
"\n",
" const pickedPromise = new Promise((resolve) => {\n",
" inputElement.addEventListener('change', (e) => {\n",
" resolve(e.target.files);\n",
" });\n",
" });\n",
"\n",
" const cancel = document.createElement('button');\n",
" inputElement.parentElement.appendChild(cancel);\n",
" cancel.textContent = 'Cancel upload';\n",
" const cancelPromise = new Promise((resolve) => {\n",
" cancel.onclick = () => {\n",
" resolve(null);\n",
" };\n",
" });\n",
"\n",
" // Wait for the user to pick the files.\n",
" const files = yield {\n",
" promise: Promise.race([pickedPromise, cancelPromise]),\n",
" response: {\n",
" action: 'starting',\n",
" }\n",
" };\n",
"\n",
" cancel.remove();\n",
"\n",
" // Disable the input element since further picks are not allowed.\n",
" inputElement.disabled = true;\n",
"\n",
" if (!files) {\n",
" return {\n",
" response: {\n",
" action: 'complete',\n",
" }\n",
" };\n",
" }\n",
"\n",
" for (const file of files) {\n",
" const li = document.createElement('li');\n",
" li.append(span(file.name, {fontWeight: 'bold'}));\n",
" li.append(span(\n",
" `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
" `last modified: ${\n",
" file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
" 'n/a'} - `));\n",
" const percent = span('0% done');\n",
" li.appendChild(percent);\n",
"\n",
" outputElement.appendChild(li);\n",
"\n",
" const fileDataPromise = new Promise((resolve) => {\n",
" const reader = new FileReader();\n",
" reader.onload = (e) => {\n",
" resolve(e.target.result);\n",
" };\n",
" reader.readAsArrayBuffer(file);\n",
" });\n",
" // Wait for the data to be ready.\n",
" let fileData = yield {\n",
" promise: fileDataPromise,\n",
" response: {\n",
" action: 'continue',\n",
" }\n",
" };\n",
"\n",
" // Use a chunked sending to avoid message size limits. See b/62115660.\n",
" let position = 0;\n",
" do {\n",
" const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
" const chunk = new Uint8Array(fileData, position, length);\n",
" position += length;\n",
"\n",
" const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
" yield {\n",
" response: {\n",
" action: 'append',\n",
" file: file.name,\n",
" data: base64,\n",
" },\n",
" };\n",
"\n",
" let percentDone = fileData.byteLength === 0 ?\n",
" 100 :\n",
" Math.round((position / fileData.byteLength) * 100);\n",
" percent.textContent = `${percentDone}% done`;\n",
"\n",
" } while (position < fileData.byteLength);\n",
" }\n",
"\n",
" // All done.\n",
" yield {\n",
" response: {\n",
" action: 'complete',\n",
" }\n",
" };\n",
"}\n",
"\n",
"scope.google = scope.google || {};\n",
"scope.google.colab = scope.google.colab || {};\n",
"scope.google.colab._files = {\n",
" _uploadFiles,\n",
" _uploadFilesContinue,\n",
"};\n",
"})(self);\n",
"</script> "
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Saving kaggle.json to kaggle (1).json\n",
"done uploading\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"! mkdir ~/.kaggle\n",
"! cp kaggle.json ~/.kaggle/\n",
"! chmod 600 ~/.kaggle/kaggle.json\n",
"! pip install -q kaggle"
],
"metadata": {
"id": "E2-8Yk1l6Us-"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dyjBGkAKYGt7",
"outputId": "d26d3129-6e9f-44d3-81a1-1ba3ad5921f0"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Downloading nyc-parking-tickets.zip to /content\n",
"100% 2.01G/2.02G [00:24<00:00, 74.6MB/s]\n",
"100% 2.02G/2.02G [00:24<00:00, 87.2MB/s]\n",
"Archive: nyc-parking-tickets.zip\n",
" inflating: Parking_Violations_Issued_-_Fiscal_Year_2014__August_2013___June_2014_.csv \n",
" inflating: Parking_Violations_Issued_-_Fiscal_Year_2015.csv \n",
" inflating: Parking_Violations_Issued_-_Fiscal_Year_2016.csv \n",
" inflating: Parking_Violations_Issued_-_Fiscal_Year_2017.csv \n"
]
}
],
"source": [
"import dask.dataframe as dd\n",
"from dask.diagnostics import ProgressBar\n",
"from matplotlib import pyplot as plt\n",
"\n",
"! kaggle datasets download -d new-york-city/nyc-parking-tickets\n",
"! unzip nyc-parking-tickets.zip"
]
},
{
"cell_type": "code",
"source": [
"df = dd.read_csv('*2017.csv', dtype={'House Number': 'object',\n",
" 'Time First Observed': 'object'})\n",
"df"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 357
},
"id": "TGa4eQ_k9odV",
"outputId": "5d4b02c7-7435-4cc5-fcd6-21a09471f6ff"
},
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Dask DataFrame Structure:\n",
" Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 Street Code2 Street Code3 Vehicle Expiration Date Violation Location Violation Precinct Issuer Precinct Issuer Code Issuer Command Issuer Squad Violation Time Time First Observed Violation County Violation In Front Of Or Opposite House Number Street Name Intersecting Street Date First Observed Law Section Sub Division Violation Legal Code Days Parking In Effect From Hours In Effect To Hours In Effect Vehicle Color Unregistered Vehicle? Vehicle Year Meter Number Feet From Curb Violation Post Code Violation Description No Standing or Stopping Violation Hydrant Violation Double Parking Violation\n",
"npartitions=32 \n",
" int64 object object object object int64 object object object int64 int64 int64 int64 float64 int64 int64 int64 object object object object object object object object object int64 int64 object object object object object object float64 int64 object int64 object object float64 float64 float64\n",
" ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n",
"... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n",
" ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n",
" ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n",
"Dask Name: read-csv, 32 tasks"
],
"text/html": [
"<div><strong>Dask DataFrame Structure:</strong></div>\n",
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Summons Number</th>\n",
" <th>Plate ID</th>\n",
" <th>Registration State</th>\n",
" <th>Plate Type</th>\n",
" <th>Issue Date</th>\n",
" <th>Violation Code</th>\n",
" <th>Vehicle Body Type</th>\n",
" <th>Vehicle Make</th>\n",
" <th>Issuing Agency</th>\n",
" <th>Street Code1</th>\n",
" <th>Street Code2</th>\n",
" <th>Street Code3</th>\n",
" <th>Vehicle Expiration Date</th>\n",
" <th>Violation Location</th>\n",
" <th>Violation Precinct</th>\n",
" <th>Issuer Precinct</th>\n",
" <th>Issuer Code</th>\n",
" <th>Issuer Command</th>\n",
" <th>Issuer Squad</th>\n",
" <th>Violation Time</th>\n",
" <th>Time First Observed</th>\n",
" <th>Violation County</th>\n",
" <th>Violation In Front Of Or Opposite</th>\n",
" <th>House Number</th>\n",
" <th>Street Name</th>\n",
" <th>Intersecting Street</th>\n",
" <th>Date First Observed</th>\n",
" <th>Law Section</th>\n",
" <th>Sub Division</th>\n",
" <th>Violation Legal Code</th>\n",
" <th>Days Parking In Effect</th>\n",
" <th>From Hours In Effect</th>\n",
" <th>To Hours In Effect</th>\n",
" <th>Vehicle Color</th>\n",
" <th>Unregistered Vehicle?</th>\n",
" <th>Vehicle Year</th>\n",
" <th>Meter Number</th>\n",
" <th>Feet From Curb</th>\n",
" <th>Violation Post Code</th>\n",
" <th>Violation Description</th>\n",
" <th>No Standing or Stopping Violation</th>\n",
" <th>Hydrant Violation</th>\n",
" <th>Double Parking Violation</th>\n",
" </tr>\n",
" <tr>\n",
" <th>npartitions=32</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th></th>\n",
" <td>int64</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>int64</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>int64</td>\n",
" <td>int64</td>\n",
" <td>int64</td>\n",
" <td>int64</td>\n",
" <td>float64</td>\n",
" <td>int64</td>\n",
" <td>int64</td>\n",
" <td>int64</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>int64</td>\n",
" <td>int64</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>float64</td>\n",
" <td>int64</td>\n",
" <td>object</td>\n",
" <td>int64</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>float64</td>\n",
" <td>float64</td>\n",
" <td>float64</td>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
"<div>Dask Name: read-csv, 32 tasks</div>"
]
},
"metadata": {},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"source": [
"with ProgressBar():\n",
" final_size = df.index.size.compute()\n",
"\n",
"final_size"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pXMe0R6l90hn",
"outputId": "f62fbfb9-b247-4775-d557-bd6c115ac59e"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[##################################### ] | 93% Completed | 1min 5.4s"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.8/dist-packages/dask/dataframe/io/csv.py:128: DtypeWarning: Columns (18,38) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" df = pandas_read_text(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\r[########################################] | 100% Completed | 1min 5.5s\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"10803028"
]
},
"metadata": {},
"execution_count": 13
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment