Created
March 3, 2024 22:59
-
-
Save abdullahdevrel/1333cfcae6daa8bc6c71660f27b4b330 to your computer and use it in GitHub Desktop.
IPinfo DuckDB experiment: https://community.ipinfo.io/t/hacktogether-setting-up-duckdb-to-use-ipinfos-ip-database-using-python-bad-code-looking-for-feedback/5538
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import ipaddress # for converting IP address (IPv4, IPv6) to their integer equivalent\n", | |
"import duckdb as db # pip install duckdb\n", | |
"from duckdb.typing import * # Used in the function declaration throguh Python's DuckDB API" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Establish connection to persistent local database\n", | |
"con = db.connect('country_asn.duckdb')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<duckdb.duckdb.DuckDBPyConnection at 0x1846f1df8f0>" | |
] | |
}, | |
"execution_count": 39, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Decalre the column names and types. DuckDB auto CSV reader is not very good. The database struggles when all the columns are text type.\n", | |
"country_asn_columns= {'start_ip': 'VARCHAR', 'end_ip': 'VARCHAR', 'country': 'VARCHAR', 'country_name': 'VARCHAR', 'continent': 'VARCHAR', 'continent_name': 'VARCHAR', 'asn': 'VARCHAR', 'as_name': 'VARCHAR', 'as_domain': 'VARCHAR'}\n", | |
"\n", | |
"# We will drop this table later. Calling it `country_asn_raw`\n", | |
"# The params to read the CSV is necessary or else DuckDB can not parse the CSV file properly.\n", | |
"con.execute(f\"CREATE TABLE country_asn_raw AS SELECT * FROM read_csv('country_asn.csv', columns = {country_asn_columns}, auto_detect='false', header='true');\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Function to convert the IP address str to integer\n", | |
"# We will use bring this function to DuckDB functions\n", | |
"ip_to_int = lambda ip : int(ipaddress.ip_address(ip))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<duckdb.duckdb.DuckDBPyConnection at 0x1846f1df8f0>" | |
] | |
}, | |
"execution_count": 41, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Using the declared ip_to_int Python lambda function to create the DuckDB function using Python's DuckDB API\n", | |
"# Takes VARCHAR (IP addresses)\n", | |
"# Returns UHUGEINT → 128 bit unsigned integer\n", | |
"con.create_function(\"to_int\", ip_to_int, [VARCHAR], UHUGEINT)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<duckdb.duckdb.DuckDBPyConnection at 0x1846f1df8f0>" | |
] | |
}, | |
"execution_count": 42, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Creating the core table (country_asn) by creating integer versions of start_ip and end_ip columns\n", | |
"con.execute('''\n", | |
" CREATE TABLE country_asn AS\n", | |
" SELECT\n", | |
" *,\n", | |
" TO_INT(start_ip) start_ip_int,\n", | |
" TO_INT(end_ip) end_ip_int\n", | |
" FROM country_asn_raw\n", | |
"''')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<duckdb.duckdb.DuckDBPyConnection at 0x1846f1df8f0>" | |
] | |
}, | |
"execution_count": 43, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Dropping the original read_csv table\n", | |
"# Remember it is an OLAP database, we can not alter the original table.\n", | |
"con.execute(\"DROP TABLE country_asn_raw\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<duckdb.duckdb.DuckDBPyConnection at 0x1846f1df8f0>" | |
] | |
}, | |
"execution_count": 44, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Create a composite index on start_ip_int and end_ip_int\n", | |
"con.execute(\"CREATE INDEX idx_asns_start_end ON country_asn(start_ip_int, end_ip_int)\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('51.11.0.0',\n", | |
" '51.11.191.255',\n", | |
" 'GB',\n", | |
" 'United Kingdom',\n", | |
" 'EU',\n", | |
" 'Europe',\n", | |
" 'AS8075',\n", | |
" 'Microsoft Corporation',\n", | |
" 'microsoft.com',\n", | |
" 856358912,\n", | |
" 856408063)]" | |
] | |
}, | |
"execution_count": 45, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# And we are done.\n", | |
"ip = \"51.11.128.92\"\n", | |
"con.execute(\"SELECT * FROM country_asn WHERE TO_INT(?) BETWEEN start_ip_int AND end_ip_int;\", [ip]).fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"con.close()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.12.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment