Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save abdullahdevrel/1333cfcae6daa8bc6c71660f27b4b330 to your computer and use it in GitHub Desktop.
Save abdullahdevrel/1333cfcae6daa8bc6c71660f27b4b330 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"import ipaddress # for converting IP address (IPv4, IPv6) to their integer equivalent\n",
"import duckdb as db # pip install duckdb\n",
"from duckdb.typing import * # Used in the function declaration throguh Python's DuckDB API"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# Establish connection to persistent local database\n",
"con = db.connect('country_asn.duckdb')"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x1846f1df8f0>"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Decalre the column names and types. DuckDB auto CSV reader is not very good. The database struggles when all the columns are text type.\n",
"country_asn_columns= {'start_ip': 'VARCHAR', 'end_ip': 'VARCHAR', 'country': 'VARCHAR', 'country_name': 'VARCHAR', 'continent': 'VARCHAR', 'continent_name': 'VARCHAR', 'asn': 'VARCHAR', 'as_name': 'VARCHAR', 'as_domain': 'VARCHAR'}\n",
"\n",
"# We will drop this table later. Calling it `country_asn_raw`\n",
"# The params to read the CSV is necessary or else DuckDB can not parse the CSV file properly.\n",
"con.execute(f\"CREATE TABLE country_asn_raw AS SELECT * FROM read_csv('country_asn.csv', columns = {country_asn_columns}, auto_detect='false', header='true');\")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"# Function to convert the IP address str to integer\n",
"# We will use bring this function to DuckDB functions\n",
"ip_to_int = lambda ip : int(ipaddress.ip_address(ip))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x1846f1df8f0>"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Using the declared ip_to_int Python lambda function to create the DuckDB function using Python's DuckDB API\n",
"# Takes VARCHAR (IP addresses)\n",
"# Returns UHUGEINT → 128 bit unsigned integer\n",
"con.create_function(\"to_int\", ip_to_int, [VARCHAR], UHUGEINT)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x1846f1df8f0>"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Creating the core table (country_asn) by creating integer versions of start_ip and end_ip columns\n",
"con.execute('''\n",
" CREATE TABLE country_asn AS\n",
" SELECT\n",
" *,\n",
" TO_INT(start_ip) start_ip_int,\n",
" TO_INT(end_ip) end_ip_int\n",
" FROM country_asn_raw\n",
"''')"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x1846f1df8f0>"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Dropping the original read_csv table\n",
"# Remember it is an OLAP database, we can not alter the original table.\n",
"con.execute(\"DROP TABLE country_asn_raw\")"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x1846f1df8f0>"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a composite index on start_ip_int and end_ip_int\n",
"con.execute(\"CREATE INDEX idx_asns_start_end ON country_asn(start_ip_int, end_ip_int)\")"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('51.11.0.0',\n",
" '51.11.191.255',\n",
" 'GB',\n",
" 'United Kingdom',\n",
" 'EU',\n",
" 'Europe',\n",
" 'AS8075',\n",
" 'Microsoft Corporation',\n",
" 'microsoft.com',\n",
" 856358912,\n",
" 856408063)]"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# And we are done.\n",
"ip = \"51.11.128.92\"\n",
"con.execute(\"SELECT * FROM country_asn WHERE TO_INT(?) BETWEEN start_ip_int AND end_ip_int;\", [ip]).fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"con.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment