-- Queries based on the Citi bikes dataset using DuckDB.
-- Associated YouTube video: https://youtu.be/u9AtW5P0m5c
-- Load every CSV under data/ into one table. station_id is pinned to VARCHAR
-- so type inference cannot flip it between files (presumably some files hold
-- non-numeric ids — confirm against the dataset).
CREATE OR REPLACE TABLE bikeStations AS
SELECT *
FROM read_csv_auto('data/*.csv', types={"station_id": "VARCHAR"});
# Install the Python dependencies used by the Kafka/Avro examples.
# (Original line ended in a dangling "|" — an extraction artifact that
# would be a shell syntax error.)
pip install confluent-kafka avro urllib3 requests fastavro
# Dataset: https://www.kaggle.com/datasets/wilmerarltstrmberg/recipe-dataset-over-2m
import duckdb | |
db1 = duckdb.connect('db1.duck.db') | |
db2 = duckdb.connect('db2.duck.db') | |
db1.sql(""" | |
CREATE OR REPLACE TABLE recipes AS | |
FROM read_csv_auto('recipes_data.csv', header=True) |
-- Queries based on the Citi bikes dataset using DuckDB.
-- Associated YouTube video: https://youtu.be/u9AtW5P0m5c
-- Build the station table from all CSVs in data/; the glob is expanded and
-- unioned by DuckDB. station_id is forced to VARCHAR to override inference.
CREATE OR REPLACE TABLE bikeStations AS
SELECT *
FROM read_csv_auto('data/*.csv', types={"station_id": "VARCHAR"});
-- Queries based on the Citi bikes dataset using DuckDB.
-- Associated YouTube video: https://www.youtube.com/watch?v=KTIBhsIoCvk
CREATE OR REPLACE TABLE bikeStations AS
import duckdb | |
import pandas as pd | |
con = duckdb.connect('atp-matches.db') | |
con.sql("INSTALL httpfs") | |
con.sql("LOAD httpfs") | |
csv_files = [ | |
f"https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{year}.csv" |
// One-time setup: enforce uniqueness of User ids so MERGE is index-backed.
// (Legacy constraint syntax — Neo4j 3.x era; newer servers use
// CREATE CONSTRAINT ... FOR (u:User) REQUIRE u.id IS UNIQUE.)
CREATE CONSTRAINT ON (u:User)
ASSERT u.id IS UNIQUE;

// Browser parameter: profile fields to keep when importing user documents.
// NOTE(review): not referenced in the query below — presumably used by a
// later statement outside this excerpt.
:param keysToKeep => ["name", "username", "bio", "following", "followers"];

// Import users and their follow edges from the gist's JSON export.
// For each document: upsert the user, copy all profile properties, then
// upsert one FOLLOWS edge per account in value.following.
// (The "|" after value.following is FOREACH syntax, not an artifact.)
CALL apoc.load.json("https://gist.github.com/mneedham/3c6a59fb5e7d87e20a2f5f1ae4fa2920/raw/9d7c57997c09b3a105556adb6c6f1819792a4db4/query.json")
YIELD value
MERGE (u:User {id: value.user.id})
SET u += value.user
FOREACH (following IN value.following |
  MERGE (f1:User {id: following})
  MERGE (u)-[:FOLLOWS]->(f1))
-- Load the full ATP player roster. SAMPLE_SIZE=-1 makes DuckDB scan the
-- entire file for type inference instead of a row sample, avoiding
-- mid-file type-detection failures.
CREATE OR REPLACE TABLE players AS
SELECT * FROM read_csv_auto('atp_players.csv', SAMPLE_SIZE=-1);

-- Union all yearly ranking CSVs (DuckDB expands the glob) into one table.
CREATE OR REPLACE TABLE rankings AS
SELECT *
FROM 'atp_rankings_*.csv';

-- Sanity check that the players table loaded.
-- NOTE(review): no ORDER BY, so the five rows returned are arbitrary.
SELECT player_id, name_first, name_last
FROM players
LIMIT 5;
# Headless service to provide DNS lookup | |
apiVersion: v1 | |
kind: Service | |
metadata: | |
labels: | |
app: neo4j | |
name: neo4j | |
spec: | |
clusterIP: None | |
ports: |
import json | |
import os | |
import luigi | |
import requests | |
from collections import Counter | |
from luigi.contrib.external_program import ExternalProgramTask | |
class Meetup(luigi.WrapperTask): | |
def run(self): |
# Dataset from https://blogs.oracle.com/bigdataspatialgraph/intuitive-explanation-of-personalized-page-rank-and-its-application-in-recommendation
# (Original lines carried "| |" table-extraction artifacts, which are
# Python syntax errors; removed here.)
import operator  # kept: presumably used further down this script — confirm
import networkx as nx

# Undirected graph mixing people and product/book nodes — looks like the
# setup for a personalized-PageRank recommendation demo (edges added later).
G = nx.Graph()
G.add_nodes_from(["John", "Mary", "Jill", "Todd",
                  "iPhone5", "Kindle Fire", "Fitbit Flex Wireless",
                  "Harry Potter", "Hobbit"])