avriiil / download-kaggle.py
Created November 14, 2022 15:19
Download Kaggle NYC bike data for Dask benchmarking
import json
from pathlib import Path
import duckdb
import kaggle
from loguru import logger
from tqdm import tqdm
# In order to access this data, you must create a Kaggle account and obtain an API key.
# You can obtain a key by clicking on your icon on the upper right of the homepage,
# opening your account settings, and clicking "Create New API Token"; this downloads
# a kaggle.json credentials file (place it in ~/.kaggle/).
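The gist is cut off here. A minimal sketch of the download step it presumably continues with, using the kaggle Python client (the dataset slug below is hypothetical, since the actual one is not recoverable from the gist):

# authenticate against the Kaggle API using the kaggle.json credentials
kaggle.api.authenticate()

# hypothetical slug -- replace with the actual NYC bike dataset
kaggle.api.dataset_download_files(
    "<owner>/<nyc-bike-dataset>",
    path="data/",
    unzip=True,
)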
avriiil / geopandas-explore.py
Created May 11, 2022 10:39
Create Interactive Maps with GeoPandas
import pandas as pd
import geopandas as gpd
from geopandas import points_from_xy
from shapely.geometry import Point
# read in raw taxi data
df = pd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2012-01.csv",
    nrows=100_000,
)
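The snippet stops after loading the CSV, but the geometry imports suggest where it is headed. A hedged sketch of the likely continuation, assuming the 2012 yellow-taxi column names pickup_longitude and pickup_latitude:

# build point geometries from the pickup coordinates (column names assumed)
gdf = gpd.GeoDataFrame(
    df,
    geometry=points_from_xy(df.pickup_longitude, df.pickup_latitude),
    crs="EPSG:4326",  # raw taxi coordinates are plain lat/lon
)

# render an interactive Leaflet map (what the gist title refers to)
gdf.explore()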
avriiil / write_large_parquet_m1
Created January 18, 2022 10:48
Gist to write large parquet files to S3 on M1 (avoid blosc issues)
# ...spin up cluster...connect Dask...etc.
# use client.submit() to write large parquet files to S3 (to avoid blosc issues on M1)
def submit_jobs():
    # imports live inside the function so it can be shipped to a worker
    import dask
    from distributed import get_client

    with get_client() as client:
        large = dask.datasets.timeseries(
            start="2000", end="2015", freq="10s", partition_freq="1M"
        )
        # the destination URI is cut off in the gist; this bucket path is hypothetical
        large.to_parquet("s3://<your-bucket>/large.parquet")
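The comment at the top suggests the function is then shipped to a worker from the driver-side client; a hedged usage sketch (the client object comes from the elided cluster-setup step):

# submit the whole write job as a single task and block until it finishes
future = client.submit(submit_jobs)
future.result()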
# import library from gensim
from gensim.models import CoherenceModel
# instantiate topic coherence model
cm = CoherenceModel(model=lda_model_15, corpus=bow_corpus, texts=docs, coherence='c_v')
# get topic coherence score
coherence_lda = cm.get_coherence()
print(coherence_lda)
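Beyond printing a single score, the same call is often looped over candidate models to pick a topic count; a sketch under that assumption (lda_models is a hypothetical dict of fitted models, not part of the gist):

# score several candidate models and report the best one
scores = {
    name: CoherenceModel(model=m, texts=docs, coherence='c_v').get_coherence()
    for name, m in lda_models.items()
}
print('best model:', max(scores, key=scores.get))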
# Import wordcloud library
from wordcloud import WordCloud
# Get topic word distributions from gsdmm model
cluster_word_distribution = gsdmm.cluster_word_distribution
# Select the topic you want to output (by topic_number) and keep its top `values` words
topic_dict = sorted(cluster_word_distribution[topic_number].items(), key=lambda k: k[1], reverse=True)[:values]
# Generate a word cloud image (the gist cuts off here; this completion is assumed)
wordcloud = WordCloud(background_color='white').generate_from_frequencies(dict(topic_dict))
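Displaying the cloud typically takes a couple of matplotlib lines; a minimal, assumed follow-up:

# show the word cloud (assumed display step, not part of the gist)
import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()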
# import library from gensim
from gensim.models import CoherenceModel
# define function to get words in topics
def get_topics_lists(model, top_clusters, n_words):
    '''
    Gets lists of words in topics as a list of lists.
    model: gsdmm instance
    top_clusters: numpy array containing indices of top_clusters
    n_words: number of top words per topic to return
    '''
    # body reconstructed -- the gist cuts off inside the docstring
    topics = []
    for cluster in top_clusters:
        sorted_words = sorted(model.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:n_words]
        topics.append([word for word, _ in sorted_words])
    return topics
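These word lists are presumably what feeds gensim's list-based coherence interface; a hedged usage sketch (dictionary, bow_corpus, docs, and top_index are assumed to exist from the surrounding snippets):

# get the top-20 word lists for the biggest clusters and score them
topics = get_topics_lists(gsdmm, top_index, 20)
cm_gsdmm = CoherenceModel(topics=topics, dictionary=dictionary, corpus=bow_corpus, texts=docs, coherence='c_v')
print(cm_gsdmm.get_coherence())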
# print number of documents per topic
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic:', doc_count)
# topics sorted by the number of documents they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
# define function to print top words per topic (body reconstructed; the gist cuts off at the signature)
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s" % (cluster, sort_dicts))
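A hedged usage line, tying it to the cluster ranking computed above (the 20 is an illustrative value):

# print the 20 most frequent words of each of the top clusters
top_words(gsdmm.cluster_word_distribution, top_index, 20)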
# importing libraries
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess
# cast tweets to numpy array
docs = df.tweet_text.to_numpy()
# create dictionary of all words in all documents (assumes each entry in docs is a list of tokens)
dictionary = gensim.corpora.Dictionary(docs)
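The gist ends at the dictionary step; a minimal sketch of how a GSDMM workflow usually continues from here (the vocabulary size comes from the dictionary, while K, alpha, beta, and n_iters are illustrative values, not the author's):

# bag-of-words corpus and vocabulary size for the fit call
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
vocab_length = len(dictionary)

# fit the GSDMM model (Movie Group Process); hyperparameters are illustrative
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.3, n_iters=15)
y = gsdmm.fit(docs, vocab_length)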