# importing libraries
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess
# cast tweets (assumed already tokenized into word lists upstream) to numpy array
docs = df.tweet_text.to_numpy()
# create dictionary of all words in all documents
dictionary = gensim.corpora.Dictionary(docs)
# vocabulary length is needed by GSDMM's fit method
vocab_length = len(dictionary)
# initialize and fit GSDMM (K, alpha, beta, n_iters shown are illustrative values)
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.3, n_iters=15)
y = gsdmm.fit(docs, vocab_length)
# print number of documents per topic
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)
# Topics sorted by the number of documents they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        # sort word counts for this cluster and keep the top `values` entries
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('\nCluster %s : %s' % (cluster, sort_dicts))
# define function to get words in topics
def get_topics_lists(model, top_clusters, n_words):
    '''
    Gets lists of words in topics as a list of lists.

    model: gsdmm instance
    top_clusters: numpy array containing indices of top_clusters
    n_words: top n number of words to include per topic
    '''
    topics = []
    for cluster in top_clusters:
        # sort the cluster's word counts and keep the top n_words words
        sorted_dict = sorted(model.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:n_words]
        topics.append([word for word, count in sorted_dict])
    return topics
# Import wordcloud library
from wordcloud import WordCloud
# Get topic word distributions from gsdmm model
cluster_word_distribution = gsdmm.cluster_word_distribution
# Select topic you want to output as dictionary (topic_number and values are user-set)
topic_dict = dict(sorted(cluster_word_distribution[topic_number].items(), key=lambda k: k[1], reverse=True)[:values])
# Generate a word cloud image from the word frequencies
wordcloud = WordCloud(background_color='white').generate_from_frequencies(topic_dict)
# import library from gensim
from gensim.models import CoherenceModel
# instantiate topic coherence model (lda_model_15 and bow_corpus come from the LDA half of this comparison)
cm = CoherenceModel(model=lda_model_15, corpus=bow_corpus, texts=docs, coherence='c_v')
# get topic coherence score
coherence_lda = cm.get_coherence()
print(coherence_lda)
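The coherence score above evaluates the LDA model; the GSDMM topics can be scored the same way by passing the word lists from get_topics_lists through CoherenceModel's topics argument. A minimal sketch, assuming dictionary is the gensim Dictionary built above and docs holds the tokenized tweets; the n_words value of 20 is illustrative:

# get the top clusters' words as lists of lists
topics = get_topics_lists(gsdmm, top_index, 20)
# score the GSDMM topics with the same c_v coherence metric
cm_gsdmm = CoherenceModel(topics=topics, dictionary=dictionary, texts=docs, coherence='c_v')
print(cm_gsdmm.get_coherence())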
avriiil / write_large_parquet_m1
Created January 18, 2022 10:48
Gist to write large parquet files to S3 on M1 (avoid blosc issues)
# ...spin up cluster...connect Dask client...etc.
import dask
# use client.submit() to write large parquet files to S3 (to avoid blosc issues on M1)
def submit_jobs():
    from distributed import get_client
    with get_client() as client:
        large = dask.datasets.timeseries(start="2000", end="2015", freq="10s", partition_freq="1M")
        large.to_parquet(
            "s3://your-bucket/large.parquet",  # hypothetical S3 path; the original call is truncated here
        )
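The point of wrapping the write in a function is that it can then be shipped to the cluster with client.submit(), so the parquet write runs on a worker instead of the local M1 machine. A minimal usage sketch, assuming client is the Dask distributed client connected earlier:

# run the whole write job on a cluster worker
future = client.submit(submit_jobs)
# block until the parquet write finishes
future.result()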
avriiil / geopandas-explore.py
Created May 11, 2022 10:39
Create Interactive Maps with GeoPandas
import pandas as pd
import geopandas as gpd
from geopandas import points_from_xy
from shapely.geometry import Point
# read in raw taxi data
df = pd.read_csv(
"s3://nyc-tlc/trip data/yellow_tripdata_2012-01.csv",
nrows=100_000,
)
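The snippet ends before the interactive part; to get the map the title promises, the taxi points need to become a GeoDataFrame, whose explore() method renders an interactive folium map. A minimal sketch, assuming the 2012 yellow-taxi CSV has pickup_longitude and pickup_latitude columns:

# turn pickup coordinates into point geometries
gdf = gpd.GeoDataFrame(
    df,
    geometry=points_from_xy(df.pickup_longitude, df.pickup_latitude),
    crs="EPSG:4326",
)
# render an interactive map in the notebook
gdf.explore()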
avriiil / download-kaggle.py
Created November 14, 2022 15:19
Download Kaggle NYC bike data for Dask benchmarking
import json
from pathlib import Path
import duckdb
import kaggle
from loguru import logger
from tqdm import tqdm
# In order to access this data, you must create a Kaggle account and obtain an API key.
# You can obtain a key by clicking on your icon on the upper right of the homepage,
# opening your account settings, and clicking "Create New API Token"; this downloads
# a kaggle.json file that the kaggle package reads from ~/.kaggle/.
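With the token in place, the download itself goes through the kaggle package's API object. A minimal sketch, assuming a placeholder dataset slug and target directory; the loguru/tqdm imports above suggest the full script adds progress logging around this:

# authenticate with the kaggle.json token in ~/.kaggle/
kaggle.api.authenticate()
# download and unzip into a local directory (the dataset slug is a placeholder)
target = Path("data/nyc-bike")
target.mkdir(parents=True, exist_ok=True)
kaggle.api.dataset_download_files("someuser/nyc-bike-data", path=str(target), unzip=True)
logger.info(f"Downloaded Kaggle data to {target}")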