# importing libraries
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess
# cast tweets (assumed already tokenized into word lists upstream) to numpy array
docs = df.tweet_text.to_numpy()
# create dictionary of all words in all documents
dictionary = gensim.corpora.Dictionary(docs)
# vocabulary length is needed by GSDMM's fit method
vocab_length = len(dictionary)
# initialize and fit GSDMM (K, alpha, beta, n_iters shown are illustrative values)
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.3, n_iters=15)
y = gsdmm.fit(docs, vocab_length)
# print number of documents per topic
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)
# Topics sorted by the number of documents they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        # sort word counts for this cluster and keep the top `values` entries
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('\nCluster %s : %s' % (cluster, sort_dicts))
# define function to get words in topics
def get_topics_lists(model, top_clusters, n_words):
    '''
    Gets lists of words in topics as a list of lists.

    model: gsdmm instance
    top_clusters: numpy array containing indices of top_clusters
    n_words: top n number of words to include per topic
    '''
    topics = []
    for cluster in top_clusters:
        # sort the cluster's word counts and keep the top n_words words
        sorted_dict = sorted(model.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:n_words]
        topics.append([word for word, count in sorted_dict])
    return topics
# Import wordcloud library
from wordcloud import WordCloud
# Get topic word distributions from gsdmm model
cluster_word_distribution = gsdmm.cluster_word_distribution
# Select topic you want to output as dictionary (topic_number and values are user-set)
topic_dict = dict(sorted(cluster_word_distribution[topic_number].items(), key=lambda k: k[1], reverse=True)[:values])
# Generate a word cloud image from the word frequencies
wordcloud = WordCloud(background_color='white').generate_from_frequencies(topic_dict)
# import library from gensim
from gensim.models import CoherenceModel
# instantiate topic coherence model (lda_model_15 and bow_corpus come from the LDA half of this comparison)
cm = CoherenceModel(model=lda_model_15, corpus=bow_corpus, texts=docs, coherence='c_v')
# get topic coherence score
coherence_lda = cm.get_coherence()
print(coherence_lda)
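The coherence score above evaluates the LDA model; the GSDMM topics can be scored the same way by passing the word lists from get_topics_lists through CoherenceModel's topics argument. A minimal sketch, assuming dictionary is the gensim Dictionary built above and docs holds the tokenized tweets; the n_words value of 20 is illustrative:

# get the top clusters' words as lists of lists
topics = get_topics_lists(gsdmm, top_index, 20)
# score the GSDMM topics with the same c_v coherence metric
cm_gsdmm = CoherenceModel(topics=topics, dictionary=dictionary, texts=docs, coherence='c_v')
print(cm_gsdmm.get_coherence())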
avriiil / write_large_parquet_m1
Created January 18, 2022 10:48
Gist to write large parquet files to S3 on M1 (avoid blosc issues)
# ...spin up cluster...connect Dask client...etc.
import dask
# use client.submit() to write large parquet files to S3 (to avoid blosc issues on M1)
def submit_jobs():
    from distributed import get_client
    with get_client() as client:
        large = dask.datasets.timeseries(start="2000", end="2015", freq="10s", partition_freq="1M")
        large.to_parquet(
            "s3://your-bucket/large.parquet",  # hypothetical S3 path; the original call is truncated here
        )
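The point of wrapping the write in a function is that it can then be shipped to the cluster with client.submit(), so the parquet write runs on a worker instead of the local M1 machine. A minimal usage sketch, assuming client is the Dask distributed client connected earlier:

# run the whole write job on a cluster worker
future = client.submit(submit_jobs)
# block until the parquet write finishes
future.result()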
avriiil / geopandas-explore.py
Created May 11, 2022 10:39
Create Interactive Maps with GeoPandas
import pandas as pd
import geopandas as gpd
from geopandas import points_from_xy
from shapely.geometry import Point
# read in raw taxi data
df = pd.read_csv(
"s3://nyc-tlc/trip data/yellow_tripdata_2012-01.csv",
nrows=100_000,
)
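The snippet ends before the interactive part; to get the map the title promises, the taxi points need to become a GeoDataFrame, whose explore() method renders an interactive folium map. A minimal sketch, assuming the 2012 yellow-taxi CSV has pickup_longitude and pickup_latitude columns:

# turn pickup coordinates into point geometries
gdf = gpd.GeoDataFrame(
    df,
    geometry=points_from_xy(df.pickup_longitude, df.pickup_latitude),
    crs="EPSG:4326",
)
# render an interactive map in the notebook
gdf.explore()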
avriiil / download-kaggle.py
Created November 14, 2022 15:19
Download Kaggle NYC bike data for Dask benchmarking
import json
from pathlib import Path
import duckdb
import kaggle
from loguru import logger
from tqdm import tqdm
# In order to access this data, you must create a Kaggle account and obtain an API key.
# You can obtain a key by clicking on your icon on the upper right of the homepage,
# opening your account settings, and clicking "Create New API Token"; this downloads
# a kaggle.json file that the kaggle package reads from ~/.kaggle/.
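With the token in place, the download itself goes through the kaggle package's API object. A minimal sketch, assuming a placeholder dataset slug and target directory; the loguru/tqdm imports above suggest the full script adds progress logging around this:

# authenticate with the kaggle.json token in ~/.kaggle/
kaggle.api.authenticate()
# download and unzip into a local directory (the dataset slug is a placeholder)
target = Path("data/nyc-bike")
target.mkdir(parents=True, exist_ok=True)
kaggle.api.dataset_download_files("someuser/nyc-bike-data", path=str(target), unzip=True)
logger.info(f"Downloaded Kaggle data to {target}")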