Skip to content

Instantly share code, notes, and snippets.

@rwilleynyc
rwilleynyc / nchs_pop_prev.csv
Created August 5, 2019 15:59
Preview of population data
Index Yearly July 1st Estimates State Age Group Gender Code Population
0 2016 New York < 1 year F 115210
1 2016 New York < 1 year M 121505
2 2016 New York 1-4 years F 457809
3 2016 New York 1-4 years M 479216
4 2016 New York 5-9 years F 556271
5 2016 New York 5-9 years M 581561
6 2016 New York 10-14 years F 561504
7 2016 New York 10-14 years M 585735
8 2016 New York 15-19 years F 606472
@rwilleynyc
rwilleynyc / nchs_data_prev.csv
Last active August 5, 2019 15:34
Preview of source data
Index Year Cause_Name State Deaths
0 2016 Alzheimer's disease New York 3349
1 2016 CLRD New York 6860
2 2016 Cancer New York 35368
3 2016 Diabetes New York 4038
4 2016 Heart disease New York 44076
5 2016 Influenza and pneumonia New York 4513
6 2016 Kidney disease New York 2385
7 2016 Stroke New York 6258
8 2016 Suicide New York 1679
@rwilleynyc
rwilleynyc / nchs_data.py
Last active October 26, 2019 04:46
Get CDC data with an HTTP GET request
import pandas as pd
import io, requests
# Request the CDC dataset via HTTP GET, then save it as a local CSV file.
url = "https://data.cdc.gov/api/views/bi63-dtpu/rows.csv?accessType=DOWNLOAD"
# A timeout keeps the script from blocking forever if the server never answers.
response = requests.get(url, timeout=60)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page as CSV data.
response.raise_for_status()
s = response.content
# Decode the raw bytes as UTF-8, parse them with pandas, and write the result
# to disk (the data_files/ directory must already exist).
pd.read_csv(io.StringIO(s.decode('utf-8'))).to_csv('data_files/causes_of_death.csv')
@rwilleynyc
rwilleynyc / presNLP_wc_visualize.py
Last active July 31, 2019 19:34
Create word clouds for saved content.
from matplotlib.colors import LinearSegmentedColormap
# Custom gradient built from six hex colors: three reds followed by three blues.
colors = ["#FF0000", "#FF6347", "#DC143C", "#0000FF", "#0000CD", "#4169E1"]
cmap = LinearSegmentedColormap.from_list("mycmap", colors)
# NOTE(review): `cmap` is not referenced in the lines visible here; the word
# cloud below uses the built-in 'Blues' colormap instead. Confirm it is used later.
# Use silhouette images of Clinton & Trump as word cloud masks
p1_mask = np.array(Image.open('clinton_silhouette.png'))
# White-background cloud capped at 500 words and shaped by the Clinton mask.
p1_wc = WordCloud(background_color='white', colormap='Blues', mask=p1_mask, max_words=500)
# `p1_cloud` is defined elsewhere; presumably the text for the first speaker (verify).
p1_wc.generate(p1_cloud)
p2_mask = np.array(Image.open('trump_silhouette.png'))
@rwilleynyc
rwilleynyc / presNLP_track_errors.py
Created July 31, 2019 19:30
Keep track of original text when transforming bigram data for keras.
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text
from sklearn.model_selection import train_test_split
# Collapse every sample's bigram list into one space-delimited string
bigrams_joined = [' '.join(sample_bigrams) for sample_bigrams in bigrams_data]
@rwilleynyc
rwilleynyc / presNLP_conf_matrix.py
Created July 31, 2019 19:25
Create a confusion matrix from top model predictions
from sklearn.metrics import confusion_matrix, f1_score
import itertools
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
@rwilleynyc
rwilleynyc / presNLP_view_results.py
Created July 31, 2019 19:17
Display benchmark, RNN, and CNN results
from operator import itemgetter
# Pick the benchmark entry with the highest value at index 1 once, instead of
# scanning `scores` twice. Entries are presumably (model name, accuracy) pairs;
# verify against where `scores` is built.
best_benchmark = max(scores, key=itemgetter(1))
labels = [f'Benchmark: {best_benchmark[0]}', 'CNN', 'RNN']
acc = [best_benchmark[1], cnn_eval[1], rnn_eval[1]]
plt.figure(figsize=(15, 10))
plt.bar(labels, acc)
# Annotate each bar with its value as a percentage, slightly above the bar top.
# Iterating the pairs directly keeps the annotations in sync with the lists
# rather than relying on a hard-coded count of three.
for label, value in zip(labels, acc):
    plt.text(label, value + .02, f'{np.round(value*100,2)}%', ha='center')
plt.ylabel('Accuracy')
@rwilleynyc
rwilleynyc / presNLP_RNNgrid.py
Created July 31, 2019 19:15
RNN grid search parameters
# Candidate hyperparameter values for the RNN grid search.
# NOTE(review): these names appear to line up with create_rnn_model's
# rnn_type / units / drop / density parameters; confirm against the search loop.
rnn_types = ['lstm', 'gru']
units = [25, 50]
drops = [.25, .5]
densities = [100, 150]
@rwilleynyc
rwilleynyc / presNLP_createRNN.py
Last active July 31, 2019 19:15
Function to create single RNN for NLP.
def create_rnn_model(rnn_type='gru', units=50, drop=.5, density=50, lr=.001, epochs=100, batch_size=128, validation_split=.3, patience=5, verbose=0):
# Start Timer
start = datetime.datetime.now()
# Display Hyperparameter Settings
model_type = 'GRU' if rnn_type == 'gru' else 'LSTM'
print(f'Model Type:\t{model_type}\tUnits:\t{units}\tDropout Rate:\t{drop}\t\tDensity: {density}')
@rwilleynyc
rwilleynyc / presNLP_bestCNN.py
Created July 31, 2019 19:11
Train best CNN
# Create dataframe from results dictionary
cnn_results_df = pd.DataFrame.from_dict(cnn_results)
# Get row values associated with highest accuracy.
# The boolean mask keeps every row tied for the maximum, so this is a DataFrame
# that may hold more than one row.
best_cnn_model = cnn_results_df[cnn_results_df['Accuracy'] == cnn_results_df['Accuracy'].max()]
# Save hyperparameters to variables.
# .values[0] takes the first matching row, so ties resolve to the earliest row.
cdim = best_cnn_model['Convolution Dimensions'].values[0]
# Cast to built-in int; presumably the downstream layer arguments need plain
# Python ints rather than NumPy scalars (verify).
ksize = int(best_cnn_model['Window Size'].values[0])
pool1 = int(best_cnn_model['Pool 1'].values[0])