Raymond Willey rwilleynyc

## nchs_pop_prev.csv

          
            Index
            Yearly July 1st Estimates
            State
            Age Group
            Gender Code
            Population

            
              0
              2016
              New York
              < 1 year
              F
              115210

            
              1
              2016
              New York
              < 1 year
              M
              121505

            
              2
              2016
              New York
              1-4 years
              F
              457809

            
              3
              2016
              New York
              1-4 years
              M
              479216

            
              4
              2016
              New York
              5-9 years
              F
              556271

            
              5
              2016
              New York
              5-9 years
              M
              581561

            
              6
              2016
              New York
              10-14 years
              F
              561504

            
              7
              2016
              New York
              10-14 years
              M
              585735

            
              8
              2016
              New York
              15-19 years
              F
              606472

## nchs_data_prev.csv

          
            Index
            Year
            Cause_Name
            State
            Deaths

            
              0
              2016
              Alzheimer's disease
              New York
              3349

            
              1
              2016
              CLRD
              New York
              6860

            
              2
              2016
              Cancer
              New York
              35368

            
              3
              2016
              Diabetes
              New York
              4038

            
              4
              2016
              Heart disease
              New York
              44076

            
              5
              2016
              Influenza and pneumonia
              New York
              4513

            
              6
              2016
              Kidney disease
              New York
              2385

            
              7
              2016
              Stroke
              New York
              6258

            
              8
              2016
              Suicide
              New York
              1679

## nchs_data.py
import pandas as pd
import io, requests

# Download the CSV export from data.cdc.gov via HTTP GET, then save it locally as CSV
url = "https://data.cdc.gov/api/views/bi63-dtpu/rows.csv?accessType=DOWNLOAD"

# A timeout keeps the script from hanging forever on a stalled connection
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page as if it were CSV
response.raise_for_status()

s = response.content
# The payload is decoded explicitly as UTF-8 before pandas parses it
pd.read_csv(io.StringIO(s.decode('utf-8'))).to_csv('data_files/causes_of_death.csv')

## presNLP_wc_visualize.py
from matplotlib.colors import LinearSegmentedColormap
# Custom colormap interpolated across three red shades followed by three blue shades
# NOTE(review): cmap is not used by the p1 word cloud below (it passes
# colormap='Blues'); confirm where cmap is consumed.
colors = ["#FF0000", "#FF6347", "#DC143C", "#0000FF", "#0000CD", "#4169E1"]
cmap = LinearSegmentedColormap.from_list("mycmap", colors)

# Use vector-image silhouettes of Clinton & Trump as word-cloud masks
p1_mask = np.array(Image.open('clinton_silhouette.png'))
# White background, 'Blues' colormap, masked by the silhouette, at most 500 words
p1_wc = WordCloud(background_color='white', colormap='Blues', mask=p1_mask, max_words=500)
# p1_cloud is defined elsewhere; presumably the text for this cloud (TODO confirm)
p1_wc.generate(p1_cloud)

# Second mask, loaded the same way from the Trump silhouette image
p2_mask = np.array(Image.open('trump_silhouette.png'))

## presNLP_track_errors.py
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text
from sklearn.model_selection import train_test_split

# Convert each sample's list of bigrams into a single space-separated string.
# bigrams_data is defined elsewhere; each element must be an iterable of strings
# for str.join to accept it.
bigrams_joined = [' '.join(bigrams) for bigrams in bigrams_data]

## presNLP_conf_matrix.py
from sklearn.metrics import confusion_matrix, f1_score
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

## presNLP_view_results.py
from operator import itemgetter

# Find the best-scoring benchmark once instead of scanning `scores` twice.
# Entries of `scores` are indexed as [0] = name, [1] = score, as the
# itemgetter(1) key and the lookups below show.
best_benchmark = max(scores, key=itemgetter(1))

labels = [f'Benchmark: {best_benchmark[0]}', 'CNN', 'RNN']
acc = [best_benchmark[1], cnn_eval[1], rnn_eval[1]]

# Bar chart comparing the best benchmark against the CNN and RNN results
plt.figure(figsize = (15, 10))
plt.bar(labels, acc)
# Label every bar with its value as a percentage, placed slightly above the bar
for label, value in zip(labels, acc):
    plt.text(label, value + .02, f'{np.round(value*100,2)}%', ha='center')
plt.ylabel('Accuracy')

## presNLP_RNNgrid.py
# Candidate hyperparameter values. The names mirror create_rnn_model's
# rnn_type / units / drop / density parameters; presumably these lists are
# swept in a grid search (TODO confirm against the loop that consumes them).
rnn_types = ['lstm', 'gru']
units = [25, 50]
drops = [.25, .5]
densities = [100, 150]

## presNLP_createRNN.py
def create_rnn_model(rnn_type='gru', units=50, drop=.5, density=50, lr=.001, epochs=100, batch_size=128, validation_split=.3, patience=5, verbose=0):

    # Start Timer
    start = datetime.datetime.now()


    # Display Hyperparameter Settings
    model_type = 'GRU' if rnn_type == 'gru' else 'LSTM'
    print(f'Model Type:\t{model_type}\tUnits:\t{units}\tDropout Rate:\t{drop}\t\tDensity: {density}')


## presNLP_bestCNN.py
# Build a DataFrame out of the CNN results dictionary
cnn_results_df = pd.DataFrame.from_dict(cnn_results)

# Keep only the row(s) whose accuracy equals the highest accuracy observed
top_accuracy = cnn_results_df['Accuracy'].max()
is_best = cnn_results_df['Accuracy'] == top_accuracy
best_cnn_model = cnn_results_df[is_best]

# Read the winning hyperparameters from the first matching row
cdim = best_cnn_model['Convolution Dimensions'].values[0]
ksize = int(best_cnn_model['Window Size'].values[0])
pool1 = int(best_cnn_model['Pool 1'].values[0])
Index	Yearly July 1st Estimates	State	Age Group	Gender Code	Population
0	2016	New York	< 1 year	F	115210
1	2016	New York	< 1 year	M	121505
2	2016	New York	1-4 years	F	457809
3	2016	New York	1-4 years	M	479216
4	2016	New York	5-9 years	F	556271
5	2016	New York	5-9 years	M	581561
6	2016	New York	10-14 years	F	561504
7	2016	New York	10-14 years	M	585735
8	2016	New York	15-19 years	F	606472
Index	Year	Cause_Name	State	Deaths
0	2016	Alzheimer's disease	New York	3349
1	2016	CLRD	New York	6860
2	2016	Cancer	New York	35368
3	2016	Diabetes	New York	4038
4	2016	Heart disease	New York	44076
5	2016	Influenza and pneumonia	New York	4513
6	2016	Kidney disease	New York	2385
7	2016	Stroke	New York	6258
8	2016	Suicide	New York	1679
	import pandas as pd
	import io, requests

	# Download the CSV export from data.cdc.gov via HTTP GET, then save it locally as CSV
	url = "https://data.cdc.gov/api/views/bi63-dtpu/rows.csv?accessType=DOWNLOAD"

	# A timeout keeps the script from hanging forever on a stalled connection
	response = requests.get(url, timeout=60)
	# Fail loudly on an HTTP error instead of parsing an error page as if it were CSV
	response.raise_for_status()

	s = response.content
	# The payload is decoded explicitly as UTF-8 before pandas parses it
	pd.read_csv(io.StringIO(s.decode('utf-8'))).to_csv('data_files/causes_of_death.csv')
	from matplotlib.colors import LinearSegmentedColormap
	# Custom colormap interpolated across three red shades followed by three blue shades
	# NOTE(review): cmap is not used by the p1 word cloud below (it passes
	# colormap='Blues'); confirm where cmap is consumed.
	colors = ["#FF0000", "#FF6347", "#DC143C", "#0000FF", "#0000CD", "#4169E1"]
	cmap = LinearSegmentedColormap.from_list("mycmap", colors)

	# Use vector-image silhouettes of Clinton & Trump as word-cloud masks
	p1_mask = np.array(Image.open('clinton_silhouette.png'))
	# White background, 'Blues' colormap, masked by the silhouette, at most 500 words
	p1_wc = WordCloud(background_color='white', colormap='Blues', mask=p1_mask, max_words=500)
	# p1_cloud is defined elsewhere; presumably the text for this cloud (TODO confirm)
	p1_wc.generate(p1_cloud)

	# Second mask, loaded the same way from the Trump silhouette image
	p2_mask = np.array(Image.open('trump_silhouette.png'))
	from keras.preprocessing.sequence import pad_sequences
	from keras.preprocessing import text
	from sklearn.model_selection import train_test_split

	# Convert each sample's list of bigrams into a single space-separated string.
	# The original loop body had lost its indentation in this copy; a
	# comprehension expresses the same construction on one line.
	bigrams_joined = [' '.join(bigrams) for bigrams in bigrams_data]
	from sklearn.metrics import confusion_matrix, f1_score
	import itertools

	def plot_confusion_matrix(cm, classes,
	normalize=False,
	title='Confusion matrix',
	cmap=plt.cm.Blues):
	"""
	This function prints and plots the confusion matrix.
	Normalization can be applied by setting `normalize=True`.
	from operator import itemgetter

	# Find the best-scoring benchmark once instead of scanning `scores` twice.
	# Entries of `scores` are indexed as [0] = name, [1] = score, as the
	# itemgetter(1) key and the lookups below show.
	best_benchmark = max(scores, key=itemgetter(1))

	labels = [f'Benchmark: {best_benchmark[0]}', 'CNN', 'RNN']
	acc = [best_benchmark[1], cnn_eval[1], rnn_eval[1]]

	# Bar chart comparing the best benchmark against the CNN and RNN results
	plt.figure(figsize = (15, 10))
	plt.bar(labels, acc)
	# Label every bar with its value as a percentage, placed slightly above the bar
	# (the loop body is indented again; it had been flattened in this copy)
	for label, value in zip(labels, acc):
	    plt.text(label, value + .02, f'{np.round(value*100,2)}%', ha='center')
	plt.ylabel('Accuracy')
	# Candidate hyperparameter values. The names mirror create_rnn_model's
	# rnn_type / units / drop / density parameters; presumably these lists are
	# swept in a grid search (TODO confirm against the loop that consumes them).
	rnn_types = ['lstm', 'gru']
	units = [25, 50]
	drops = [.25, .5]
	densities = [100, 150]
	def create_rnn_model(rnn_type='gru', units=50, drop=.5, density=50, lr=.001, epochs=100, batch_size=128, validation_split=.3, patience=5, verbose=0):

	# Start Timer
	start = datetime.datetime.now()


	# Display Hyperparameter Settings
	model_type = 'GRU' if rnn_type == 'gru' else 'LSTM'
	print(f'Model Type:\t{model_type}\tUnits:\t{units}\tDropout Rate:\t{drop}\t\tDensity: {density}')
	# Build a DataFrame out of the CNN results dictionary
	cnn_results_df = pd.DataFrame.from_dict(cnn_results)

	# Keep only the row(s) whose accuracy equals the highest accuracy observed
	top_accuracy = cnn_results_df['Accuracy'].max()
	is_best = cnn_results_df['Accuracy'] == top_accuracy
	best_cnn_model = cnn_results_df[is_best]

	# Read the winning hyperparameters from the first matching row
	cdim = best_cnn_model['Convolution Dimensions'].values[0]
	ksize = int(best_cnn_model['Window Size'].values[0])
	pool1 = int(best_cnn_model['Pool 1'].values[0])