from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; use the standalone package
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from math import sqrt, floor
import numpy as np
def random(ds, k, random_state=42):
    """
    Create random cluster centroids.
    Parameters
    ----------
    ds : ndarray of shape (n_samples, n_features)
    k : int, number of centroids
    """
    rng = np.random.default_rng(random_state)
    # one common choice: sample k distinct rows of the dataset as centroids
    return ds[rng.choice(ds.shape[0], size=k, replace=False)]
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import numpy as np
def cluster(ds, k, max_iter=100):
    '''The k-means clustering algorithm
    Parameters:
    -----------
    ds: ndarray of shape (n_samples, n_features)
    k: int, number of clusters
    '''
    centroids = random(ds, k)  # initial centroids: k randomly chosen points
    for _ in range(max_iter):
        # assign points to their nearest centroid, then recompute each cluster mean
        labels = np.linalg.norm(ds[:, None] - centroids, axis=2).argmin(axis=1)
        centroids = np.array([ds[labels == i].mean(axis=0) for i in range(k)])
    return centroids, labels
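# Hypothetical usage of the two k-means helpers above (the data is made up):
pts = np.random.default_rng(0).random((100, 2))
centers, labels = cluster(pts, k=3)
print(centers)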
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    return [word.lower() for word in words]
import numpy
arr = [10, 386, 479, 627, 20, 523, 482, 483, 542, 699, 535, 617, 577, 471, 615, 583, 441, 562, 563, 527, 453, 530, 433, 541, 585, 704, 443, 569, 430, 637, 331, 511, 552, 496, 484, 566, 554, 472, 335, 440, 579, 341, 545, 615, 548, 604, 439, 556, 442, 461, 624, 611, 444, 578, 405, 487, 490, 496, 398, 512, 422, 455, 449, 432, 607, 679, 434, 597, 639, 565, 415, 486, 668, 414, 665, 763, 557, 304, 404, 454, 689, 610, 483, 441, 657, 590, 492, 476, 437, 483, 529, 363, 711, 543]
elements = numpy.array(arr)
mean = numpy.mean(elements, axis=0)
sd = numpy.std(elements, axis=0)
# keep only values within two standard deviations of the mean
final_list = [x for x in arr if (x > mean - 2 * sd)]
final_list = [x for x in final_list if (x < mean + 2 * sd)]
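# The same two-standard-deviation cut written as a NumPy boolean mask
# (equivalent result, kept as an array rather than a Python list):
filtered = elements[numpy.abs(elements - mean) < 2 * sd]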
import numpy as np
import matplotlib.pyplot as plt
# Random data
N = 10
M = 2
input = np.random.random((N,M))
print(input)  # Python 3 print; note `input` also shadows the builtin of the same name
# Setup matrices
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; use the standalone package
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
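# The imports above are not followed by any usage in this snippet; the lines
# below are one common way to wire them together, offered as a sketch (the
# parameter grid, estimator choice, and file name are assumptions, not from
# the original).
from sklearn.model_selection import GridSearchCV

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("pca", PCA()),
    ("clf", LogisticRegression(max_iter=1000)),
])
grid = GridSearchCV(pipe, {"pca__n_components": [2, 3], "clf__C": [0.1, 1.0, 10.0]}, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.score(X_test, y_test))
joblib.dump(grid.best_estimator_, "iris_pipeline.joblib")  # assumed file name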
SELECT
    CASE WHEN TipPercentage < 0 THEN 'No Tip'
         WHEN TipPercentage BETWEEN 0 AND 5 THEN 'Less but still a Tip'
         WHEN TipPercentage BETWEEN 5 AND 10 THEN 'Decent Tip'
         WHEN TipPercentage > 10 THEN 'Good Tip'
         ELSE 'Something different'
    END AS TipRange,
    Hr,
    Wk,
    TripMonth,
class Vocabulary:
    PAD_token = 0  # Used for padding short sentences
    SOS_token = 1  # Start-of-sentence token
    EOS_token = 2  # End-of-sentence token

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # the class-level token constants must be qualified with self (or Vocabulary.) here
        self.index2word = {self.PAD_token: "PAD", self.SOS_token: "SOS", self.EOS_token: "EOS"}
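    # The original snippet stops after __init__; the method below is an assumed
    # sketch of the word-registration step such a vocabulary typically needs
    # (its name and behaviour are not from the original).
    def add_word(self, word):
        if word not in self.word2index:
            idx = len(self.index2word)          # next free index after PAD/SOS/EOS
            self.word2index[word] = idx
            self.index2word[idx] = word
            self.word2count[word] = 1
        else:
            self.word2count[word] += 1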