Plegas Gerasimos (makispl)
# Create the corpus
corpus = training_set['SMS'].sum()
# Create the vocabulary
vocabulary = list(set(corpus))
# Create a dictionary mapping each unique word to a per-message list of counts
len_training_set = len(training_set['SMS'])
word_counts_per_sms = {unique_word: [0] * len_training_set for unique_word in vocabulary}
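The preview stops before the counting step; a minimal sketch of how this dictionary is typically filled and turned into a dataframe, assuming each entry of training_set['SMS'] is already a list of tokens:

import pandas as pd

# Sketch (assumption): each SMS is a list of tokens, so we can increment
# the per-message count of every word we meet
for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

# Each vocabulary word becomes a column of per-message counts
word_counts = pd.DataFrame(word_counts_per_sms)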
makispl / ds-project-organization.md
Created January 7, 2021 19:54 — forked from ericmjl/ds-project-organization.md
How to organize your Python data science project

Having done a number of data projects over the years, and having seen a number of them up on GitHub, I've come to see that there's a wide range in terms of how "readable" a project is. I'd like to share some practices that I have come to adopt in my projects, which I hope will bring some organization to your projects.

Disclaimer: I'm hoping nobody takes this to be "the definitive guide" to organizing a data project; rather, I hope you, the reader, find useful tips that you can adapt to your own projects.

Disclaimer 2: What I’m writing below is primarily geared towards Python language users. Some ideas may be transferable to other languages; others may not be so. Please feel free to remix whatever you see here!

Disclaimer 3: I found the Cookiecutter Data Science page after finishing this blog post. Many ideas overlap here, though some directories are irrelevant in my work -- which is to

import requests
from bs4 import BeautifulSoup

# Make a GET request to the site
response = requests.get("http://whiskyadvocate.com/ratings-reviews/?search=&submit=+&brand_id=0&rating=0&price=0&category=1%2C3%2C4%2C6%2C51&styles_id=0&issue_id=0")
# Get the content of the response
content = response.content
# Initialize the parser and pass in the content we grabbed earlier
parser = BeautifulSoup(content, 'html.parser')
def fetch_data(parser, *args):
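    # Sketch (assumption): the preview stops at the signature; one plausible
    # body treats each extra argument as a CSS selector and collects the
    # text of every element that matches it
    results = {}
    for selector in args:
        results[selector] = [el.get_text(strip=True) for el in parser.select(selector)]
    return results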
import pandas as pd

# Read in the initial dataset
df_init = pd.read_csv('whiskey_data.csv')
display(df_init.shape)
df_init.head()
# Check for nulls
df_init.info()
# Check and drop redundant columns (assumed: single-valued ones)
for col in df_init.columns:
    if df_init[col].nunique(dropna=False) <= 1:
        df_init = df_init.drop(columns=col)
# Find the unique categories
df.category.value_counts()
# Subset the df by the category column
sngl_mlt = df[df['category'] == 'Single Malt Scotch'].copy()
blnd = df[df['category'] == 'Blended Scotch Whisky'].copy()
blnd_mlt = df[df['category'] == 'Blended Malt Scotch Whisky'].copy()
# Calculate the mean of the numerical columns per category
df.groupby('category').mean(numeric_only=True)
makispl / kmeans.py
Last active February 22, 2021 07:45
from sklearn.cluster import KMeans

# Switch to a new dataframe, reduced to the rows with no nulls
df_no_nuls = df.dropna().copy()
# Subset to the numerical columns we are about to feed the ML algorithm
data = df_no_nuls[['rating', 'alcohol', 'age']].copy()

# Instantiate a KMeans model with 3 clusters, fit and predict cluster indices
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=1)
kmeans.fit_predict(data)

# Attach the cluster labels and inspect the cluster sizes
df_no_nuls['cluster'] = kmeans.labels_
data['cluster'] = kmeans.labels_
df_no_nuls.cluster.value_counts()
# Calculate the WCSS (within-cluster sum of squares) for a range of cluster counts
max_clusters = 11
wcss = list()
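The preview cuts off before the loop that fills wcss; a minimal sketch of the usual elbow computation, assuming the same features and KMeans settings as the model above:

for k in range(1, max_clusters):
    model = KMeans(n_clusters=k, init='k-means++', random_state=1)
    model.fit(data[['rating', 'alcohol', 'age']])
    # inertia_ holds the within-cluster sum of squares for this k
    wcss.append(model.inertia_)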
# Switch to a copy of the labeled dataframe
df_no_nuls_2 = df_no_nuls.copy()
import numpy as np

# Randomise the df
shuffled_rows = np.random.permutation(df_no_nuls_2.index)
df_no_nuls_2 = df_no_nuls_2.loc[shuffled_rows]
# Split to train and test datasets
train = df_no_nuls_2.iloc[:int(df_no_nuls_2.shape[0]*0.8)].copy()
test = df_no_nuls_2.iloc[int(df_no_nuls_2.shape[0]*0.8):].copy().reset_index(drop=True)
display(df_num[df_num['age'] > 30]['price'].sort_values(ascending=False).head(10))
makispl / plays_df.csv
Last active August 31, 2021 07:22
'plays' Dataset Sample
GAME_ID,GAME_DATE,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,DEF_RATING,E_NET_RATING,NET_RATING,AST_PCT,AST_TOV,AST_RATIO,OREB_PCT,DREB_PCT,REB_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
0021700002,2017-10-17,1610612745,HOU,Houston,2772,Trevor Ariza,Trevor,F,,37:39,107.3,111.3,121.4,121.3,-14.1,-10.0,0.152,5.0,33.3,0.023,0.156,0.079,6.7,0.444,0.444,0.109,0.111,103.83,101.99,84.99,80,0.063
0021700002,2017-10-17,1610612745,HOU,Houston,201583,Ryan Anderson,Ryan,F,,33:17,117.8,127.4,119.4,119.7,-1.7,7.7,0.031,0.0,7.7,0.079,0.2,0.127,0.0,0.542,0.542,0.141,0.141,108.25,103.84,86.53,73,0.064
0021700002,2017-10-17,1610612745,HOU,Houston,203991,Clint Capela,Clint,C,,18:10,95.4,95.1,153.0,155.0,-57.6,-59.9,0.0,0.0,0.0,0.05,0.25,0.125,16.7,0.6,0.575,0.267,0.273,107.54,107.01,89.17,41,0.092
0021700002,2017-10-17,1610612745,HOU,Houston,201935,James Harden,James,G,,36:24,117.6,126.0,126.2,124.7,-8.6,
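A minimal sketch for loading this sample with pandas, assuming it is saved as plays_df.csv; GAME_ID is read as a string so its leading zeros survive:

import pandas as pd

# Parse game dates as datetimes and keep GAME_ID as text
plays = pd.read_csv('plays_df.csv', dtype={'GAME_ID': str}, parse_dates=['GAME_DATE'])
print(plays[['PLAYER_NAME', 'MIN', 'OFF_RATING', 'DEF_RATING']].head())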