---
Missing data - rows
---
# Check how many values are missing in the category_desc column
print(volunteer["category_desc"].isnull().sum())
# Subset the volunteer dataset
volunteer_subset = volunteer[volunteer["category_desc"].notnull()]
# Print out the shape of the subset
print(volunteer_subset.shape)
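These exercises assume pandas is imported and a volunteer DataFrame is already loaded; a minimal setup sketch, with "volunteer.csv" as a hypothetical filename:
import pandas as pd
# Hypothetical path; the exercises assume this DataFrame is preloaded
volunteer = pd.read_csv("volunteer.csv")
# An equivalent way to drop rows with a missing category_desc
volunteer_subset = volunteer.dropna(subset=["category_desc"])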
-----
Converting a column type
-----
# Print the head of the hits column
print(volunteer["hits"].head())
# Convert the hits column to type int
volunteer["hits"] = volunteer["hits"].astype("int")
# Look at the dtypes of the dataset
print(volunteer.dtypes)
------
Stratified sampling
------
# Create a dataset with all columns except category_desc
volunteer_X = volunteer.drop("category_desc", axis=1)
# Create a category_desc labels dataset
volunteer_y = volunteer[["category_desc"]]
# Use stratified sampling to split up the dataset according to the volunteer_y dataset
X_train, X_test, y_train, y_test = train_test_split(volunteer_X, volunteer_y, stratify=volunteer_y)
# Print out the category_desc counts on the training y labels
print(y_train["category_desc"].value_counts())
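train_test_split is assumed to be imported already; a minimal sketch of that setup (random_state is an optional addition for reproducibility, not part of the exercise):
from sklearn.model_selection import train_test_split
# stratify=volunteer_y keeps the class proportions of category_desc in both splits
X_train, X_test, y_train, y_test = train_test_split(
    volunteer_X, volunteer_y, stratify=volunteer_y, random_state=42
)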
-----
Log Normalization
-----
# Print out the variance of the Proline column
print(wine["Proline"].var())
# Apply the log normalization function to the Proline column
wine["Proline_log"] = np.log(wine["Proline"])
# Check the variance of the normalized Proline column
print(wine["Proline_log"].var())
-----
Scaling data - standardizing columns
-----
# Import StandardScaler from scikit-learn
from sklearn.preprocessing import StandardScaler
# Create the scaler
ss = StandardScaler()
# Take a subset of the DataFrame you want to scale
wine_subset = wine[["Ash", "Alcalinity of ash", "Magnesium"]]
# Apply the scaler to the DataFrame subset
wine_subset_scaled = ss.fit_transform(wine_subset)
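fit_transform returns a plain NumPy array; if the scaled values are needed back in a labeled DataFrame, a small sketch like this works:
# Wrap the scaled array back into a DataFrame, keeping column names and index
wine_subset_scaled = pd.DataFrame(
    ss.fit_transform(wine_subset),
    columns=wine_subset.columns,
    index=wine_subset.index,
)
print(wine_subset_scaled.describe())  # each column now has mean ~0 and std ~1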
------
LabelEncoder
------
# Set up the LabelEncoder object
enc = LabelEncoder()
# Apply the encoding to the "Accessible" column
hiking["Accessible_enc"] = enc.fit_transform(hiking["Accessible"])
# Compare the two columns
print(hiking[["Accessible_enc", "Accessible"]].head())
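LabelEncoder is assumed to be imported from sklearn.preprocessing; the sketch below also shows how to map the integer codes back to the original string labels:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
hiking["Accessible_enc"] = enc.fit_transform(hiking["Accessible"])
# inverse_transform recovers the original string labels from the integer codes
print(enc.inverse_transform(hiking["Accessible_enc"].head()))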
-----
Encoding categorical variables - one-hot
----
# Transform the category_desc column
category_enc = pd.get_dummies(volunteer["category_desc"])
# Take a look at the encoded columns
print(category_enc.head())
------
Engineering numerical features - taking an average
-------
# Create a list of the columns to average
run_columns = ["run1", "run2", "run3", "run4", "run5"]
# Use apply to create a mean column
running_times_5k["mean"] = running_times_5k.apply(lambda row: row[run_columns].mean(), axis=1)
# Take a look at the results
print(running_times_5k)
-------
Engineering numerical features - datetime
-------
# First, convert string column to date column
volunteer["start_date_converted"] = pd.to_datetime(volunteer["start_date_date"])
# Extract just the month from the converted column
volunteer["start_date_month"] = volunteer["start_date_converted"].apply(lambda row: row.month)
# Take a look at the converted and new month columns
print(volunteer[["start_date_converted", "start_date_month"]].head())
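An equivalent, vectorized way to pull out the month uses the .dt accessor instead of a row-wise apply:
# .dt.month returns the same values without iterating row by row
volunteer["start_date_month"] = volunteer["start_date_converted"].dt.month
print(volunteer[["start_date_converted", "start_date_month"]].head())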
-----------
Engineering features from strings - extraction
-----------
# Write a pattern to extract numbers and decimals
def return_mileage(length):
    pattern = re.compile(r"\d+\.\d+")
    # Search the text for matches
    mile = re.match(pattern, length)
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))
# Apply the function to the Length column and take a look at both columns
hiking["Length_num"] = hiking["Length"].apply(lambda row: return_mileage(row))
print(hiking[["Length", "Length_num"]].head())
------
Engineering features from strings - tf/idf
------
# Take the title text
title_text = volunteer["title"]
# Create the vectorizer method
tfidf_vec = TfidfVectorizer()
# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)
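TfidfVectorizer comes from sklearn.feature_extraction.text, and the vocab dictionary used in the next two exercises is assumed to be the reverse of tfidf_vec.vocabulary_ (column index -> word); a sketch of both:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
text_tfidf = tfidf_vec.fit_transform(volunteer["title"])
# vocabulary_ maps word -> column index; the exercises below need the reverse mapping
vocab = {v: k for k, v in tfidf_vec.vocabulary_.items()}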
-------
Text classification using tf/idf vectors
------
# Split the dataset according to the class distribution of category_desc
y = volunteer["category_desc"]
X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y, stratify=y)
# Fit the model to the training data
nb.fit(X_train, y_train)
# Print out the model's accuracy
print(nb.score(X_test, y_test))
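nb is assumed to be a Gaussian Naive Bayes model created before this snippet; a minimal sketch:
from sklearn.naive_bayes import GaussianNB
# GaussianNB expects dense input, which is why the sparse tf-idf matrix is
# converted with .toarray() before splitting
nb = GaussianNB()
nb.fit(X_train, y_train)
print(nb.score(X_test, y_test))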
-----
Exploring text vectors, part 1
------
# Add in the rest of the parameters
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
# Let's transform that zipped dict into a series
zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
# Let's sort the series to pull out the top n weighted words
zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
return [original_vocab[i] for i in zipped_index]
# Print out the weighted words
print(return_weights(vocab, tfidf_vec.vocabulary_, text_tfidf, 8, 3))
------
Exploring text vectors, part 2
-----
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
        # Here we'll call the function from the previous exercise, and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)
# Call the function to get the list of word indices
filtered_words = words_to_filter(vocab, tfidf_vec.vocabulary_, text_tfidf, 3)
# By converting filtered_words back to a list, we can use it to filter the columns in the text vector
filtered_text = text_tfidf[:, list(filtered_words)]
----
# Split the dataset according to the class distribution of category_desc
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)
# Fit the model to the training data
nb.fit(train_X, train_y)
# Print out the model's accuracy
print(nb.score(test_X, test_y))
----
Using PCA
-----
from sklearn.decomposition import PCA
# Set up PCA and the X vector for dimensionality reduction
pca = PCA()
wine_X = wine.drop("Type", axis=1)
# Apply PCA to the wine dataset
transformed_X = pca.fit_transform(wine_X)
# Look at the percentage of variance explained by the different components
print(pca.explained_variance_ratio_)
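A quick follow-up sketch: the cumulative sum of the explained-variance ratios shows how many components are needed to cover most of the variance, which helps when choosing n_components.
import numpy as np
# Cumulative share of variance explained by the first k components
print(np.cumsum(pca.explained_variance_ratio_))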
------
UFO PROJECT
------
Checking column types
----
# Check the column types
print(ufo.dtypes)
# Change the type of seconds to float
ufo["seconds"] = ufo["seconds"].astype(float)
# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo["date"])
# Check the column types
print(ufo[["seconds", "date"]].dtypes)
---------
Dropping missing data
-------
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[["length_of_time", "state", "type"]].isnull().sum())
# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ufo["length_of_time"].notnull() &
                     ufo["state"].notnull() &
                     ufo["type"].notnull()]
# Print out the shape of the new dataset
print(ufo_no_missing.shape)
------
Extracting numbers from strings
------
def return_minutes(time_string):
    # We'll use \d+ to grab digits and match it to the column values
    pattern = re.compile(r"\d+")
    # Use match on the pattern and column
    num = re.match(pattern, time_string)
    if num is not None:
        return int(num.group(0))
# Apply the extraction to the length_of_time column
ufo["minutes"] = ufo["length_of_time"].apply(return_minutes)
# Take a look at the head of both of the columns
print(ufo[["length_of_time", "minutes"]].head())
-------
Identifying features for standardization
-----
# Check the variance of the seconds and minutes columns
print(ufo[["seconds", "minutes"]].var())
# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo["seconds"])
# Print out the variance of just the seconds_log column
print(ufo["seconds_log"].var())
-------
Encoding categorical variables
-----
# Use pandas to encode "us" values as 1 and all other countries as 0
ufo["country_enc"] = ufo["country"].apply(lambda val: 1 if val == "us" else 0)
# Print the number of unique type values
print(len(ufo["type"].unique()))
# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo["type"])
# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)
--------
Features from dates
--------
# Look at the first 5 rows of the date column
print(ufo["date"].head())
# Extract the month from the date column
ufo["month"] = ufo["date"].apply(lambda row: row.month)
# Extract the year from the date column
ufo["year"] = ufo["date"].apply(lambda row: row.year)
# Take a look at the head of all three columns
print(ufo[["date", "month", "year"]].head())
---------
Text vectorization
-------
# Take a look at the head of the desc field
print(ufo["desc"].head())
# Create the tfidf vectorizer object
vec = TfidfVectorizer()
# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo["desc"])
# Look at the number of columns this creates.
print(desc_tfidf.shape)
-------
Selecting the ideal dataset
------
# Check the correlation between the seconds, seconds_log, and minutes columns
print(ufo[["seconds", "seconds_log", "minutes"]].corr())
# Make a list of features to drop
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]
# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)
# Let's also filter some words out of the text vector we created
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)
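The vocab dictionary used just above and the X and y used in the modeling exercises below are not defined in this gist; a plausible sketch of that setup, assuming vocab mirrors the earlier index-to-word mapping and part 1 predicts country_enc from the remaining features:
# Rebuild the index -> word mapping for the UFO vectorizer
vocab = {v: k for k, v in vec.vocabulary_.items()}
# Assumed setup for the modeling steps: predict whether a sighting is in the US
X = ufo_dropped.drop("country_enc", axis=1)
y = ufo_dropped["country_enc"]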
------
Modeling the UFO dataset, part 1
------
# Take a look at the features in the X set of data
print(X.columns)
# Split the X and y sets using train_test_split, setting stratify=y
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y)
# Fit knn to the training sets
knn.fit(train_X, train_y)
# Print the score of knn on the test sets
print(knn.score(test_X, test_y))
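knn is assumed to be a k-nearest-neighbors classifier instantiated earlier; a minimal sketch:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(train_X, train_y)
print(knn.score(test_X, test_y))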
------
Modeling the UFO dataset, part 2
-----
# Use the list of filtered words we created to filter the text vector
filtered_text = desc_tfidf[:, list(filtered_words)]
# Split the X and y sets using train_test_split, setting stratify=y
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)
# Fit nb to the training sets
nb.fit(train_X, train_y)
# Print the score of nb on the test sets
print(nb.score(test_X, test_y))