This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words.

    Each word is NFKD-normalized first, so an accented letter decomposes into
    its base letter plus combining marks. Encoding to ASCII with errors
    ignored then drops the marks (and every other non-ASCII character) while
    keeping the base letter.

    Args:
        words: Iterable of strings.

    Returns:
        A new list holding one ASCII-only string per input word, in the same
        order. A word consisting solely of characters with no ASCII
        decomposition becomes an empty string.
    """
    return [
        unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')
        for word in words
    ]
def to_lowercase(words): | |
"""Convert all characters to lowercase from list of tokenized words""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split the sample text into word tokens and display them.
words = nltk.word_tokenize(sample)
print(words)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def replace_contractions(text):
    """Replace contractions in string of text.

    Thin wrapper that passes ``text`` to ``contractions.fix`` and returns
    whatever that call returns.
    """
    return contractions.fix(text)
# Expand the contractions in the sample text and display the result.
sample = replace_contractions(sample)
print(sample)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def strip_html(text):
    """Return the text content of ``text`` with its HTML markup removed.

    The input is parsed by BeautifulSoup using the "html.parser" backend and
    the result of ``get_text()`` on the parsed document is returned.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
def remove_between_square_brackets(text):
    """Remove every ``[...]`` span, brackets included, from ``text``.

    A span runs from an opening ``[`` to the first ``]`` that follows it, so
    with nested brackets only the part up to the first closing bracket is
    removed.

    Args:
        text: String to clean.

    Returns:
        ``text`` with all matched bracketed spans deleted.
    """
    # Raw string: ``\[`` and ``\]`` are regex escapes, not valid Python string
    # escapes, and a plain literal triggers an invalid-escape warning.
    return re.sub(r'\[[^]]*\]', '', text)
def denoise_text(text): | |
text = strip_html(text) | |
text = remove_between_square_brackets(text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re, string, unicodedata | |
import nltk | |
import contractions | |
import inflect | |
from bs4 import BeautifulSoup | |
from nltk import word_tokenize, sent_tokenize | |
from nltk.corpus import stopwords | |
from nltk.stem import LancasterStemmer, WordNetLemmatizer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split full dataset into train and test sets
# The first 889 rows form the training set; taken as an independent copy so
# later edits to it are not edits to a slice of titanic_full.
train = titanic_full[:889].copy()
train.name = 'titanic_train_clean'
# The remaining rows form the test set, without the 'Survived' column.
# drop() returns a new frame, so the name is attached afterwards.
test = titanic_full[889:].drop(['Survived'], axis=1)
test.name = 'titanic_test_clean'
def validate_test_split(df, validate_percent=.25, seed=42): | |
np.random.seed(seed) | |
perm = np.random.permutation(df.index) | |
m = len(df.index) | |
validate_end = int(validate_percent * m) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One-hot encode the categorical columns. Each listed column is removed and
# its indicator columns are appended after the remaining columns, in the
# order listed here, using the given prefix (e.g. 'Pclass' -> 'Class_<value>').
titanic_full = pd.get_dummies(
    titanic_full,
    columns=['Pclass', 'Embarked', 'Title'],
    prefix={'Pclass': 'Class', 'Embarked': 'Embarked', 'Title': 'Title'},
)
titanic_full.name = 'titanic_full'  # need to reset, as this is a copy of original df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Drop the 2 Embarked missing values
titanic_full.dropna(subset=['Embarked'], inplace=True)
# Replace missing Fare value with mean; this value is in our test set (see further above)
# The filled column is assigned back: an in-place fillna on the column
# selection is not guaranteed to write through to the frame.
titanic_full['Fare'] = titanic_full['Fare'].fillna(titanic_full['Fare'].mean())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Determine if the passenger is alone or with family
# 'Alone' is 1 when both SibSp and Parch are 0, otherwise 0.
titanic_full['Alone'] = np.where(
    (titanic_full['SibSp'] == 0) & (titanic_full['Parch'] == 0), 1, 0
)
# Drop SibSp & Parch
titanic_full.drop(['SibSp', 'Parch'], axis=1, inplace=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_age(df):
    """Fill missing 'Age' values in ``df`` in place with group medians.

    A missing age is replaced by the median of the known ages of rows that
    share the same 'Pclass', 'Sex' and 'Title'. When that group has no known
    age, the median over the same 'Pclass' and 'Sex' is used instead. Rows
    whose age is already present are left untouched.

    Args:
        df: DataFrame with 'Pclass', 'Sex', 'Title' and 'Age' columns. It is
            modified in place.
    """
    missing = df['Age'].isnull()
    # Median of the known ages within each row's own class/sex/title group.
    # Title is part of the grouping so that, for example, a 'Master' is not
    # given the median age of the 'Mr' rows.
    by_title = df.groupby(['Pclass', 'Sex', 'Title'])['Age'].transform('median')
    # Coarser fallback for class/sex/title groups with no known age at all.
    by_class_sex = df.groupby(['Pclass', 'Sex'])['Age'].transform('median')
    fill = by_title.fillna(by_class_sex)
    # Assign by position so a non-unique index cannot break label alignment.
    df.loc[missing, 'Age'] = fill[missing].to_numpy()