Skip to content

Instantly share code, notes, and snippets.

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words.

    Each word is NFKD-normalized first, so an accented letter decomposes into
    its base letter plus combining marks; encoding to ASCII with ``'ignore'``
    then drops the marks and any other non-ASCII code point.

    Args:
        words: Iterable of ``str`` tokens.

    Returns:
        A new list with one ASCII-only string per input word, in the same
        order. A word made up entirely of non-ASCII characters becomes ``''``.
    """
    return [
        unicodedata.normalize('NFKD', word)
        .encode('ascii', 'ignore')
        .decode('ascii')  # bytes are pure ASCII here, so this cannot fail
        for word in words
    ]
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words.

    Args:
        words: Iterable of ``str`` tokens.

    Returns:
        A new list holding the lowercased form of each word, in input order.
    """
    return [word.lower() for word in words]
# Split the `sample` text into word tokens with NLTK and print the token list.
# NOTE(review): `sample` is not assigned anywhere in the visible part of the
# file — confirm it is defined before this runs.
words = nltk.word_tokenize(sample)
print(words)
def replace_contractions(text):
    """Return ``text`` after running it through ``contractions.fix``.

    Args:
        text: The string to process.

    Returns:
        Whatever ``contractions.fix`` produces for ``text``.
    """
    expanded = contractions.fix(text)
    return expanded
# Rebind `sample` to its contraction-replaced form and print the result.
# NOTE(review): relies on `sample` already holding the input text — it is not
# assigned in the visible part of the file.
sample = replace_contractions(sample)
print(sample)
def strip_html(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its text.

    Args:
        text: Markup string to parse.

    Returns:
        The result of ``get_text()`` on the parsed document.
    """
    return BeautifulSoup(text, "html.parser").get_text()
def remove_between_square_brackets(text):
    """Remove every ``[...]`` span, brackets included, from ``text``.

    A span starts at a ``[`` and ends at the next ``]``; the characters in
    between may be anything except ``]`` (including newlines and further
    ``[`` characters).

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all such spans deleted. Surrounding whitespace is kept.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)
def denoise_text(text):
    """Clean ``text`` by applying the two noise-removal helpers in sequence.

    First ``strip_html`` is applied, then ``remove_between_square_brackets``
    is applied to its result.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    # Hand the cleaned text back; without this the function returns None.
    return text
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
# Split full dataset into train and test sets: the first 889 rows become the
# train frame, the remaining rows the test frame.
# .copy() / a non-inplace drop give each split its own data, so the attribute
# assignment and column removal never act on a slice of titanic_full (this
# avoids SettingWithCopyWarning and stays correct under copy-on-write).
train = titanic_full[:889].copy()
train.name = 'titanic_train_clean'
# The test frame does not keep the 'Survived' column.
test = titanic_full[889:].drop(['Survived'], axis=1)
# Set the name after drop(): drop() returns a new DataFrame object.
test.name = 'titanic_test_clean'
# Shuffle df's index reproducibly and compute how many shuffled rows make up
# the validation share (validate_percent of the row count).
# NOTE(review): the visible body ends after computing validate_end — nothing is
# sliced or returned here; the rest of the function appears to be cut off.
def validate_test_split(df, validate_percent=.25, seed=42):
# Seed NumPy's global RNG so the permutation below is repeatable for a given seed.
np.random.seed(seed)
# Randomly ordered copy of the index labels.
perm = np.random.permutation(df.index)
# Number of rows in df.
m = len(df.index)
# Position in `perm` where the validation share ends; int() truncates toward zero.
validate_end = int(validate_percent * m)
# One-hot encode Pclass, Embarked and Title in a single pass: each source
# column is removed and its indicator columns (prefixed Class_, Embarked_ and
# Title_ respectively) are appended after the remaining columns, in that order.
titanic_full = pd.get_dummies(
    titanic_full,
    columns=['Pclass', 'Embarked', 'Title'],
    prefix=['Class', 'Embarked', 'Title'],
)
# get_dummies returns a new DataFrame, so the custom attribute must be set again.
titanic_full.name = 'titanic_full'
# Drop the 2 Embarked missing values
titanic_full.dropna(subset=['Embarked'], inplace=True)
# Replace missing Fare value with mean; this value is in our test set (see further above).
# The filled column is assigned back rather than calling fillna(inplace=True) on
# titanic_full['Fare']: that form is chained in-place assignment, which pandas
# warns about and which does not update titanic_full under copy-on-write.
titanic_full['Fare'] = titanic_full['Fare'].fillna(titanic_full['Fare'].mean())
# Flag whether the passenger is alone or with family:
# Alone is 1 when both SibSp and Parch are 0, otherwise 0.
titanic_full['Alone'] = np.where(
    titanic_full['SibSp'].eq(0) & titanic_full['Parch'].eq(0),
    1,
    0,
)
# Remove the SibSp and Parch columns from the frame.
titanic_full.drop(columns=['SibSp', 'Parch'], inplace=True)
def get_age(df):
    """Fill missing ``Age`` values in place with a per-group median.

    Rows are grouped by ``Pclass`` (1, 2, 3), ``Sex`` ('male', 'female') and
    ``Title`` ('Miss', 'Mr', 'Master', 'Mrs', 'Unknown'). Every row with a
    missing ``Age`` receives the median of the known ages in its own group.

    Args:
        df: DataFrame with ``Pclass``, ``Sex``, ``Title`` and ``Age`` columns.
            It is modified in place.

    Returns:
        None. Rows whose group has no known ages, or whose values fall outside
        the combinations listed above, keep their missing ``Age``.
    """
    # Snapshot taken once: each row belongs to exactly one
    # (Pclass, Sex, Title) group, so it is filled at most once.
    missing_age = df['Age'].isnull()
    for pclass in [1, 2, 3]:
        for sex in ['male', 'female']:
            for title in ['Miss', 'Mr', 'Master', 'Mrs', 'Unknown']:
                in_group = (
                    (df['Pclass'] == pclass)
                    & (df['Sex'] == sex)
                    & (df['Title'] == title)
                )
                # median() skips NaN; an empty or all-missing group yields
                # NaN, which leaves that group's missing ages unchanged.
                median = df.loc[in_group, 'Age'].median()
                # Restrict the fill to this title's rows so one title's median
                # is never written onto another title's passengers.
                df.loc[missing_age & in_group, 'Age'] = median