Matthew Mayo mmmayo13

## text_data_preprocessing_5.py
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in a list of words.

    Each word is NFKD-normalized first, so a character that decomposes into
    an ASCII base plus combining marks keeps its base; anything that still
    cannot be encoded as ASCII is dropped. A new list with one entry per
    input word is returned (a word may become an empty string).
    """
    def _to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words.

    Returns a new list holding the lowercased form of each word, in the
    same order; the input list is not modified.
    """
    return [word.lower() for word in words]

## text_data_preprocessing_4.py
# Tokenize the sample text with nltk.word_tokenize and print the token list
words = nltk.word_tokenize(sample)
print(words)

## text_data_preprocessing_3.py
def replace_contractions(text):
    """Return `text` with its contractions replaced via `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

# Rebind `sample` to its contraction-replaced form and print the result
sample = replace_contractions(sample)
print(sample)

## text_data_preprocessing_2.py
def strip_html(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Remove every `[...]` span (brackets included) from `text`.

    A span runs from a `[` to the first `]` that follows it; an opening
    bracket with no closing bracket is left untouched.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\[[^]]*\]', '', text)

def denoise_text(text):
    """Strip HTML markup and square-bracketed spans from `text`.

    Runs `strip_html` first and `remove_between_square_brackets` on its
    output, then returns the cleaned string.
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    # Hand the cleaned text back; without this the result was discarded.
    return text

## text_data_preprocessing_1.py
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

## titanic-prep-12.py
# Split full dataset into train and test sets
# Split full dataset into train and test sets: the first 889 rows become the
# training frame, every remaining row the test frame
train = titanic_full[:889]
train.name = 'titanic_train_clean'

test = titanic_full[889:]
test.name = 'titanic_test_clean'

# The 'Survived' column is removed from the test frame only
test.drop(labels=['Survived'], axis='columns', inplace=True)

def validate_test_split(df, validate_percent=.25, seed=42):
    """Set up a seeded random split of `df`'s index.

    NOTE(review): as visible here the function only seeds NumPy's global
    RNG, permutes the index and computes a cut-off position; nothing is
    sliced or returned. The body looks truncated -- confirm against the
    complete source before relying on it.
    """
    np.random.seed(seed)  # seed the global NumPy RNG so the shuffle is repeatable
    perm = np.random.permutation(df.index)  # shuffled copy of the index labels
    m = len(df.index)
    validate_end = int(validate_percent * m)  # validate_percent of the row count, truncated to an int

## titanic-prep-11.py
# One-hot encode the categorical columns: each source column is swapped for
# its indicator columns, which are named with the paired prefix
for source_col, dummy_prefix in [('Pclass', 'Class'), ('Embarked', 'Embarked'), ('Title', 'Title')]:
    indicators = pd.get_dummies(titanic_full[source_col], prefix=dummy_prefix)
    titanic_full = pd.concat([titanic_full.drop(source_col, axis=1), indicators], axis=1)

# pd.concat produced a new DataFrame, so the custom name attribute is set again
titanic_full.name = 'titanic_full'

## titanic-prep-10.py
# Drop the 2 Embarked missing values
# Drop the 2 Embarked missing values
titanic_full.dropna(subset=['Embarked'], inplace=True)

# Replace missing Fare value with mean; this value is in our test set (see further above).
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the selected column: that in-place call works on an intermediate object
# and does not update titanic_full under pandas Copy-on-Write.
titanic_full['Fare'] = titanic_full['Fare'].fillna(titanic_full['Fare'].mean())

## titanic-prep-9.py
# Determine if the passenger is alone or with family
# Flag whether the passenger is alone (1) or with family (0): alone means
# both the SibSp and the Parch counts are zero
titanic_full['Alone'] = np.where(
    titanic_full['SibSp'].eq(0) & titanic_full['Parch'].eq(0), 1, 0)

# SibSp and Parch are dropped once 'Alone' has been derived from them
titanic_full.drop(['SibSp', 'Parch'], axis=1, inplace=True)

## titanic-prep-8.py
def get_age(df):
    """Fill missing 'Age' values with the median age of matching passengers.

    Passengers are grouped by 'Pclass' (1, 2, 3), 'Sex' ('male', 'female')
    and 'Title' ('Miss', 'Mr', 'Master', 'Mrs', 'Unknown'). Every missing age
    in a group is replaced by that group's median age. Groups with no known
    age have a NaN median, so their missing values stay missing; rows whose
    title is not one of the listed values are left untouched.

    `df` is modified in place and also returned for convenience.
    """
    # Groups are disjoint, so the missing-age mask can be computed once up front.
    missing_age = df['Age'].isnull()
    for pclass in [1, 2, 3]:
        for sex in ['male', 'female']:
            for title in ['Miss', 'Mr', 'Master', 'Mrs', 'Unknown']:
                group = (df['Pclass'] == pclass) & (df['Sex'] == sex) & (df['Title'] == title)
                median = df.loc[group, 'Age'].median()
                # Restrict the fill to the same group the median came from;
                # without the title condition one title's median leaked into
                # every other title of the same class and sex.
                df.loc[missing_age & group, 'Age'] = median
    return df
	def remove_non_ascii(words):
		"""Remove non-ASCII characters from list of tokenized words.

		Each word is NFKD-normalized, encoded to ASCII with unencodable
		characters ignored, and decoded back to a string. Returns a new list
		with one entry per input word (a word may become an empty string).
		"""
		return [
			unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
			for word in words
		]

	def to_lowercase(words):
		"""Convert all characters to lowercase from list of tokenized words.

		Returns a new list holding the lowercased form of each word, in the
		same order; the input list is not modified.
		"""
		return [word.lower() for word in words]
	def replace_contractions(text):
		"""Replace contractions in string of text.

		Passes `text` to `contractions.fix` and returns its result.
		"""
		return contractions.fix(text)

	# Rebind `sample` to its contraction-replaced form and print the result
	sample = replace_contractions(sample)
	print(sample)
	def strip_html(text):
		"""Parse `text` with BeautifulSoup's "html.parser" and return its text content."""
		soup = BeautifulSoup(text, "html.parser")
		return soup.get_text()

	def remove_between_square_brackets(text):
		"""Remove every `[...]` span (brackets included) from `text`.

		A span runs from a `[` to the first `]` that follows it; an opening
		bracket with no closing bracket is left untouched.
		"""
		# Raw string: '\[' in a normal string literal is an invalid escape sequence.
		return re.sub(r'\[[^]]*\]', '', text)

	def denoise_text(text):
		"""Strip HTML markup and square-bracketed spans from `text`.

		Runs `strip_html` first and `remove_between_square_brackets` on its
		output, then returns the cleaned string.
		"""
		text = strip_html(text)
		text = remove_between_square_brackets(text)
		# Hand the cleaned text back; without this the result was discarded.
		return text
	import re, string, unicodedata
	import nltk
	import contractions
	import inflect
	from bs4 import BeautifulSoup
	from nltk import word_tokenize, sent_tokenize
	from nltk.corpus import stopwords
	from nltk.stem import LancasterStemmer, WordNetLemmatizer
	# Split full dataset into train and test sets: the first 889 rows become the
	# training frame, every remaining row the test frame
	train = titanic_full[:889]
	train.name = 'titanic_train_clean'

	test = titanic_full[889:]
	test.name = 'titanic_test_clean'

	# The 'Survived' column is removed from the test frame only
	test.drop(labels=['Survived'], axis='columns', inplace=True)

	def validate_test_split(df, validate_percent=.25, seed=42):
		"""Set up a seeded random split of `df`'s index.

		NOTE(review): as visible here the function only seeds NumPy's global
		RNG, permutes the index and computes a cut-off position; nothing is
		sliced or returned. The body looks truncated -- confirm against the
		complete source before relying on it.
		"""
		np.random.seed(seed)  # seed the global NumPy RNG so the shuffle is repeatable
		perm = np.random.permutation(df.index)  # shuffled copy of the index labels
		m = len(df.index)
		validate_end = int(validate_percent * m)  # validate_percent of the row count, truncated to an int
	# One-hot encode the categorical columns: each source column is swapped for
	# its indicator columns, which are named with the paired prefix
	for source_col, dummy_prefix in [('Pclass', 'Class'), ('Embarked', 'Embarked'), ('Title', 'Title')]:
		indicators = pd.get_dummies(titanic_full[source_col], prefix=dummy_prefix)
		titanic_full = pd.concat([titanic_full.drop(source_col, axis=1), indicators], axis=1)

	# pd.concat produced a new DataFrame, so the custom name attribute is set again
	titanic_full.name = 'titanic_full'
	# Drop the 2 Embarked missing values
	titanic_full.dropna(subset=['Embarked'], inplace=True)

	# Replace missing Fare value with mean; this value is in our test set (see further above).
	# The filled column is assigned back rather than calling fillna(inplace=True)
	# on the selected column: that in-place call works on an intermediate object
	# and does not update titanic_full under pandas Copy-on-Write.
	titanic_full['Fare'] = titanic_full['Fare'].fillna(titanic_full['Fare'].mean())
	# Flag whether the passenger is alone (1) or with family (0): alone means
	# both the SibSp and the Parch counts are zero
	titanic_full['Alone'] = np.where(
		titanic_full['SibSp'].eq(0) & titanic_full['Parch'].eq(0), 1, 0)

	# SibSp and Parch are dropped once 'Alone' has been derived from them
	titanic_full.drop(['SibSp', 'Parch'], axis=1, inplace=True)
	def get_age(df):
		"""Fill missing 'Age' values with the median age of matching passengers.

		Passengers are grouped by 'Pclass' (1, 2, 3), 'Sex' ('male', 'female')
		and 'Title' ('Miss', 'Mr', 'Master', 'Mrs', 'Unknown'). Every missing age
		in a group is replaced by that group's median age. Groups with no known
		age have a NaN median, so their missing values stay missing; rows whose
		title is not one of the listed values are left untouched.

		`df` is modified in place and also returned for convenience.
		"""
		# Groups are disjoint, so the missing-age mask can be computed once up front.
		missing_age = df['Age'].isnull()
		for pclass in [1, 2, 3]:
			for sex in ['male', 'female']:
				for title in ['Miss', 'Mr', 'Master', 'Mrs', 'Unknown']:
					group = (df['Pclass'] == pclass) & (df['Sex'] == sex) & (df['Title'] == title)
					median = df.loc[group, 'Age'].median()
					# Restrict the fill to the same group the median came from;
					# without the title condition one title's median leaked into
					# every other title of the same class and sex.
					df.loc[missing_age & group, 'Age'] = median
		return df