This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def rem_stp_wrd(tweets, stopword_set=None):
    """Remove stop words from every tweet, in place.

    Each tweet is split on whitespace, tokens found in the stop-word
    collection are dropped, and the remaining tokens are re-joined with a
    single space. Matching is exact (case-sensitive), so callers normally
    lower-case the text first.

    Args:
        tweets: Mutable, integer-indexable sequence of strings. It is
            modified in place.
        stopword_set: Optional collection of tokens to drop. When omitted,
            the module-level ``stop_words`` is used (looked up at call time).

    Returns:
        The same ``tweets`` object that was passed in, after modification.
    """
    if stopword_set is None:
        # Resolved lazily so the function can be defined before the
        # module-level ``stop_words`` has been built.
        stopword_set = stop_words
    for idx, tweet in enumerate(tweets):
        kept = [word for word in tweet.split() if word not in stopword_set]
        tweets[idx] = ' '.join(kept)
    return tweets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def preprocess_tweets(tweets):
    """Return a cleaned, lower-cased copy of each tweet.

    Every item is converted with ``str()`` and then passed through these
    steps, in order:

    1. Remove URLs: any run starting with ``http`` up to the next
       whitespace, together with the whitespace that follows it.
    2. Replace every non-word character (anything other than a letter,
       digit or underscore) with a space. This drops punctuation as well
       as ``#`` and ``@``.
    3. Replace a single ASCII letter that is surrounded by whitespace
       with one space.
    4. Remove a leading ``b`` followed by whitespace, e.g. the prefix
       left over from a stringified bytes object.
    5. Convert to lower case.

    Whitespace is not otherwise trimmed or collapsed, so results may
    contain leading, trailing or repeated spaces.

    Args:
        tweets: Iterable of tweets (any objects; each is passed to ``str``).

    Returns:
        A new list of processed strings, in input order. The input is not
        modified.
    """
    processed_features = []
    for tweet in tweets:
        # Raw strings keep the backslashes for the regex engine instead of
        # relying on Python passing unknown escapes through unchanged.
        text = re.sub(r'http\S+\s*', '', str(tweet))  # strip URLs
        text = re.sub(r'\W', ' ', text)  # non-word characters -> space
        text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # isolated single letters
        text = re.sub(r'^b\s+', '', text)  # leading "b " (bytes-repr prefix)
        processed_features.append(text.lower())
    return processed_features
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Two sample raw tweets used to exercise the preprocessing functions.
# The first is the text form of a bytes literal (note the leading b" and the
# escaped "\n" sequences); the second is a plain string containing a URL.
tweets = [
    'b"#AllWomanSpacewalk it\'s real!\\n@Space_Station \\n#EtobicokeNorth #CityofTO #ONPoli #cdnpoli #elxn43 https://t.co/wgdXLksTbQ"',
    "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus; the corpus reader imported above only
# reads its data on first use, so downloading after the import is fine.
nltk.download('stopwords')

# Set of English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One scatter plot of the training points per mixture component, coloured by
# the probability gm.predict_proba assigns to that component.
# Relies on plt, X_train and a fitted gm being defined earlier.
plt.figure(figsize=(10, 10))
# predict_proba does not depend on the loop variable, so compute it once.
component_proba = gm.predict_proba(X_train)
for i in range(5):
    plt.subplot(3, 2, i + 1)  # 3x2 grid; the sixth cell stays empty
    plt.scatter(X_train[:, 0], X_train[:, 1], c=component_proba[:, i], cmap='viridis', marker='x')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.mixture import GaussianMixture

# Fit a 5-component Gaussian mixture with full covariance matrices to the
# training data; random_state is fixed for reproducible results.
# Relies on X_train being defined earlier.
gm = GaussianMixture(n_components=5, covariance_type='full', random_state=0)
gm.fit(X_train)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Estimate Gaussian parameters from the training data, evaluate p on the test
# data, and circle the test points whose p falls below the threshold.
# estimateGaussian, multivariateGaussian, np, plt, X_train and X_test are
# expected to be defined earlier.
density_threshold = 0.001  # points with p below this are marked as outliers

mu, sigma = estimateGaussian(X_train)
p = multivariateGaussian(X_test, mu, sigma)

plt.figure(figsize=(8, 6))
plt.scatter(X_test[:, 0], X_test[:, 1], marker="x", c=p, cmap='viridis')

# Row indices of the test points below the threshold.
outliers = np.nonzero(p < density_threshold)[0]
# Trailing ';' kept from the original (presumably to silence notebook output).
plt.scatter(X_test[outliers, 0], X_test[outliers, 1], marker="o", facecolor="none", edgecolor="r", s=70);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# make_blobs is imported from sklearn.datasets; the old
# sklearn.datasets.samples_generator module path is no longer available.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

# 400 points in 5 tight blobs (cluster_std=0.60), labelled 0 below.
X, y_true = make_blobs(n_samples=400, centers=5, cluster_std=0.60, random_state=1)
# 50 points drawn with a much larger spread (cluster_std=5), labelled 1 below.
X_append, y_true_append = make_blobs(n_samples=50, centers=5, cluster_std=5, random_state=1)

X = np.vstack([X, X_append])
# Discard the blob ids: label 0 for the first set, 1 for the appended set.
y_true = np.hstack([
    np.zeros(len(y_true), dtype=int),
    np.ones(len(y_true_append), dtype=int),
])
X = X[:, ::-1]  # flip axes for better plotting

X_train, X_test, y_train, y_test = train_test_split(
    X, y_true, test_size=0.33, random_state=1, shuffle=True
)
# Trailing ';' kept from the original (presumably to silence notebook output).
plt.scatter(X_train[:, 0], X_train[:, 1], marker="x");
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scatter all points of X coloured by p, then circle those with p < epsilon.
# np, plt, X, p and epsilon are expected to be defined earlier.
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], marker="x", c=p, cmap='viridis')

# Circling of anomalies
outliers = np.nonzero(p < epsilon)[0]  # row indices where p is below epsilon
# Trailing ';' kept from the original (presumably to silence notebook output).
plt.scatter(X[outliers, 0], X[outliers, 1], marker="o", facecolor="none", edgecolor="r", s=70);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scatter all points of X coloured by p and add a colour bar for the scale.
# plt, X and p are expected to be defined earlier.
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], marker="x", c=p, cmap='viridis')
# Trailing ';' kept from the original (presumably to silence notebook output).
plt.colorbar();