# Agasti Kishor Dukare agastidukare

## remove_stopwords.py
def rem_stp_wrd(tweets):
    """Strip stop words from every tweet, rewriting ``tweets`` in place.

    Each entry is split on whitespace, tokens found in the module-level
    ``stop_words`` collection are dropped, and the remaining tokens are
    re-joined with single spaces.  Matching is exact, so letter case and
    attached punctuation matter.

    Returns the same ``tweets`` object that was passed in.
    """
    for position, text in enumerate(tweets):
        kept_tokens = (token for token in text.split() if token not in stop_words)
        tweets[position] = ' '.join(kept_tokens)
    return tweets

## preprocess_tweet.py
def preprocess_tweets(tweets):
    """Normalise raw tweets into lowercase, mostly alphanumeric text.

    Every element is converted with ``str()`` and then passed through these
    steps in order:

    1. Delete URLs: ``http`` plus the non-space run that follows it and any
       whitespace directly after.
    2. Replace every non-word character (punctuation, symbols, whitespace)
       with a single space.
    3. Replace a lone ASCII letter that has whitespace on both sides,
       together with that whitespace, by one space.
    4. Drop a leading ``b`` followed by whitespace, i.e. what is left of the
       ``b"..."`` wrapper of a stringified bytes object after step 2.
    5. Lowercase the result.

    Whitespace is not otherwise collapsed or trimmed, so results can contain
    runs of spaces and a trailing space.

    Args:
        tweets: Iterable of tweets; non-string items are stringified.

    Returns:
        A new list of cleaned strings, one per input item, in input order.
    """
    # Raw strings keep the regex escapes (\S, \W, \s) out of Python's own
    # string-escape handling, which warns on unknown escapes such as "\S".
    substitutions = (
        (re.compile(r'http\S+\s*'), ''),       # URLs
        (re.compile(r'\W'), ' '),              # any non-word character -> space
        (re.compile(r'\s+[a-zA-Z]\s+'), ' '),  # isolated single letters
        (re.compile(r'^b\s+'), ''),            # leftover bytes-literal prefix
    )
    processed_features = []
    for tweet in tweets:
        text = str(tweet)
        for pattern, replacement in substitutions:
            text = pattern.sub(replacement, text)
        processed_features.append(text.lower())
    return processed_features

## Tweets.py
# Two sample inputs: a tweet stored as the text of a bytes literal (note the
# b"..." wrapper and the backslash-n sequences kept as two characters) and a
# plain-text tweet that contains a URL.
tweets = [
    'b"#AllWomanSpacewalk it\'s real!\\n@Space_Station \\n#EtobicokeNorth #CityofTO #ONPoli #cdnpoli #elxn43 https://t.co/wgdXLksTbQ"',
    "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
]

## NLP_Libraries.py
import re
import nltk
from nltk.corpus import stopwords

# Download the "stopwords" corpus before the reader is first used below.
# NOTE(review): importing the reader ahead of the download relies on NLTK
# loading corpus data lazily on first access -- confirm for the pinned version.
nltk.download('stopwords')

# Held as a set so the per-token membership tests during filtering are cheap.
stop_words = set(stopwords.words('english'))

## mixture_pred_prob.py
# One scatter panel per mixture component: the training points, coloured by
# the probability gm.predict_proba assigns them for that component.
plt.figure(figsize=(10,10))
# predict_proba does not depend on the loop index, so evaluate it once
# rather than once per panel.
component_proba = gm.predict_proba(X_train)
for i in range(5):  # assumes gm was built with (at least) 5 components
    plt.subplot(3,2,i+1)  # 3x2 grid, so the sixth cell stays empty
    plt.scatter(X_train[:,0],X_train[:,1],c=component_proba[:,i],cmap='viridis',marker='x')

## gaussian_mixture.py
from sklearn.mixture import GaussianMixture
# Five-component Gaussian mixture with covariance_type 'full'; random_state
# is fixed so repeated runs start from the same initialisation.
gm = GaussianMixture(
    n_components=5,
    covariance_type='full',
    random_state=0,
)
# Estimate the mixture parameters from the training split.
gm.fit(X_train)

## plot_mixture.py
# Parameters come from the training split; p is then evaluated on the test
# split.  (estimateGaussian / multivariateGaussian are defined elsewhere --
# presumably a Gaussian fit and its density; confirm against their source.)
mu, sigma = estimateGaussian(X_train)
p = multivariateGaussian(X_test, mu, sigma)

# Test points, coloured by their value in p.
plt.figure(figsize=(8, 6))
plt.scatter(X_test[:, 0], X_test[:, 1], c=p, cmap='viridis', marker="x")

# Ring in red every test point whose p is below the fixed 0.001 cut-off.
outliers = np.nonzero(p < 0.001)[0]
plt.scatter(
    X_test[outliers, 0],
    X_test[outliers, 1],
    s=70,
    marker="o",
    facecolor="none",
    edgecolor="r",
);

## mixture_data.py
# make_blobs is imported from sklearn.datasets: the former
# sklearn.datasets.samples_generator module path has been removed from
# scikit-learn, while this import works on old and new releases alike.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

# 400 tightly clustered points (std 0.60) around 5 centres ...
X, y_true = make_blobs(n_samples=400, centers=5, cluster_std=0.60, random_state=1)
# ... plus 50 widely spread points (std 5), generated with the same seed.
X_append, y_true_append = make_blobs(n_samples=50, centers=5, cluster_std=5, random_state=1)
X = np.vstack([X, X_append])
# Replace the blob ids with a binary label: 0 for rows from the tight blobs,
# 1 for rows from the widely spread batch.
y_true = np.hstack([
    np.zeros(len(y_true), dtype=int),
    np.ones(len(y_true_append), dtype=int),
])
X = X[:, ::-1]  # flip axes for better plotting
X_train, X_test, y_train, y_test = train_test_split(X, y_true, test_size=0.33, random_state=1, shuffle=True)
# Quick look at the training split.
plt.scatter(X_train[:, 0], X_train[:, 1], marker="x");

## mark_outliers.py
# All rows of X, coloured by p.
# NOTE(review): p is expected to hold one value per row of X -- confirm
# against wherever p and epsilon are computed.
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=p, cmap='viridis', marker="x")

# Circle the anomalies: rows whose p falls below the epsilon threshold.
outliers = np.nonzero(p < epsilon)[0]
plt.scatter(
    X[outliers, 0],
    X[outliers, 1],
    s=70,
    marker="o",
    facecolor="none",
    edgecolor="r",
);

## plot_prob.py
# Scatter of X coloured by p, with a colour bar as the legend for p.
plt.figure(figsize=(8, 6))
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=p,
    cmap='viridis',
    marker="x",
)
plt.colorbar();
	def rem_stp_wrd(tweets):
	    """Remove stop words from each tweet, updating ``tweets`` in place.

	    The nesting of the loop and its body is restored here; as pasted, the
	    function body sat at the same indent as ``def`` and could not parse.

	    Each entry is split on whitespace, tokens present in the module-level
	    ``stop_words`` collection are dropped, and the rest are joined back
	    with single spaces.  Returns the same ``tweets`` object.
	    """
	    for position, text in enumerate(tweets):
	        tweets[position] = ' '.join(
	            token for token in text.split() if token not in stop_words
	        )
	    return tweets
	def preprocess_tweets(tweets):
	    """Clean raw tweets into lowercase, mostly alphanumeric text.

	    The loop nesting is restored here; as pasted, every statement sat at
	    the same indent as ``def`` and the function could not parse.

	    Each element is converted with ``str()`` and then, in order:

	    1. URLs (``http`` plus the following non-space run and any whitespace
	       right after it) are deleted.
	    2. Every non-word character (punctuation, symbols, whitespace) is
	       replaced with a space.
	    3. A lone ASCII letter with whitespace on both sides is replaced,
	       together with that whitespace, by one space.
	    4. A leading ``b`` followed by whitespace is dropped (the remains of a
	       stringified bytes wrapper after step 2).
	    5. The text is lowercased.

	    Whitespace is not otherwise collapsed or trimmed.

	    Args:
	        tweets: Iterable of tweets; non-string items are stringified.

	    Returns:
	        A new list of cleaned strings, one per input item, in input order.
	    """
	    # Raw strings keep the regex escapes (\S, \W, \s) out of Python's own
	    # string-escape handling, which warns on unknown escapes such as "\S".
	    substitutions = (
	        (re.compile(r'http\S+\s*'), ''),       # URLs
	        (re.compile(r'\W'), ' '),              # any non-word character -> space
	        (re.compile(r'\s+[a-zA-Z]\s+'), ' '),  # isolated single letters
	        (re.compile(r'^b\s+'), ''),            # leftover bytes-literal prefix
	    )
	    processed_features = []
	    for tweet in tweets:
	        text = str(tweet)
	        for pattern, replacement in substitutions:
	            text = pattern.sub(replacement, text)
	        processed_features.append(text.lower())
	    return processed_features
	# Two sample inputs: a tweet stored as the text of a bytes literal (b"..."
	# wrapper, backslash-n kept as two characters) and a plain-text tweet
	# that contains a URL.
	tweets = [
	    'b"#AllWomanSpacewalk it\'s real!\\n@Space_Station \\n#EtobicokeNorth #CityofTO #ONPoli #cdnpoli #elxn43 https://t.co/wgdXLksTbQ"',
	    "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",
	]
	import re
	import nltk
	from nltk.corpus import stopwords
	# Download the "stopwords" corpus before the reader is first used below.
	# NOTE(review): importing the reader ahead of the download relies on NLTK
	# loading corpus data lazily on first access -- confirm for the pinned version.
	nltk.download('stopwords')
	# Held as a set so the per-token membership tests during filtering are cheap.
	stop_words = set(stopwords.words('english'))
	# One scatter panel per mixture component: the training points, coloured
	# by the probability gm.predict_proba assigns them for that component.
	# The loop body is re-indented under the for statement; as pasted it sat
	# at the same level and could not parse.
	plt.figure(figsize=(10,10))
	# predict_proba does not depend on the loop index, so evaluate it once.
	component_proba = gm.predict_proba(X_train)
	for i in range(5):  # assumes gm was built with (at least) 5 components
	    plt.subplot(3,2,i+1)  # 3x2 grid, so the sixth cell stays empty
	    plt.scatter(X_train[:,0],X_train[:,1],c=component_proba[:,i],cmap='viridis',marker='x')
	from sklearn.mixture import GaussianMixture
	# Five-component Gaussian mixture with covariance_type 'full';
	# random_state is fixed so repeated runs start from the same initialisation.
	gm = GaussianMixture(
	    n_components=5,
	    covariance_type='full',
	    random_state=0,
	)
	# Estimate the mixture parameters from the training split.
	gm.fit(X_train)
	# Parameters come from the training split; p is then evaluated on the
	# test split.  (estimateGaussian / multivariateGaussian are defined
	# elsewhere -- presumably a Gaussian fit and its density; confirm.)
	mu, sigma = estimateGaussian(X_train)
	p = multivariateGaussian(X_test, mu, sigma)
	# Test points, coloured by their value in p.
	plt.figure(figsize=(8, 6))
	plt.scatter(X_test[:, 0], X_test[:, 1], c=p, cmap='viridis', marker="x")
	# Ring in red every test point whose p is below the fixed 0.001 cut-off.
	outliers = np.nonzero(p < 0.001)[0]
	plt.scatter(
	    X_test[outliers, 0],
	    X_test[outliers, 1],
	    s=70,
	    marker="o",
	    facecolor="none",
	    edgecolor="r",
	);
	# make_blobs is imported from sklearn.datasets: the former
	# sklearn.datasets.samples_generator module path has been removed from
	# scikit-learn, while this import works on old and new releases alike.
	from sklearn.datasets import make_blobs
	from sklearn.model_selection import train_test_split
	# 400 tightly clustered points (std 0.60) around 5 centres ...
	X, y_true = make_blobs(n_samples=400, centers=5, cluster_std=0.60, random_state=1)
	# ... plus 50 widely spread points (std 5), generated with the same seed.
	X_append, y_true_append = make_blobs(n_samples=50, centers=5, cluster_std=5, random_state=1)
	X = np.vstack([X, X_append])
	# Replace the blob ids with a binary label: 0 for rows from the tight
	# blobs, 1 for rows from the widely spread batch.
	y_true = np.hstack([
	    np.zeros(len(y_true), dtype=int),
	    np.ones(len(y_true_append), dtype=int),
	])
	X = X[:, ::-1]  # flip axes for better plotting
	X_train, X_test, y_train, y_test = train_test_split(X, y_true, test_size=0.33, random_state=1, shuffle=True)
	# Quick look at the training split.
	plt.scatter(X_train[:, 0], X_train[:, 1], marker="x");
	# All rows of X, coloured by p.
	# NOTE(review): p is expected to hold one value per row of X -- confirm
	# against wherever p and epsilon are computed.
	plt.figure(figsize=(8, 6))
	plt.scatter(X[:, 0], X[:, 1], c=p, cmap='viridis', marker="x")
	# Circle the anomalies: rows whose p falls below the epsilon threshold.
	outliers = np.nonzero(p < epsilon)[0]
	plt.scatter(
	    X[outliers, 0],
	    X[outliers, 1],
	    s=70,
	    marker="o",
	    facecolor="none",
	    edgecolor="r",
	);
	# Scatter of X coloured by p, with a colour bar as the legend for p.
	plt.figure(figsize=(8, 6))
	plt.scatter(
	    X[:, 0],
	    X[:, 1],
	    c=p,
	    cmap='viridis',
	    marker="x",
	)
	plt.colorbar();