Skip to content

Instantly share code, notes, and snippets.

View agastidukare's full-sized avatar

Agasti Kishor Dukare agastidukare

View GitHub Profile
def rem_stp_wrd(tweets):
    """Strip stop words from every tweet, updating ``tweets`` in place.

    Each tweet is split on whitespace, the tokens found in the module-level
    ``stop_words`` collection are dropped, and the remaining tokens are
    re-joined with single spaces. The comparison is exact (case-sensitive).

    Args:
        tweets: Mutable, index-assignable sequence of strings.

    Returns:
        The same ``tweets`` object that was passed in, after modification.
    """
    for position, tweet in enumerate(tweets):
        kept_words = [word for word in tweet.split() if word not in stop_words]
        tweets[position] = ' '.join(kept_words)
    return tweets
def preprocess_tweets(tweets):
    """Clean raw tweets into lowercase text without URLs or punctuation.

    Every item is converted with ``str()`` first, so non-string items (for
    example ``bytes``) are accepted. The following steps run in order:

    1. Remove each ``http...`` token together with the whitespace after it.
    2. Replace every non-word character (``\\W``) with a space.
    3. Replace a single ASCII letter surrounded by whitespace with one space.
    4. Remove a leading ``b`` followed by whitespace, e.g. the prefix that
       remains when the text was the ``str()`` of a bytes object.
    5. Convert to lowercase.

    Runs of whitespace are not collapsed and the result is not stripped.
    In step 3 matches cannot overlap, so of two adjacent isolated letters
    only the first is removed.

    Args:
        tweets: Iterable of tweets.

    Returns:
        A new list with one cleaned string per input item, in input order.
        The input is not modified.
    """
    # Patterns are raw strings: the original 'http\S+\s*' relied on an invalid
    # escape sequence, which newer Python versions warn about. They are
    # compiled inside the function, once per call, because this definition
    # may appear in the file before ``import re`` has been executed.
    url_pattern = re.compile(r'http\S+\s*')
    non_word_pattern = re.compile(r'\W')
    isolated_letter_pattern = re.compile(r'\s+[a-zA-Z]\s+')
    bytes_prefix_pattern = re.compile(r'^b\s+')

    processed_features = []
    for tweet in tweets:
        text = url_pattern.sub('', str(tweet))
        text = non_word_pattern.sub(' ', text)
        text = isolated_letter_pattern.sub(' ', text)
        text = bytes_prefix_pattern.sub('', text)
        processed_features.append(text.lower())
    return processed_features
# Two sample raw tweets used as input for the cleaning functions.
# The first string literally contains a b"..." wrapper and backslash-n
# sequences; it looks like the str() of a bytes object (TODO confirm source).
# The second is a plain string containing a mention and a URL.
tweets = ['b"#AllWomanSpacewalk it\'s real!\\n@Space_Station \\n#EtobicokeNorth #CityofTO #ONPoli #cdnpoli #elxn43 https://t.co/wgdXLksTbQ"',
"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"]
# re is used by preprocess_tweets for its regular-expression substitutions.
import re
import nltk
# Download the NLTK 'stopwords' resource before it is read below
# (presumably requires network access when not already cached; verify).
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words held in a set; rem_stp_wrd reads this module-level name
# for its membership tests.
stop_words = set(stopwords.words('english'))
@agastidukare
agastidukare / mixture_pred_prob.py
Created January 4, 2020 21:06
This code plots, for each of the five clusters, the probability that each training point belongs to that cluster.
# One scatter plot per mixture component (5 panels on a 3x2 grid): the
# training points, coloured by the probability that the fitted mixture `gm`
# assigns them to that component.
plt.figure(figsize=(10, 10))

# predict_proba does not depend on the loop variable, so evaluate it once
# instead of once per panel; column i holds the values for component i.
membership_probs = gm.predict_proba(X_train)

for i in range(5):
    plt.subplot(3, 2, i + 1)
    plt.scatter(
        X_train[:, 0],
        X_train[:, 1],
        c=membership_probs[:, i],
        cmap='viridis',
        marker='x',
    )
@agastidukare
agastidukare / gaussian_mixture.py
Created January 4, 2020 21:01
This code imports a Gaussian mixture model and fits it to the training data.
from sklearn.mixture import GaussianMixture

# Build a 5-component mixture with full covariance matrices and a fixed seed,
# then fit it to the training data; fit() returns the fitted estimator.
gm = GaussianMixture(
    n_components=5,
    covariance_type='full',
    random_state=0,
).fit(X_train)
@agastidukare
agastidukare / plot_mixture.py
Created January 3, 2020 19:18
This code plots the test points coloured by their estimated probability and circles the low-probability outliers, for data with multiple clusters.
# estimateGaussian and multivariateGaussian are defined outside this snippet;
# presumably they estimate Gaussian parameters from the training data and
# evaluate the resulting density on the test data (confirm against their
# definitions).
mu, sigma = estimateGaussian(X_train)
p = multivariateGaussian(X_test, mu, sigma)

# Test points, coloured by their value in p.
plt.figure(figsize=(8, 6))
plt.scatter(X_test[:, 0], X_test[:, 1], marker="x", c=p, cmap='viridis')

# Indices of the test points whose value in p is below 0.001; these are
# circled in red on top of the scatter plot.
outliers = np.nonzero(p < 0.001)[0]
plt.scatter(
    X_test[outliers, 0],
    X_test[outliers, 1],
    marker="o",
    facecolor="none",
    edgecolor="r",
    s=70,
);
@agastidukare
agastidukare / mixture_data.py
Created January 3, 2020 18:51
This code will generate data with 5 clusters.
# make_blobs is imported from sklearn.datasets directly: the
# sklearn.datasets.samples_generator submodule was deprecated in
# scikit-learn 0.22 and removed in 0.24.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

# 400 points in 5 tight blobs (cluster_std=0.60).
X, y_true = make_blobs(n_samples=400, centers=5, cluster_std=0.60, random_state=1)
# 50 extra points drawn with a much larger spread (cluster_std=5).
X_append, y_true_append = make_blobs(n_samples=50, centers=5, cluster_std=5, random_state=1)
X = np.vstack([X, X_append])

# Replace the blob labels with a binary label: 0 for the tight points,
# 1 for the widely spread points appended after them.
y_true = np.hstack([
    np.zeros(len(y_true), dtype=int),
    np.ones(len(y_true_append), dtype=int),
])

X = X[:, ::-1]  # flip axes for better plotting
X_train, X_test, y_train, y_test = train_test_split(
    X, y_true, test_size=0.33, random_state=1, shuffle=True
)
plt.scatter(X_train[:, 0], X_train[:, 1], marker="x");
@agastidukare
agastidukare / mark_outliers.py
Created January 1, 2020 20:06
This code will circle the outliers depending upon threshold value.
# All points of X, coloured by their value in p.
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], marker="x", c=p, cmap='viridis')

# Circle the anomalies: the points whose value in p is below the threshold
# epsilon (defined outside this snippet).
outliers = np.nonzero(p < epsilon)[0]
plt.scatter(
    X[outliers, 0],
    X[outliers, 1],
    marker="o",
    facecolor="none",
    edgecolor="r",
    s=70,
);
@agastidukare
agastidukare / plot_prob.py
Created January 1, 2020 19:30
This code plots the distribution with probability.
# Scatter plot of X coloured by the values in p, with a colour bar as legend.
plt.figure(figsize=(8, 6))
points = plt.scatter(X[:, 0], X[:, 1], marker="x", c=p, cmap='viridis')
# The scatter just drawn is the current mappable, so passing it explicitly
# selects the same artist that a bare plt.colorbar() call would use.
plt.colorbar(points);