from nltk.stem.porter import PorterStemmer
# stemming of words: reduce each word to its root form
# ('words' is the list of cleaned tokens produced in the tokenization steps below)
porter = PorterStemmer()
stemmed = [porter.stem(word) for word in words]
print(stemmed[:100])
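Putting the steps on this page together, here is a minimal end-to-end sketch of the cleaning pipeline, assuming the sample sentence used throughout:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
text = 'Albert Einstein is one of the most brilliant scientists who’s ever lived.'
# split into words
tokens = word_tokenize(text)
# convert to lower case
tokens = [w.lower() for w in tokens]
# remove all tokens that are not alphabetic
words = [word for word in tokens if word.isalpha()]
# remove stopwords
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
# stem what is left
porter = PorterStemmer()
stemmed = [porter.stem(w) for w in words]
print(stemmed)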
from nltk.tokenize import word_tokenize
# clean our text
# split into words
tokens = word_tokenize(text)
# convert to lower case
tokens = [w.lower() for w in tokens]
# remove all tokens that are not alphabetic
words = [word for word in tokens if word.isalpha()]
# let's list all the stopwords for NLTK
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
print(stop_words)
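Listing the stopwords is only half the job; a minimal sketch of the filtering step, assuming 'words' is the lowercased, alphabetic token list from the cleaning step above:
stop_words = set(stopwords.words('english'))
# keep only tokens that are not stopwords
words = [w for w in words if w not in stop_words]
print(words[:100])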
from nltk.tokenize import word_tokenize
# split into words
tokens = word_tokenize(text)
# remove all tokens that are not alphabetic
words = [word for word in tokens if word.isalpha()]
print(words[:100])
Output:
['Albert', 'Einstein', 'is', 'widely', 'celebrated', 'as', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who', 's', 'ever', 'lived']
from nltk.tokenize import word_tokenize
# split into words
tokens = word_tokenize(text)
print(tokens[:100])
Output:
['Albert', 'Einstein', 'is', 'widely', 'celebrated', 'as', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who', '’', 's', 'ever', 'lived', '.']
import nltk
from nltk import sent_tokenize
nltk.download('punkt')
# split into sentences
sentences = sent_tokenize(text)
for sentence in sentences:
    print(sentence)
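As a quick check, a sketch with an assumed two-sentence sample; the pre-trained punkt model avoids splitting on common abbreviations such as 'Dr.':
text = 'Dr. Einstein published four landmark papers in 1905. Each changed physics.'
sentences = sent_tokenize(text)
for sentence in sentences:
    print(sentence)
# Dr. Einstein published four landmark papers in 1905.
# Each changed physics.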
import re
# split based on words only
words = re.split(r'\W+', text)
# convert to lower case
words = [word.lower() for word in words]
print(words[:100])
Output:
['albert', 'einstein', 'is', 'widely', 'celebrated', 'as', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who', 's', 'ever', 'lived', '']
import re
# split based on words only
words = re.split(r'\W+', text)
print(words[:100])
Output:
['Albert', 'Einstein', 'is', 'widely', 'celebrated', 'as', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who', 's', 'ever', 'lived', '']
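Note the trailing empty string in both outputs: \W+ matches the final period, so the split leaves an empty token at the end. A minimal fix is a truthiness filter:
# drop empty strings left behind by the split
words = [w for w in words if w]
print(words[:100])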
text = 'Albert Einstein is one of the most brilliant scientists who’s ever lived.'
# split into words by white space
words = text.split()
print(words[:100])
Output:
['Albert', 'Einstein', 'is', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who’s', 'ever', 'lived.']
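The plain whitespace split keeps punctuation attached to words ('lived.', 'who’s'). One common alternative to the regex and isalpha approaches above is to strip punctuation with a translation table; a minimal sketch:
import string
# build a table that deletes all ASCII punctuation characters
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in words]
print(stripped[:100])
# note: string.punctuation is ASCII-only, so the curly apostrophe in 'who’s' survives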
import numpy as np
from sklearn.cluster import KMeans
# three cluster centers for the synthetic data
center_1 = np.array([1, 1])
center_2 = np.array([5, 5])
center_3 = np.array([8, 1])
# Generate random data and center it to the three centers
cluster_1 = np.random.randn(100,2) + center_1
cluster_2 = np.random.randn(100,2) + center_2
cluster_3 = np.random.randn(100,2) + center_3
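The snippet imports KMeans but stops after generating the data; a minimal sketch of the fitting step, assuming three clusters and the default k-means++ initialization:
# stack the three synthetic clusters into one dataset
data = np.concatenate((cluster_1, cluster_2, cluster_3), axis=0)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
labels = kmeans.fit_predict(data)
# the recovered centers should land near (1,1), (5,5) and (8,1)
print(kmeans.cluster_centers_)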