AI HUB aihubprojects

## porter.py
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer


# Reduce every cleaned word to its Porter stem.
porter = PorterStemmer()
stemmed = list(map(porter.stem, words))

# Preview at most the first 100 stems.
print(stemmed[:100])

Output:

## all.py
# Text clean-up: tokenize, lower-case, then keep only alphabetic tokens.

# Tokenize the raw text and normalise the case of every token.
tokens = list(map(str.lower, word_tokenize(text)))

# Drop every token that is not made up solely of alphabetic characters.
words = list(filter(str.isalpha, tokens))

## stopwords.py
# List the stop words NLTK provides for English.
import nltk
from nltk.corpus import stopwords

# Download the 'stopwords' resource so the corpus reader below can load it.
nltk.download('stopwords')

# Read the word list for 'english' and print it in full.
stop_words = stopwords.words('english')
print(stop_words)

## fil.py
# Tokenize the text with NLTK's word tokenizer.
tokens = word_tokenize(text)

# Keep only the purely alphabetic tokens, then preview up to 100 of them.
words = list(filter(str.isalpha, tokens))
print(words[:100])

Output:
  ['Albert', 'Einstein', 'is', 'widely', 'celebrated', 'as', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who', 's', 'ever', 'lived']

## nltks.py
from nltk.tokenize import word_tokenize

# Tokenize the text with NLTK's word tokenizer; per the sample output,
# punctuation marks come back as separate tokens.
tokens = word_tokenize(text)
# Preview at most the first 100 tokens.
print(tokens[:100])

Output:
  ['Albert', 'Einstein', 'is', 'widely', 'celebrated', 'as', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who', '’', 's', 'ever', 'lived', '.']

## token.py
import nltk
from nltk import sent_tokenize

# Download the 'punkt' resource before tokenizing into sentences.
nltk.download('punkt')

# Break the text into sentences and print each one on its own line.
sentences = sent_tokenize(text)
for sentence in sentences:
    print(sentence)

## norm.py
# Break the text apart on runs of non-word characters.
words = re.compile(r'\W+').split(text)

# Lower-case every piece, then preview at most the first 100.
words = list(map(str.lower, words))
print(words[:100])

Output:
  ['albert', 'einstein', 'is', 'widely', 'celebrated', 'as', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who', 's', 'ever', 'lived', '']

## sp2.py
import re

# Break the text apart on runs of non-word characters; as the sample
# output shows, a trailing '' remains when the text ends in punctuation.
non_word_run = re.compile(r'\W+')
words = non_word_run.split(text)
print(words[:100])

Output:
  ['Albert', 'Einstein', 'is', 'widely', 'celebrated', 'as', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who', 's', 'ever', 'lived', '']

## split1.py
# Sample sentence used to demonstrate naive whitespace tokenization.
text = 'Albert Einstein is  one of the most brilliant scientists who’s ever lived.'

# With no separator, split() breaks on any run of whitespace, so the
# double space in the sentence does not produce an empty token.
words = text.split(None)
print(words[:100])

Output:
  ['Albert', 'Einstein', 'is', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who’s', 'ever', 'lived.']

## scikit_kmeans.py
import numpy as np
from sklearn.cluster import KMeans

# Centers of the three synthetic clusters, one 2-element point each.
center_1 = np.array([1, 1])
center_2 = np.array([5, 5])
center_3 = np.array([8, 1])

# Generate random data and center it to the three centers:
# each cluster is a (100, 2) array of standard-normal samples shifted
# by its center.
cluster_1 = np.random.randn(100, 2) + center_1
cluster_2 = np.random.randn(100, 2) + center_2
cluster_3 = np.random.randn(100, 2) + center_3
	from nltk.tokenize import word_tokenize
	from nltk.stem.porter import PorterStemmer


	# Reduce every cleaned word to its Porter stem.
	porter = PorterStemmer()
	stemmed = list(map(porter.stem, words))

	# Preview at most the first 100 stems.
	print(stemmed[:100])

	Output:
	# Text clean-up: tokenize, lower-case, then keep only alphabetic tokens.

	# Tokenize the raw text and normalise the case of every token.
	tokens = list(map(str.lower, word_tokenize(text)))

	# Drop every token that is not made up solely of alphabetic characters.
	words = list(filter(str.isalpha, tokens))
	# List the stop words NLTK provides for English.
	import nltk
	from nltk.corpus import stopwords

	# Download the 'stopwords' resource so the corpus reader below can load it.
	nltk.download('stopwords')

	# Read the word list for 'english' and print it in full.
	stop_words = stopwords.words('english')
	print(stop_words)
	import nltk
	from nltk import sent_tokenize

	# Download the 'punkt' resource before tokenizing into sentences.
	nltk.download('punkt')

	# Split the text into sentences and print each one on its own line.
	sentences = sent_tokenize(text)
	for sentence in sentences:
		# The loop body must be indented one level deeper than the `for`.
		print(sentence)
	# Break the text apart on runs of non-word characters.
	words = re.compile(r'\W+').split(text)

	# Lower-case every piece, then preview at most the first 100.
	words = list(map(str.lower, words))
	print(words[:100])

	Output:
	['albert', 'einstein', 'is', 'widely', 'celebrated', 'as', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who', 's', 'ever', 'lived', '']
	import re

	# Break the text apart on runs of non-word characters; as the sample
	# output shows, a trailing '' remains when the text ends in punctuation.
	non_word_run = re.compile(r'\W+')
	words = non_word_run.split(text)
	print(words[:100])

	Output:
	['Albert', 'Einstein', 'is', 'widely', 'celebrated', 'as', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who', 's', 'ever', 'lived', '']
	# Sample sentence used to demonstrate naive whitespace tokenization.
	text = 'Albert Einstein is one of the most brilliant scientists who’s ever lived.'

	# With no separator, split() breaks on any run of whitespace.
	words = text.split(None)
	print(words[:100])

	Output:
	['Albert', 'Einstein', 'is', 'one', 'of', 'the', 'most', 'brilliant', 'scientists', 'who’s', 'ever', 'lived.']
	import numpy as np
	from sklearn.cluster import KMeans

	# Centers of the three synthetic clusters, one 2-element point each.
	center_1 = np.array([1, 1])
	center_2 = np.array([5, 5])
	center_3 = np.array([8, 1])

	# Generate random data and center it to the three centers:
	# each cluster is a (100, 2) array of standard-normal samples shifted
	# by its center.
	cluster_1 = np.random.randn(100, 2) + center_1
	cluster_2 = np.random.randn(100, 2) + center_2
	cluster_3 = np.random.randn(100, 2) + center_3