#importing required libraries & dataset for the project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

data = pd.read_csv("./data.csv") #importing the dataset using pandas
data.head(10) #shows the first 10 rows of the dataset
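The scaler imports above are never used in the gist itself; a minimal sketch of what they are presumably for, assuming `data` holds numeric feature columns (the column selection is illustrative, not from the original):

# scale numeric columns to zero mean / unit variance (StandardScaler)
# and to the [0, 1] range (MinMaxScaler); the column choice is hypothetical
numeric = data.select_dtypes(include=np.number)
print(StandardScaler().fit_transform(numeric)[:5])
print(MinMaxScaler().fit_transform(numeric)[:5])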
text = ["The quick quick quick fox jumped over a big dog"]
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(vector.toarray())
OUTPUT:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.",
"The dog.",
"The fox"]
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
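Both fitted vectorizers expose what they learned; a quick inspection using standard scikit-learn attributes:

# inspect the learned vocabulary of the fitted CountVectorizer
print(vectorizer.vocabulary_)   # token -> column index mapping
# a fitted TfidfVectorizer additionally exposes the per-term IDF weights
# via its .idf_ attribute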
#BOW ENCODING
# uses document_corpus / data_corpus from the corpus snippet further below
bow_encoding = []
for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for word in data_corpus:
        # store how often the word occurs in this document (0 if absent)
        row_encoding.append(split.count(word))
    bow_encoding.append(row_encoding)
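For comparison, scikit-learn's CountVectorizer produces the same counts; a sketch (its columns are ordered by its own alphabetical vocabulary rather than by first occurrence, and get_feature_names_out needs scikit-learn >= 1.0):

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# same count-per-word idea as the manual loop above
print(cv.fit_transform(document_corpus).toarray())
print(cv.get_feature_names_out())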
#BINARY BOW
binary_bow_encoding = []
for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for word in data_corpus:
        # 1 if the word appears in this document, 0 otherwise
        if word in split:
            row_encoding.append(1)
        else:
            row_encoding.append(0)
    binary_bow_encoding.append(row_encoding)
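The same 0/1 encoding is available in scikit-learn by passing binary=True, which caps every count at 1; a minimal sketch:

from sklearn.feature_extraction.text import CountVectorizer
binary_cv = CountVectorizer(binary=True)
print(binary_cv.fit_transform(document_corpus).toarray())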
#INDEX BASED ENCODING
# pad every document to the length (in words) of the longest document
res = max(len(row.split(" ")) for row in document_corpus)
index_based_encoding = []
for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for i in range(res):
        if i <= len(split) - 1:
            # 1-based position of the word in the vocabulary list
            row_encoding.append(data_corpus.index(split[i]) + 1)
        else:
            # pad shorter documents with 0
            row_encoding.append(0)
    index_based_encoding.append(row_encoding)
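In practice this padded index encoding is usually produced with Keras; a hedged sketch of the equivalent calls (Keras assigns indices by word frequency rather than first occurrence, so the numbers will differ from the manual loop):

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tok = Tokenizer()
tok.fit_on_texts(document_corpus)            # build word -> index mapping
seqs = tok.texts_to_sequences(document_corpus)
print(pad_sequences(seqs, padding="post"))   # zero-pad to the longest sequence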
#CORPUS
document_corpus = ["this is good phone phone",
                   "this is bad mobile mobile",
                   "she is good good cat",
                   "he has bad temper temper",
                   "this mobile phone phone is not good good"]
# build the vocabulary as an ordered list (a set has no stable order and
# no .index(), which the index-based encoding above relies on)
data_corpus = []
for row in document_corpus:
    for word in row.split(" "):
        if word not in data_corpus:
            data_corpus.append(word)
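The snippets above are listed out of execution order: run this corpus block first, then the three encoding loops, after which the results can be checked with:

print("vocabulary: ", data_corpus)
print("count BOW:  ", bow_encoding)
print("binary BOW: ", binary_bow_encoding)
print("index-based:", index_based_encoding)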
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
# `words` is assumed to be a token list produced earlier, e.g.
# words = word_tokenize(raw_text)  # raw_text: the source string (hypothetical name)
# stemming of words
lancaster = LancasterStemmer()
stemmed = [lancaster.stem(word) for word in words]
print(stemmed[:100])
Output:
['albert', 'einstein', 'wid', 'celebr', 'on', 'bril', 'sci', 'ev', 'liv']
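Lancaster is the most aggressive of NLTK's stemmers, which is why stems like 'sci' and 'bril' come out so short; swapping in the Porter stemmer (same assumed token list) gives milder stems:

from nltk.stem.porter import PorterStemmer
# Porter is less aggressive than Lancaster; compare on the same tokens
porter = PorterStemmer()
print([porter.stem(word) for word in words][:100])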