AI HUB aihubprojects

## vis.py
# Load ./data.csv (path relative to the current working directory) into a
# pandas DataFrame. `pd` is not imported in this snippet and must already be
# in scope.
data = pd.read_csv("./data.csv")
# Select the first 10 rows of the dataset.
# NOTE(review): the result is neither assigned nor printed, so it is only
# displayed in an interactive/notebook session -- confirm that is the intent.
data.head(10)

## imp.py
#importing required libraries & dataset for the project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

## tf2.py
# A single new document to encode.
text = ["The quick quick quick fox jumped over a big dog"]

# Encode the document with `vectorizer`, which is not defined in this snippet.
# NOTE(review): presumably this is the vectorizer created (and fitted) in the
# TF-IDF / bag-of-words snippet -- confirm before running this on its own.
vector = vectorizer.transform(text)

# Summarize the encoded vector: its shape first, then its dense array form.
print(vector.shape)
print(vector.toarray())

OUTPUT:

## tf1.py
from sklearn.feature_extraction.text import TfidfVectorizer

# list of text documents
text = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog.",
    "The fox",
]
# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab
# fit() learns the vocabulary and IDF weights from `text`. Without this call
# the vectorizer stays unfitted and a later vectorizer.transform(...) raises.
vectorizer.fit(text)

## scBow.py
from sklearn.feature_extraction.text import CountVectorizer

# Corpus made of a single sentence.
text = ["The quick brown fox jumped over the lazy dog."]

# Create the bag-of-words transform and learn its vocabulary in one step.
# fit() returns the vectorizer itself, so the name is bound to the same
# fitted object as when construction and fitting are written separately.
vectorizer = CountVectorizer().fit(text)

## bow.py
#BOW ENCODING
one_hot_encoding = []
for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for word in data_corpus:
        count = split.count(word)
        if word in split:
            row_encoding.append(count)
        else:

## bin.py
#BINARY BOW
one_hot_encoding = []
for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for word in data_corpus:
        if word in split:
            row_encoding.append(1)
        else:
            row_encoding.append(0)

## ib1.py
res = len(max(document_corpus, key = len).split(" "))
index_based_encoding=[]
for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for i in range(res):
        if i <= len(split)-1:
            row_encoding.append(data_corpus.index(split[i])+1)
        else:
            row_encoding.append(0)

## ib.py
# Toy corpus: one short sentence per document.
document_corpus = [
    "this is good phone phone",
    "this is bad mobile mobile",
    "she is good good cat",
    "he has bad temper temper",
    "this mobile phone phone is not good good",
]

# Vocabulary: every distinct space-separated word in the corpus.
# It is stored as a sorted list rather than a set so that the word order (and
# therefore any encoding built from it) is the same on every run, and so that
# positional lookups such as data_corpus.index(word) are available.
data_corpus = sorted(
    {word for row in document_corpus for word in row.split(" ")}
)

## lancaster.py
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer

# Reduce every token in `words` (defined outside this snippet) to its
# Lancaster stem.
lancaster = LancasterStemmer()
stemmed = list(map(lancaster.stem, words))
# Show at most the first 100 stems.
print(stemmed[:100])

Output:
  ['albert', 'einstein', 'wid', 'celebr', 'on', 'bril', 'sci', 'ev', 'liv']
	# Load ./data.csv (path relative to the current working directory) into a
	# pandas DataFrame. `pd` must already be in scope when this line runs.
	data = pd.read_csv("./data.csv")
	# Select the first 10 rows of the dataset.
	# NOTE(review): the result is neither assigned nor printed, so it is only
	# displayed in an interactive/notebook session -- confirm that is the intent.
	data.head(10)
	#importing required libraries & dataset for the project
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn import datasets

	from sklearn import preprocessing
	from sklearn.preprocessing import StandardScaler
	from sklearn.preprocessing import MinMaxScaler
	# A single new document to encode.
	text = ["The quick quick quick fox jumped over a big dog"]

	# Encode the document with `vectorizer`, which is not created in this run
	# of statements.
	# NOTE(review): presumably it must already be fitted -- confirm.
	vector = vectorizer.transform(text)

	# Summarize the encoded vector: its shape first, then its dense array form.
	print(vector.shape)
	print(vector.toarray())

	OUTPUT:
	from sklearn.feature_extraction.text import TfidfVectorizer

	# list of text documents
	text = [
	    "The quick brown fox jumped over the lazy dog.",
	    "The dog.",
	    "The fox",
	]
	# create the transform
	vectorizer = TfidfVectorizer()

	# tokenize and build vocab
	# fit() learns the vocabulary and IDF weights from `text`. Without this call
	# the vectorizer stays unfitted and a later vectorizer.transform(...) raises.
	vectorizer.fit(text)
	#BOW ENCODING
	one_hot_encoding = []
	for row in document_corpus:
	row_encoding = []
	split = row.split(" ")
	for word in data_corpus:
	count = split.count(word)
	if word in split:
	row_encoding.append(count)
	else:
	#BINARY BOW
	one_hot_encoding = []
	for row in document_corpus:
	row_encoding = []
	split = row.split(" ")
	for word in data_corpus:
	if word in split:
	row_encoding.append(1)
	else:
	row_encoding.append(0)
	res = len(max(document_corpus, key = len).split(" "))
	index_based_encoding=[]
	for row in document_corpus:
	row_encoding = []
	split = row.split(" ")
	for i in range(res):
	if i <= len(split)-1:
	row_encoding.append(data_corpus.index(split[i])+1)
	else:
	row_encoding.append(0)
	# Toy corpus: one short sentence per document.
	document_corpus = [
	    "this is good phone phone",
	    "this is bad mobile mobile",
	    "she is good good cat",
	    "he has bad temper temper",
	    "this mobile phone phone is not good good",
	]

	# Vocabulary: every distinct space-separated word in the corpus.
	# It is stored as a sorted list rather than a set so that the word order is
	# the same on every run and positional lookups such as
	# data_corpus.index(word) are available.
	data_corpus = sorted(
	    {word for row in document_corpus for word in row.split(" ")}
	)
	from nltk.tokenize import word_tokenize
	from nltk.stem.lancaster import LancasterStemmer

	# Reduce every token in `words` (defined outside this run of statements)
	# to its Lancaster stem.
	lancaster = LancasterStemmer()
	stemmed = list(map(lancaster.stem, words))
	# Show at most the first 100 stems.
	print(stemmed[:100])

	Output:
	['albert', 'einstein', 'wid', 'celebr', 'on', 'bril', 'sci', 'ev', 'liv']