Skip to content

Instantly share code, notes, and snippets.

@gauravbansal98
Last active May 24, 2018 08:07
Show Gist options
  • Save gauravbansal98/1f1425fd392d897a03d823e961e4f9d2 to your computer and use it in GitHub Desktop.
Save gauravbansal98/1f1425fd392d897a03d823e961e4f9d2 to your computer and use it in GitHub Desktop.
def feature_vectors(input_file_name, lexicon, classification):
    """Build bag-of-words count vectors for the leading lines of a text file.

    Each processed line is lowercased, tokenized with ``word_tokenize`` and
    lemmatized with ``lemmatizer``; every resulting token that appears in
    ``lexicon`` increments the matching slot of that line's feature vector.

    Args:
        input_file_name: Path of the text file to read, one sample per line.
        lexicon: Sequence of known words; a word's position in the sequence
            is its position in the feature vector.
        classification: Label stored unchanged next to every feature vector.
            Per the original author's note, ``[0, 1]`` stands for positive
            and ``[1, 0]`` for negative (analogous to 1 / 0).

    Returns:
        A list of ``[features, classification]`` pairs, one per processed
        line, where ``features`` is a list of per-word occurrence counts with
        the same length as ``lexicon``.

    Note:
        Only the first ``hm_lines`` lines are processed; ``hm_lines``,
        ``word_tokenize`` and ``lemmatizer`` are module-level names that must
        be defined elsewhere in this file.
    """
    # Build the word -> position table once so each token costs a single
    # O(1) dict lookup instead of two linear scans of the lexicon
    # (`in` followed by `.index`). setdefault keeps the FIRST position of a
    # duplicated word, which is what list.index() would have returned.
    word_to_index = {}
    for position, entry in enumerate(lexicon):
        word_to_index.setdefault(entry, position)

    featureset = []
    with open(input_file_name, 'r') as f:
        contents = f.readlines()

    for line in contents[:hm_lines]:  # limit the number of lines processed
        tokens = word_tokenize(line.lower())
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # One slot per lexicon word; slots hold occurrence counts, not flags.
        features = np.zeros(len(lexicon))
        for token in tokens:
            # Lowercase again in case lemmatization changed the casing.
            index_value = word_to_index.get(token.lower())
            if index_value is not None:
                features[index_value] += 1

        # Each entry pairs the feature vector (as a plain list) with its label.
        featureset.append([list(features), classification])

    return featureset
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment