Skip to content

Instantly share code, notes, and snippets.

@gauravbansal98
Last active May 24, 2018 09:13
Show Gist options
  • Save gauravbansal98/7e22e613b16e5a67a1f9857b3e5d794e to your computer and use it in GitHub Desktop.
Save gauravbansal98/7e22e613b16e5a67a1f9857b3e5d794e to your computer and use it in GitHub Desktop.
def create_lexicon(pos_file='pos.txt', neg_file='neg.txt',
                   min_count=50, max_count=1000):
    """Build a bag-of-words lexicon from positive and negative example files.

    Each file is read line by line; every line is lowercased, tokenized into
    words, and lemmatized so inflected forms collapse to a single root
    (e.g. "goes" and "going" both become "go"). Words are then filtered by
    frequency: only words whose total count lies strictly between
    ``min_count`` and ``max_count`` are kept, dropping both very common
    words (stop words) and very rare ones, which carry little signal.

    NOTE(review): relies on module-level ``word_tokenize`` and ``lemmatizer``
    — presumably ``nltk.tokenize.word_tokenize`` and an NLTK
    ``WordNetLemmatizer`` instance; confirm against the file's imports.

    Args:
        pos_file: path to the positive-example text file (default 'pos.txt').
        neg_file: path to the negative-example text file (default 'neg.txt').
        min_count: exclusive lower bound on word frequency (default 50).
        max_count: exclusive upper bound on word frequency (default 1000).

    Returns:
        list[str]: the lemmatized, frequency-filtered lexicon. Also prints
        its length as a side effect (kept from the original implementation).
    """
    def _tokenize_file(path):
        # Lowercase and tokenize every line of the file into a flat word list.
        words = []
        with open(path, 'r') as f:
            for line in f:
                words += word_tokenize(line.lower())
        return words

    # Collect raw tokens from both corpora, then reduce each word to its root.
    lexicon = _tokenize_file(pos_file) + _tokenize_file(neg_file)
    lexicon = [lemmatizer.lemmatize(word) for word in lexicon]

    # Count occurrences and keep only mid-frequency words.
    w_counts = Counter(lexicon)
    final_lexicon = [word for word in w_counts
                     if max_count > w_counts[word] > min_count]
    print(len(final_lexicon))
    return final_lexicon
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment