Skip to content

Instantly share code, notes, and snippets.

@sandeepkunkunuru
Created January 9, 2022 05:51
Show Gist options
  • Save sandeepkunkunuru/9c7c6ed7829196bf9cff887789a42f00 to your computer and use it in GitHub Desktop.
Save sandeepkunkunuru/9c7c6ed7829196bf9cff887789a42f00 to your computer and use it in GitHub Desktop.
Match words from one file to many files
import os
from collections import Counter

from nltk.corpus import stopwords
# Compare the vocabulary of one reference file against every regular file in
# a directory, appending each shared word and its per-file counts to
# outfile.csv as pipe-separated rows.

# NLTK's English stop-word list; these words are dropped from the reference
# file before matching.
sw = set(stopwords.words('english'))

# Placeholders: path of the reference file and of the directory to scan.
f1 = "<<>>"
directory = "<<>>"

# Words are lower-cased and split on whitespace only (punctuation is kept
# attached to the word it touches).
with open(f1, 'r') as file1:
    words1 = file1.read().lower().split()
words1 = [word for word in words1 if word not in sw]

# Tally the reference file once; calling list.count() for every shared word
# would rescan the whole list each time.
counts1 = Counter(words1)

# Only regular files directly inside `directory` are scanned (no recursion).
# NOTE(review): outfile.csv is a relative path, so if `directory` is the
# working directory, an outfile.csv left by an earlier run is scanned too.
files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]

# Append mode: rows from earlier runs are kept, and the header row is written
# again on every run.
with open('outfile.csv', 'a') as output:
    output.write('{}|{}|{}|{}.\n'.format("word", "count_file_1", " count_file_2", "file_2"))
    for filename in files:
        f = os.path.join(directory, filename)
        with open(f, 'r') as file2:
            words2 = file2.read().lower().split()
        counts2 = Counter(words2)
        # Words present in both files. Stop words cannot appear here because
        # they were already removed from the reference side.
        words = counts1.keys() & counts2.keys()
        # Sorted so the rows for each file come out in a reproducible order.
        for word in sorted(words):
            output.write('{}|{}|{}|{}.\n'.format(word, counts1[word], counts2[word], f))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment