Skip to content

Instantly share code, notes, and snippets.

@MartinPaulEve
Last active August 29, 2015 14:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save MartinPaulEve/c0610fa89da4df4d546a to your computer and use it in GitHub Desktop.
Save MartinPaulEve/c0610fa89da4df4d546a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import re
def _load_words(path):
    """Return the lower-cased search words listed one per line in *path*.

    Blank lines are skipped: an empty word would compile to ``\\b\\b``,
    which matches any line containing a word character and would therefore
    copy almost every input line to the output.
    """
    # use a "with" block to automatically close the I/O stream
    with open(path) as word_list:
        stripped = (raw.strip().lower() for raw in word_list)
        return [word for word in stripped if word]


def _compile_patterns(words):
    """Return ``(word, compiled_pattern)`` pairs in the order of *words*.

    Each pattern matches the word literally between word boundaries. The
    word is passed through ``re.escape`` so that regex metacharacters in the
    word list (``.``, ``+``, ``(`` ...) are matched as plain text instead of
    being interpreted, or raising ``re.error`` when the pattern is compiled.
    """
    return [
        (word, re.compile(r'\b{0}\b'.format(re.escape(word))))
        for word in words
    ]


def _matching_lines(lines, patterns):
    """Return every line of *lines* that contains at least one listed word.

    Matching is case-insensitive: the patterns are built from lower-cased
    words and are searched against a lower-cased copy of each line. Each
    returned line is newline-terminated, and a debug message naming the
    first word that matched is printed for it.
    """
    matched = []
    for line in lines:
        # lower-case once per line rather than once per (line, word) pair
        lowered = line.lower()
        for word, pattern in patterns:
            if pattern.search(lowered):
                # the last line of a file may lack its trailing newline
                if line.endswith('\n'):
                    matched.append(line)
                else:
                    matched.append('{0}\n'.format(line))
                # write some debug output to the console
                print('Found line {0} that matched word {1}'.format(line, word))
                # one matching word is enough; move on to the next line
                break
    return matched


def main(word_list_path='mylist.txt', tsv_path='stuff.tsv',
         output_path='output.tsv'):
    """Copy the lines of a TSV file that mention any listed word.

    Args:
        word_list_path: text file holding one search word per line.
        tsv_path: file whose lines are scanned for the words.
        output_path: file that receives the matching lines; it is created
            or overwritten.
    """
    patterns = _compile_patterns(_load_words(word_list_path))
    with open(tsv_path) as tsv:
        output = _matching_lines(tsv, patterns)
    # the output file is only opened once all of the input has been scanned
    with open(output_path, 'w') as output_file:
        output_file.writelines(output)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment