Skip to content

Instantly share code, notes, and snippets.

@chapmanjacobd
Created January 26, 2023 20:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chapmanjacobd/8ec346a7a9f4e78547e7f06043fce9bb to your computer and use it in GitHub Desktop.
Save chapmanjacobd/8ec346a7a9f4e78547e7f06043fce9bb to your computer and use it in GitHub Desktop.
A single writelines() call is faster than calling write() once per line, as long as your data can fit in RAM
def filter_file(path, sieve):
    """Rewrite the file at *path* in place, dropping every line found in *sieve*.

    The whole file is read into memory, the surviving lines are written to a
    temporary file with a single ``writelines()`` call, and the temporary file
    then replaces the original via ``os.replace``.

    Args:
        path: Path of the text file to filter.
        sieve: Container tested with ``in``; a line is dropped when the line
            with trailing whitespace removed (``rstrip()``) is in it.

    Raises:
        OSError: If the file cannot be read, written, or replaced. The
            temporary file is removed before the error propagates.
    """
    with open(path, 'r') as fr:
        lines = fr.readlines()

    # The temp file must live beside the target: os.replace() fails across
    # filesystems, and the default temp directory is often a different mount.
    target_dir = os.path.dirname(os.path.abspath(path))
    temp = tempfile.NamedTemporaryFile(mode='w', dir=target_dir, delete=False)
    try:
        with temp:
            temp.writelines(line for line in lines if line.rstrip() not in sieve)
            # Push the data to disk before the rename makes it visible.
            temp.flush()
            os.fsync(temp.fileno())
        # Temp files are created with mode 0600; carry over the original
        # permission bits so filtering does not silently change them.
        os.chmod(temp.name, os.stat(path).st_mode & 0o7777)
        os.replace(temp.name, path)
    except BaseException:
        # Do not leave a stray temp file next to the target on failure.
        try:
            os.unlink(temp.name)
        except OSError:
            pass
        raise
@>>> timeit.timeit("filter_file('/tmp/t', ['abcnewsvideo 9758031'])", number=100, setup="from __main__ import filter_file")
20.872984636982437
@>>> timeit.timeit("filter_file('/tmp/t', ['abcnewsvideo 9758031'])", number=100, setup="from __main__ import filter_file")
20.709978723025415
def filter_file(path, sieve):
    """Rewrite the file at *path* in place, dropping every line found in *sieve*.

    The whole file is read into memory, the surviving lines are written to a
    temporary file one ``write()`` call at a time, and the temporary file then
    replaces the original via ``os.replace``.

    Args:
        path: Path of the text file to filter.
        sieve: Container tested with ``in``; a line is dropped when the line
            with trailing whitespace removed (``rstrip()``) is in it.

    Raises:
        OSError: If the file cannot be read, written, or replaced. The
            temporary file is removed before the error propagates.
    """
    with open(path, 'r') as fr:
        lines = fr.readlines()

    # The temp file must live beside the target: os.replace() fails across
    # filesystems, and the default temp directory is often a different mount.
    target_dir = os.path.dirname(os.path.abspath(path))
    temp = tempfile.NamedTemporaryFile(mode='w', dir=target_dir, delete=False)
    try:
        with temp:
            for line in lines:
                if line.rstrip() in sieve:
                    continue
                temp.write(line)
            # Push the data to disk before the rename makes it visible.
            temp.flush()
            os.fsync(temp.fileno())
        # Temp files are created with mode 0600; carry over the original
        # permission bits so filtering does not silently change them.
        os.chmod(temp.name, os.stat(path).st_mode & 0o7777)
        os.replace(temp.name, path)
    except BaseException:
        # Do not leave a stray temp file next to the target on failure.
        try:
            os.unlink(temp.name)
        except OSError:
            pass
        raise
@>>> timeit.timeit("filter_file('/tmp/t', ['abcnewsvideo 9758031'])", number=100, setup="from __main__ import filter_file")
29.54833430200233
@>>> timeit.timeit("filter_file('/tmp/t', ['abcnewsvideo 9758031'])", number=100, setup="from __main__ import filter_file")
29.225350995984627
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment