Fast File Concatenation in Python

cat.py
'''
Tests different methods of concatenating files in Python.
'''

from __future__ import print_function
import json, os, shutil, subprocess
import util

def verify(file, expected):
    '''Counts the JSON objects in the concatenated file and warns on a mismatch.'''
    count = 0
    with open(file) as f:
        for _ in (json.loads(i) for i in util.yield_json(f)):
            count += 1
    if count != expected:
        print("Found %d objects, but expected %d! "
              "File didn't write correctly." % (count, expected))

@util.timed
def cat_files_py(jdir, file, op):
    '''Concatenates every file in jdir using a pure-Python method.
    op: 0 -> shutil.copyfileobj, 1 -> readlines, 2 -> read.'''
    using = ("Readlines" if op == 1 else "Read") if op else "ShUtil"
    print("Using %s, " % using, end='')
    with open(file, 'wb') as out:
        for i in os.listdir(jdir):
            with open(os.path.join(jdir, i), 'rb') as f:
                if op == 1:
                    out.writelines(f.readlines())
                elif op == 2:
                    out.write(f.read())
                else:
                    shutil.copyfileobj(f, out)

@util.timed
def cat_files_sys(jdir, file, secondary):
    '''Concatenates via a shell command: copy/type on Windows, cat/xargs elsewhere.'''
    if os.name == 'nt':
        if secondary:
            print("Using Copy, ", end='')
            cmd = "copy \"%s\\*\" \"%s\" 1> nul 2>&1"
        else:
            print("Using Type, ", end='')
            cmd = "type \"%s\\*\" > \"%s\" 2> nul"
    else:
        if secondary:
            print("Using Cat, ", end='')
            # Glob expansion of 50,000 file names can exceed the shell's
            # argument-length limit; the results table shows this variant failing.
            cmd = "cat \"%s\"/* > \"%s\""
        else:
            print("Using Xargs, ", end='')
            # xargs batches its arguments, so it copes with any file count.
            cmd = "find \"%s\" -type f -print0 | xargs -0 cat > %s"
    return subprocess.call(cmd % (jdir, file), shell=True)

if __name__ == '__main__':
    testdir = "/tmp/json"
    cachefile = "delme.txt"
    count = len(os.listdir(testdir))
    # Python methods
    for i in range(3):
        util.clear_cache()
        cat_files_py(testdir, cachefile, i)
        verify(cachefile, count)
    # System calls
    for i in (True, False):
        util.clear_cache()
        cat_files_sys(testdir, cachefile, i)
        verify(cachefile, count)
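
Aside (not part of the gist): the practical difference between the three Python methods above is buffering. f.read() pulls each whole file into memory before writing, while shutil.copyfileobj streams through a fixed-size buffer, so memory use stays flat regardless of file size. A minimal standalone sketch, with hypothetical file names:

import shutil

# Append one file to another in fixed-size chunks; the default chunk size
# varies by Python version, and an explicit buffer size can be passed.
with open("part.json", "rb") as src, open("combined.json", "ab") as dst:
    shutil.copyfileobj(src, dst, 64 * 1024)  # 64 KB buffer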
seek.py
'''
Tests the speed difference between reading many thousands of individual
JSON files and reading one concatenated file directly.
'''

import hashlib, json, os, random, string, time
import util

def create_json_files(jdir, num=50000, static="Static_String"):
    '''Generates num small JSON files with pseudo-random content.'''
    try:
        os.makedirs(jdir)
    except OSError:
        pass  # directory already exists
    for crypt in (hashlib.sha1((static + str(i)).encode('utf-8')).hexdigest()
                  for i in range(num)):
        d = dict(id=crypt, arg="Argument", arg2="AnotherArgument", time=time.time(),
                 text=[''.join(random.choice(string.ascii_lowercase) for _ in range(100))
                       for _ in range(10)])
        with open(os.path.join(jdir, crypt + ".txt"), 'w') as file:
            json.dump(d, file, indent=1, sort_keys=True)

@util.timed
def read_files(jdir):
    '''Loads every JSON file in jdir individually.'''
    for f in os.listdir(jdir):
        with open(os.path.join(jdir, f)) as file:
            json.load(file)

@util.timed
def concatenate(jdir, file):
    '''Concatenates every file in jdir into a single file.'''
    with open(file, 'wb') as out:
        for i in os.listdir(jdir):
            with open(os.path.join(jdir, i), 'rb') as f:
                out.writelines(f.readlines())

@util.timed
def read_file(file, expected):
    '''Reads the concatenated file, parsing each JSON object in turn.'''
    count = 0
    with open(file) as f:
        for i in util.yield_json(f):
            json.loads(i)
            count += 1
    if count != expected:
        print("Found %d objects, but expected %d! "
              "File didn't write correctly." % (count, expected))

if __name__ == '__main__':
    testdir = "/tmp/json"
    cachefile = "delme.txt"
    #util.clear_cache()
    #create_json_files(testdir)  # No need to run this more than once
    util.clear_cache()
    read_files(testdir)
    util.clear_cache()
    concatenate(testdir, cachefile)
    util.clear_cache()
    read_file(cachefile, len(os.listdir(testdir)))
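
Note that create_json_files dumps each object with indent=1, which puts every root object's closing '}' in column 0. That is the boundary util.yield_json (below) relies on when splitting the concatenated file back into objects. A small illustration, not part of the gist:

import json

print(json.dumps({"id": "abc", "time": 0.0}, indent=1, sort_keys=True))
# {
#  "id": "abc",
#  "time": 0.0
# }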
util.py
'''
Utilities for testing file concatenation in Python.
'''

import subprocess, time

def timed(f):
    '''Decorator that prints how long the wrapped function took, in seconds.'''
    def func(*args):
        start = time.time()
        ret = f(*args)
        took = time.time() - start
        print("%s took %f" % (f.__name__, took))
        return ret
    return func

def yield_json(lines):
    '''Iterates over a file, yielding JSON objects. Expects the file
    to be indented such that each root object's closing '}' sits at
    the first index of its line.
    '''
    store = []
    for ln in lines:
        if ln and ln[0] == '}':  # End of object
            store.append('}')
            ret = store
            store = [ln[1:]]  # anything after the '}' begins the next object
            yield ''.join(ret)
        else:
            store.append(ln)

use_clean_cache = False
def clear_cache():
    '''Attempts to clear disk caches on Linux - must be run as root.'''
    if use_clean_cache:
        subprocess.call("sync; echo 3 > /proc/sys/vm/drop_caches", shell=True)
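
For reference, a quick self-contained way to exercise yield_json (not part of the gist), feeding it two indent=1 objects concatenated back-to-back exactly as cat.py and seek.py produce them:

import io, json
import util

# Two objects joined with no separator; the '}{' line marks the boundary.
buf = io.StringIO(json.dumps({"id": 1}, indent=1) + json.dumps({"id": 2}, indent=1))
for obj in util.yield_json(buf):
    print(json.loads(obj))  # {'id': 1} then {'id': 2}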
Timing methods of concatenating 50,000 JSON files (seconds)

                          Windows 7 - i7 3.4GHz 16GB                          Ubuntu 11.10 - i7 3.4GHz 16GB
                          HD - cmd   HD - Cygwin   SSD - cmd   SSD - Cygwin   HD         SSD        HD - cold cache   SSD - cold cache
Load All Files            10.851     14.876851     16.696      17.167982      1.851240   1.602277   339.545958        15.793162
Concatenate (Readlines)    4.980      8.296474      4.866       7.749443      0.618610   0.586936   338.830646        14.245564
Load Concat'ed File        0.538      3.946225      0.985       7.134408      0.829064   0.832572     0.854499         0.848297
Python: ShUtil             4.795      8.552490      4.304       7.918453      0.598918   0.608400   338.379066        14.215956
Python: Readlines          5.011      8.381479      4.900       7.959456      0.597912   0.605008   339.995610        14.182779
Python: Read               8.927     10.070576     15.003       9.380537      0.802220   0.761661   339.974461        14.460762
Shell: Copy / Cat          5.679      6.775388      7.010       5.899337      FAILED     FAILED     FAILED            FAILED
Shell: Type / Xargs        3.448      8.527488      7.411       6.279359      0.343088   0.401653   332.400320        12.669442
