Skip to content

Instantly share code, notes, and snippets.

@dimo414
Created June 26, 2012 04:54
Show Gist options
  • Save dimo414/2993381 to your computer and use it in GitHub Desktop.
Save dimo414/2993381 to your computer and use it in GitHub Desktop.
Fast File Concatenation in Python
'''
Tests different methods of concatenating files in Python.
'''
from __future__ import print_function
import json,os,shutil,subprocess
import util
def verify(file, expected):
    '''Parses every JSON object in file and warns when the count differs
    from expected (i.e. the concatenation dropped or corrupted objects).'''
    seen = 0
    with open(file) as handle:
        for obj in util.yield_json(handle):
            json.loads(obj)
            seen += 1
    if seen != expected:
        print("Found %d objects, but expected %d! "
              "File didn't write correctly." % (seen, expected))
@util.timed
def cat_files_py(jdir, file, op):
    '''Concatenates every file under jdir into file with pure Python.

    op picks the strategy: 0 -> shutil.copyfileobj, 1 -> readlines,
    any other truthy value -> whole-file read.
    '''
    if op == 1:
        label = "Readlines"
    elif op:
        label = "Read"
    else:
        label = "ShUtil"
    print("Using %s, " % label, end='')
    with open(file, 'wb') as out:
        for name in os.listdir(jdir):
            with open(os.path.join(jdir, name), 'rb') as src:
                if op == 1:
                    out.writelines(src.readlines())
                elif op == 2:
                    out.write(src.read())
                else:
                    shutil.copyfileobj(src, out)
@util.timed
def cat_files_sys(jdir, file, secondary):
    '''Concatenates every file under jdir into file via a shell command.

    On Windows uses ``copy`` (secondary=True) or ``type``; elsewhere uses
    ``cat`` (secondary=True) or ``find | xargs cat``.  Returns the exit
    status from subprocess.call.
    '''
    if os.name == 'nt':
        if secondary:
            print("Using Copy, ",end='')
            cmd = "copy \"%s\\*\" \"%s\" 1> nul 2>&1"
        else:
            print("Using Type, ",end='')
            cmd = "type \"%s\\*\" > \"%s\" 2> nul"
    else:
        if secondary:
            print("Using Cat, ",end='')
            cmd = "cat \"%s\"/* > \"%s\""
        else:
            print("Using Xargs, ",end='')
            # Fix: quote the output path like every other command does, so
            # a destination containing spaces/shell metacharacters works.
            cmd = "find \"%s\" -type f -print0 | xargs -0 cat > \"%s\""
    return subprocess.call(cmd % (jdir, file), shell=True)
if __name__ == '__main__':
    testdir = "/tmp/json"
    cachefile = "delme.txt"
    count = len(os.listdir(testdir))
    # Pure-Python strategies: 0 = ShUtil, 1 = Readlines, 2 = Read.
    for op in range(3):
        util.clear_cache()
        cat_files_py(testdir, cachefile, op)
        verify(cachefile, count)
    # Shell-based strategies: True = copy/cat, False = type/xargs.
    for secondary in (True, False):
        util.clear_cache()
        cat_files_sys(testdir, cachefile, secondary)
        verify(cachefile, count)
'''
Tests speed difference between seeking over many thousands of JSON files and reading one directly
'''
import hashlib,json,os,random,string,time
import util
def create_json_files(jdir, num=50000, static="Static_String"):
    '''Writes num small JSON files into jdir, creating the directory first.

    Each file is named after the SHA-1 hex digest of static plus its index
    and holds a dict with that id, two fixed argument strings, the current
    timestamp, and ten random 100-character lowercase strings.
    '''
    try:
        os.makedirs(jdir)
    except OSError:
        # Directory already exists; was a bare except that hid every error.
        pass
    for crypt in (hashlib.sha1((static + str(i)).encode('utf-8')).hexdigest()
                  for i in range(num)):
        d = dict(id=crypt, arg="Argument", arg2="AnotherArgument",
                 time=time.time(),
                 text=[''.join(random.choice(string.ascii_lowercase)
                               for _ in range(100))
                       for _ in range(10)])
        # 'out' instead of 'file' to avoid shadowing the builtin.
        with open(os.path.join(jdir, crypt + ".txt"), 'w') as out:
            json.dump(d, out, indent=1, sort_keys=True)
@util.timed
def read_files(jdir):
    '''Parses every JSON file under jdir one at a time (many-files case).'''
    for name in os.listdir(jdir):
        path = os.path.join(jdir, name)
        with open(path) as handle:
            json.load(handle)
@util.timed
def concatenate(jdir, file):
    '''Concatenates every file under jdir into file via readlines/writelines.'''
    with open(file, 'wb') as out:
        for name in os.listdir(jdir):
            src_path = os.path.join(jdir, name)
            with open(src_path, 'rb') as src:
                out.writelines(src.readlines())
@util.timed
def read_file(file, expected):
    '''Parses the concatenated file and warns if the object count is off.'''
    total = 0
    with open(file) as handle:
        for obj in util.yield_json(handle):
            json.loads(obj)
            total += 1
    if total != expected:
        print("Found %d objects, but expected %d! "
              "File didn't write correctly." % (total, expected))
if __name__ == '__main__':
    testdir = "/tmp/json"
    cachefile = "delme.txt"
    # Fixture creation only needs to happen once; re-enable when needed.
    #util.clear_cache()
    #create_json_files(testdir)
    util.clear_cache()
    read_files(testdir)
    util.clear_cache()
    concatenate(testdir, cachefile)
    util.clear_cache()
    read_file(cachefile, len(os.listdir(testdir)))
'''
Utilities for testing file concatenation in Python.
'''
import subprocess,time
def timed(f):
    '''Decorator: prints how long each call to f took, then returns its result.'''
    from functools import wraps  # local import keeps this module's deps unchanged

    @wraps(f)  # preserve f's __name__/__doc__ on the wrapper
    def func(*args, **kwargs):  # also forward kwargs; original dropped them
        start = time.time()
        ret = f(*args, **kwargs)
        took = time.time() - start
        print("%s took %f" % (f.__name__, took))
        return ret
    return func
def yield_json(lines):
    '''Iterates over a file yielding JSON objects.  Expects the files to be
    indented, such that each root object's closing '}' sits at the first
    index of its line.
    '''
    buffered = []
    for line in lines:
        if line.startswith('}'):  # root object closes here
            buffered.append('}')
            complete = ''.join(buffered)
            # Whatever follows the brace starts the next object's buffer.
            buffered = [line[1:]]
            yield complete
        else:
            buffered.append(line)
# Set True to drop the OS disk cache between timing runs (Linux, root only).
use_clean_cache = False


def clear_cache():
    '''Attempts to clear disk caches on Linux - must be run as root.'''
    if not use_clean_cache:
        return
    subprocess.call("sync; echo 3 > /proc/sys/vm/drop_caches", shell=True)
@dimo414
Copy link
Author

dimo414 commented Jun 27, 2012

Timing methods of concatenating 50,000 JSON files
Windows 7 - i7 3.4GHz 16GB | Ubuntu 11.10 - i7 3.4GHz 16GB
HD - cmd | HD - Cygwin | SSD - cmd | SSD - Cygwin | HD | SSD | HD - cold cache | SSD - cold cache
Load All Files10.85114.87685116.69617.1679821.8512401.602277339.54595815.793162
Concatenate (Readlines)4.9808.2964744.8667.7494430.6186100.586936338.83064614.245564
Load Concat'ed File0.5383.9462250.9857.1344080.8290640.8325720.8544990.848297
Python: ShUtil4.7958.5524904.3047.9184530.5989180.608400338.37906614.215956
Python: Readlines5.0118.3814794.9007.9594560.5979120.605008339.99561014.182779
Python: Read8.92710.07057615.0039.3805370.8022200.761661339.97446114.460762
Shell: Copy / Cat5.6796.7753887.0105.899337FAILEDFAILEDFAILEDFAILED
Shell: Type / Xargs3.4488.5274887.4116.2793590.3430880.401653332.40032012.669442

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment