-
-
Save jimparis/3901942 to your computer and use it in GitHub Desktop.
Using FALLOC_FL_PUNCH_HOLE from Python to punch holes in files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import ctypes
import ctypes.util
import os
# C type used for the ``offset`` and ``len`` arguments of fallocate().
# NOTE(review): assumes a 64-bit off_t on the target platform -- confirm
# before using this on a 32-bit build.
c_off_t = ctypes.c_int64


def make_fallocate():
    """Build and return a Python wrapper around libc's ``fallocate``.

    The libc handle and the raw foreign function stay private to this
    factory; only the returned wrapper is meant to be used.
    """
    # use_errno=True makes ctypes save the C errno after each call so it
    # can be read back reliably with ctypes.get_errno().
    libc = ctypes.CDLL(ctypes.util.find_library('c'), use_errno=True)
    _fallocate = libc.fallocate
    _fallocate.restype = ctypes.c_int
    _fallocate.argtypes = [ctypes.c_int, ctypes.c_int, c_off_t, c_off_t]

    def fallocate(fd, mode, offset, len_):
        """Call ``fallocate(fd.fileno(), mode, offset, len_)``.

        Args:
            fd: an object exposing ``fileno()`` (e.g. an open file).
            mode: bitwise OR of ``FALLOC_FL_*`` flags.
            offset: start of the byte range.
            len_: length of the byte range.

        Raises:
            IOError: if the C call fails; ``errno`` on the exception is
                the errno reported by libc.
        """
        if _fallocate(fd.fileno(), mode, offset, len_) != 0:
            # The C function only returns -1 on failure; the actual
            # reason is in errno, so report that instead of the -1.
            err = ctypes.get_errno()
            raise IOError(err, 'fallocate: ' + os.strerror(err))

    return fallocate


fallocate = make_fallocate()
del make_fallocate
# Mode flags passed to fallocate(); punch() below ORs the two together.
FALLOC_FL_KEEP_SIZE = 0x01
FALLOC_FL_PUNCH_HOLE = 0x02
def punch(filename, verbose):
    """Punch a hole over every all-zero block of *filename*, in place.

    The file is read in 4096-byte chunks; each chunk consisting solely of
    NUL bytes is handed to fallocate() with
    ``FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE``.

    Args:
        filename: path of the file to modify.
        verbose: when true, print the file name and each hole punched.

    Raises:
        IOError: if the file cannot be opened or fallocate() fails.
    """
    blocksize = 4096
    # Built once and reused for every comparison instead of inspecting
    # each chunk one character at a time.
    null_block = b'\x00' * blocksize
    if verbose:
        print("processing %s" % filename)
    # Binary mode: the data is arbitrary bytes and must not be decoded.
    with open(filename, 'r+b') as f:
        offset = 0
        while True:
            buf = f.read(blocksize)
            if not buf:
                break
            # The final chunk may be shorter than blocksize, so compare
            # against a null block of matching length.
            if buf == null_block[:len(buf)]:
                if verbose:
                    print("punching hole at offset %d length %d"
                          % (offset, len(buf)))
                fallocate(f, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
                          offset, len(buf))
            offset += len(buf)
# Command-line entry point: punch holes in every file named on the
# command line, optionally reporting progress with -v/--verbose.
if __name__ == '__main__':
    import sys
    import argparse

    parser = argparse.ArgumentParser(
        description="Punch out the empty areas in a file, making it sparse")
    parser.add_argument('file', metavar='FILE',
                        help='file(s) to modify in-place', nargs='+')
    parser.add_argument('-v', '--verbose', action="store_true", default=False,
                        help='be verbose')
    args = parser.parse_args()
    # nargs='+' makes args.file a list holding at least one path.
    for filename in args.file:
        punch(filename, args.verbose)
A version with
buff == "\x00" * blocksize
might be even faster and does not require the re module.
@jkortus is mostly right - comparing against a null block does indeed seem to be the fastest method in my own tests. But what jk wrote allocates a null block every time it compares (not efficient). You need to create the null block once and keep comparing against it:
null_block = "\x00" * blocksize
...
...
if buff == null_block:
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Using a regex /^\s*$/ instead of looping through each character produced a 10-fold speed increase on my box (see my fork).