Skip to content

Instantly share code, notes, and snippets.

@iliakonnov
Last active March 4, 2019 07:39
Show Gist options
  • Save iliakonnov/63cfa5b2d3abd387c8442819b7e8aba1 to your computer and use it in GitHub Desktop.
Save iliakonnov/63cfa5b2d3abd387c8442819b7e8aba1 to your computer and use it in GitHub Desktop.
Simple and fast Python file copier with resume support and a progress bar.
from sys import argv
from hashlib import md5
import time
import shutil
import os
import sqlite3
import progressbar # https://pypi.org/project/progressbar2/3.18.1/
def _mb_per_second(byteCount, seconds):
    """Return throughput in MiB/s, or +inf when no measurable time elapsed."""
    if seconds == 0:
        return float('inf')
    return (byteCount / 1024 / 1024) / seconds


def _scan_source(fromDir):
    """Walk fromDir and return every file path relative to it.

    Prints the running count and the latest path at most once every two
    seconds so long scans show signs of life.
    """
    files = []
    lastPrint = time.time()
    for dirPath, _dirNames, fileNames in os.walk(fromDir):
        for name in fileNames:
            relPath = os.path.relpath(os.path.join(dirPath, name), fromDir)
            files.append(relPath)
            if time.time() - lastPrint > 2:
                print(len(files), relPath)
                lastPrint = time.time()
    return files


def _compare_with_existing(fsrc, toPath, length):
    """Compare the open source file with the file already at toPath.

    Both files are read in chunks of `length` bytes and the comparison stops
    at the first differing chunk.

    Returns a tuple (match, srcHash, readSpeed) where srcHash is the md5 of
    the source bytes read so far (the whole file when match is True).
    """
    srcHash = md5()
    readedSize = 0
    match = True
    with open(toPath, 'rb') as fdst:
        with progressbar.ProgressBar(max_value=progressbar.UnknownLength) as bar:
            start = time.time()
            bar.update(readedSize)
            while True:
                buf = fsrc.read(length)
                buf2 = fdst.read(length)
                if not buf and not buf2:
                    break
                srcHash.update(buf)
                # Count real bytes, not the nominal chunk size.
                readedSize += len(buf)
                bar.update(readedSize)
                if buf != buf2:
                    match = False
                    break
            readTime = time.time() - start
    return match, srcHash, _mb_per_second(readedSize, readTime)


def _copy_and_hash(fsrc, toPath, length):
    """Copy the open source file to toPath from its start, hashing on the way.

    Returns a tuple (srcHash, copiedSize, writeSpeed).
    """
    srcHash = md5()
    copiedSize = 0
    with open(toPath, 'wb') as fdst:
        with progressbar.ProgressBar(max_value=progressbar.UnknownLength) as bar:
            start = time.time()
            bar.update(copiedSize)
            # The source may already have been partly read by a comparison.
            fsrc.seek(0)
            while True:
                buf = fsrc.read(length)
                if not buf:
                    break
                srcHash.update(buf)
                fdst.write(buf)
                copiedSize += len(buf)
                bar.update(copiedSize)
            copyTime = time.time() - start
    return srcHash, copiedSize, _mb_per_second(copiedSize, copyTime)


def _hash_written_file(toPath, length, expectedSize):
    """Re-read toPath and return (md5, hashSpeed) for the bytes on disk."""
    dstHash = md5()
    hashProgress = 0
    with open(toPath, 'rb') as fdst:
        with progressbar.ProgressBar(max_value=expectedSize) as bar:
            start = time.time()
            bar.update(hashProgress)
            while True:
                buf = fdst.read(length)
                if not buf:
                    break
                dstHash.update(buf)
                hashProgress += len(buf)
                bar.update(hashProgress)
            hashTime = time.time() - start
    return dstHash, _mb_per_second(hashProgress, hashTime)


def download(fromDir, toDir, copyId):
    """Copy every file under fromDir into toDir, resumably.

    Progress is tracked in the SQLite database ``copyId + '.db'``, table
    ``files(path, status, hash)``. Rows are inserted with status 'N', set to
    'C' (with the source md5) once a file is copied or found identical, and
    set to 'E' when processing a file raised an exception. On a later call
    with the same copyId only rows whose status is not 'C' are processed.

    Each file is copied in 4 MiB chunks, then read back and its md5 compared
    with the source md5. A destination that already exists is compared with
    the source first and left alone when identical.

    Args:
        fromDir: root directory to copy from.
        toDir: root directory to copy into; sub-directories are created.
        copyId: path prefix of the progress database file.

    Returns:
        True when no file raised an error during this call, False otherwise.
    """
    save_period = 1024 * 1024 * 100  # flush finished entries every ~100 MiB copied
    length = 4 * 1024 * 1024  # 4 MiB chunk
    result = True
    markCopied = "UPDATE files SET status='C', hash=? WHERE path=?"

    conn = sqlite3.connect(copyId + '.db')
    try:
        c = conn.cursor()
        # Look for the table, not the file: connecting already creates an
        # empty database file, so a run interrupted during the scan would
        # otherwise be mistaken for a resumable one and fail on the SELECT.
        initialized = c.execute(
            "SELECT 1 FROM sqlite_master WHERE type='table' AND name='files'"
        ).fetchone() is not None
        if initialized:
            files = [row[0] for row in c.execute("SELECT path FROM files WHERE status != 'C'")]
            copiedCount = c.execute("SELECT COUNT(*) FROM files WHERE status == 'C'").fetchone()[0]
        else:
            print('Searching for files...')
            files = _scan_source(fromDir)
            print('Filling database with {} files'.format(len(files)))
            # One explicit transaction so the table never becomes visible
            # without its complete file list.
            c.execute('BEGIN')
            c.execute('CREATE TABLE files (path TEXT UNIQUE, status CHAR(1), hash TEXT)')
            c.executemany("INSERT INTO files VALUES (?, 'N', NULL)", [(i,) for i in files])
            c.execute('CREATE UNIQUE INDEX path_idx ON files(path)')
            conn.commit()
            copiedCount = 0
        conn.execute('PRAGMA synchronous=OFF')
        # conn.execute('PRAGMA journal_mode=OFF') # Faster, but unsafe

        copied_bytes = 0
        copied_cache = []
        filesCount = len(files)
        totalFiles = filesCount + copiedCount
        try:
            for i in files:
                fromPath = os.path.join(fromDir, i)
                toPath = os.path.join(toDir, i)
                speed = -1
                try:
                    os.makedirs(os.path.split(toPath)[0], exist_ok=True)
                    exists = os.path.exists(toPath)
                    if exists:
                        print('Overwriting {fr} -> {to}...'.format(fr=fromPath, to=toPath))
                    else:
                        print('Copying {fr} -> {to}...'.format(fr=fromPath, to=toPath))
                    if os.path.isdir(fromPath):
                        # Skipped entries are neither marked copied nor failed.
                        print('Skipping dir!')
                        continue
                    copiedSize = 0
                    with open(fromPath, 'rb') as fsrc:
                        if exists:
                            match, orig, readSpeed = _compare_with_existing(fsrc, toPath, length)
                        else:
                            match = False
                            readSpeed = float('-inf')
                        if match:
                            print('Files matches!')
                            speed = float('-inf')
                        else:
                            orig, copiedSize, speed = _copy_and_hash(fsrc, toPath, length)
                    if match:
                        hashSpeed = float('-inf')
                    else:
                        # Verify what actually landed on disk.
                        copied, hashSpeed = _hash_written_file(toPath, length, copiedSize)
                        if orig.digest() != copied.digest():
                            raise Exception("Hash mismatch!")
                    shutil.copystat(fromPath, toPath)
                except Exception as e:
                    result = False
                    c.execute("UPDATE files SET status='E' WHERE path=?", (i,))
                    conn.commit()
                    print(e)
                    print('ERROR: {e}\n'.format(e=str(e)))
                else:
                    copied_bytes += copiedSize
                    copied_cache.append((orig.hexdigest(), i))
                    copiedCount += 1
                    filesCount -= 1
                    if copied_bytes > save_period:
                        copied_bytes = 0
                        c.executemany(markCopied, copied_cache)
                        conn.commit()
                        copied_cache = []
                    print('Done. {s:.3f} Mb/s write; {hs:.3f} Mb/s hash check; {rs:.3f} Mb/s read; ({c}/{t}: {f})\n'.format(
                        s=speed,
                        hs=hashSpeed,
                        rs=readSpeed,
                        c=copiedCount,
                        t=totalFiles,
                        f=filesCount
                    ))
        finally:
            # Persist finished files even when the run is interrupted, so a
            # resume does not redo them.
            if copied_cache:
                c.executemany(markCopied, copied_cache)
            conn.commit()
        print('All copied!')
    finally:
        conn.close()
    return result
if __name__ == "__main__":
    # Expect exactly three arguments after the script name.
    if len(argv) != 4:
        print('Use: download.py fromDir toDir copyId')
    else:
        source_dir, target_dir, job_id = argv[1:4]
        # download() returns a falsy result when a file failed; keep
        # re-running passes until one completes cleanly.
        while not download(source_dir, target_dir, job_id):
            pass
        print('No errors!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment