iliakonnov/copy.py

## copy.py
from sys import argv
from hashlib import md5
import time
import shutil
import os
import sqlite3
import progressbar  # https://pypi.org/project/progressbar2/3.18.1/


def _mb_per_second(byteCount, seconds):
    """Return the throughput in MiB/s, or +inf when no time elapsed."""
    if seconds == 0:
        return float('inf')
    return (byteCount / 1024 / 1024) / seconds


def _list_pending_files(conn, fromDir):
    """Return ``(pending relative paths, already copied count)`` for the job.

    If the ``files`` table does not exist yet, ``fromDir`` is scanned and the
    table is created and filled in one transaction.
    """
    c = conn.cursor()
    # Look for the table instead of the database file: a stale or empty
    # file left by an interrupted run must trigger a fresh scan, not a
    # "no such table" crash.
    c.execute("SELECT 1 FROM sqlite_master WHERE type='table' AND name='files'")
    if c.fetchone() is not None:
        files = [row[0] for row in c.execute("SELECT path FROM files WHERE status != 'C'")]
        copiedCount = c.execute("SELECT COUNT(*) FROM files WHERE status == 'C'").fetchone()[0]
        return files, copiedCount

    print('Searching for files...')
    files = []
    lastPrint = time.time()
    for dirPath, _dirNames, fileNames in os.walk(fromDir):
        relDir = os.path.relpath(dirPath, fromDir)
        for name in fileNames:
            relPath = name if relDir == os.curdir else os.path.join(relDir, name)
            files.append(relPath)
            # Report scan progress at most once every two seconds.
            if time.time() - lastPrint > 2:
                print(len(files), relPath)
                lastPrint = time.time()

    print('Filling database with {} files'.format(len(files)))
    # Explicit transaction: the table only becomes visible together with the
    # complete file list. If anything below fails, the caller closes the
    # connection without committing and the next run simply rescans.
    c.execute('BEGIN')
    c.execute('CREATE TABLE files (path TEXT UNIQUE, status CHAR(1), hash TEXT)')
    c.executemany("INSERT INTO files VALUES (?, 'N', NULL)", [(path,) for path in files])
    c.execute('CREATE UNIQUE INDEX path_idx ON files(path)')
    conn.commit()
    return files, 0


def _copy_and_verify(fromPath, toPath, exists, length):
    """Copy ``fromPath`` to ``toPath`` unless they already match, then verify.

    Args:
        fromPath: source file path.
        toPath: destination file path.
        exists: whether ``toPath`` existed before this call; if so it is
            first compared with the source and left alone when identical.
        length: chunk size in bytes for every read.

    Returns:
        ``(source MD5 hex digest, bytes written, write MiB/s,
        hash-check MiB/s, read MiB/s)``. A speed of ``-inf`` marks a phase
        that was skipped.

    Raises:
        Exception: "Hash mismatch!" when the written file does not hash to
            the same MD5 as the data read from the source.
    """
    match = exists
    copiedSize = 0
    speed = hashSpeed = readSpeed = float('-inf')
    orig = md5()
    with open(fromPath, 'rb') as fsrc:
        if exists:
            # Compare chunk by chunk, stopping at the first difference.
            readedSize = 0
            with open(toPath, 'rb') as fdst:
                with progressbar.ProgressBar(max_value=progressbar.UnknownLength) as bar:
                    start = time.time()
                    bar.update(readedSize)
                    while True:
                        buf = fsrc.read(length)
                        buf2 = fdst.read(length)
                        readedSize += len(buf)
                        bar.update(readedSize)
                        if buf != buf2:
                            match = False
                            break
                        if not buf:
                            break
                        orig.update(buf)
                    # Computed after the loop so that two empty files (which
                    # leave the loop on the first pass) still get a speed.
                    readSpeed = _mb_per_second(readedSize, time.time() - start)

        if match:
            print('Files matches!')
        else:
            # (Re)write the destination from the start of the source.
            orig = md5()
            fsrc.seek(0)
            with open(toPath, 'wb') as fdst:
                with progressbar.ProgressBar(max_value=progressbar.UnknownLength) as bar:
                    start = time.time()
                    bar.update(copiedSize)
                    while True:
                        buf = fsrc.read(length)
                        if not buf:
                            break
                        orig.update(buf)
                        fdst.write(buf)
                        copiedSize += len(buf)
                        bar.update(copiedSize)
                    speed = _mb_per_second(copiedSize, time.time() - start)

    if not match:
        # Read the written file back and check it against the source hash.
        copied = md5()
        hashProgress = 0
        with open(toPath, 'rb') as fdst:
            with progressbar.ProgressBar(max_value=copiedSize) as bar:
                start = time.time()
                bar.update(hashProgress)
                while True:
                    buf = fdst.read(length)
                    if not buf:
                        break
                    copied.update(buf)
                    hashProgress += len(buf)
                    # Clamp so an oversized read-back is reported as a hash
                    # mismatch below, not as a progress-bar range error.
                    bar.update(min(hashProgress, copiedSize))
                hashSpeed = _mb_per_second(hashProgress, time.time() - start)
        if orig.digest() != copied.digest():
            raise Exception("Hash mismatch!")

    return orig.hexdigest(), copiedSize, speed, hashSpeed, readSpeed


def download(fromDir, toDir, copyId):
    """Copy every file under ``fromDir`` into ``toDir`` with MD5 verification.

    Job state is kept in the SQLite database ``copyId + '.db'``, in a table
    ``files(path, status, hash)``. Status is ``'N'`` for not copied yet,
    ``'C'`` for copied and verified, and ``'E'`` for failed on the last
    attempt. Calling the function again with the same ``copyId`` processes
    only the rows whose status is not ``'C'``.

    Args:
        fromDir: directory to copy from.
        toDir: directory to copy into; sub-directories are created as needed.
        copyId: job identifier, used as the database path without ``.db``.

    Returns:
        True if no file failed during this call, False otherwise.
    """
    save_period = 1024 * 1024 * 100  # checkpoint statuses every 100 MiB written
    length = 4 * 1024 * 1024  # 4 MiB chunk
    result = True

    conn = sqlite3.connect(copyId + '.db')
    try:
        files, copiedCount = _list_pending_files(conn, fromDir)
        conn.execute('PRAGMA synchronous=OFF')
        # PRAGMA journal_mode=OFF would be faster still, but is unsafe.
        c = conn.cursor()

        copied_bytes = 0
        copied_cache = []
        filesCount = len(files)
        totalFiles = filesCount + copiedCount
        try:
            for i in files:
                fromPath = os.path.join(fromDir, i)
                toPath = os.path.join(toDir, i)
                try:
                    destDir = os.path.dirname(toPath)
                    if destDir:  # makedirs('') would raise
                        os.makedirs(destDir, exist_ok=True)

                    exists = os.path.exists(toPath)
                    if exists:
                        print('Overwriting {fr} -> {to}...'.format(fr=fromPath, to=toPath))
                    else:
                        print('Copying {fr} -> {to}...'.format(fr=fromPath, to=toPath))

                    if os.path.isdir(fromPath):
                        print('Skipping dir!')
                        continue

                    digest, copiedSize, speed, hashSpeed, readSpeed = _copy_and_verify(
                        fromPath, toPath, exists, length)
                    shutil.copystat(fromPath, toPath)
                except Exception as e:
                    # Record the failure and carry on with the next file.
                    result = False
                    c.execute("UPDATE files SET status='E' WHERE path=?", (i,))
                    conn.commit()
                    print(e)
                    print('ERROR: {e}\n'.format(e=str(e)))
                else:
                    copied_bytes += copiedSize
                    copied_cache.append((digest, i))
                    copiedCount += 1
                    filesCount -= 1
                    if copied_bytes > save_period:
                        copied_bytes = 0
                        c.executemany("UPDATE files SET status='C', hash=? WHERE path=?", copied_cache)
                        conn.commit()
                        copied_cache = []
                    print('Done. {s:.3f} Mb/s write; {hs:.3f} Mb/s hash check; {rs:.3f} Mb/s read; ({c}/{t}: {f})\n'.format(
                        s=speed,
                        hs=hashSpeed,
                        rs=readSpeed,
                        c=copiedCount,
                        t=totalFiles,
                        f=filesCount
                    ))
            print('All copied!')
        finally:
            # Persist files that were verified but not yet checkpointed,
            # including when the loop is interrupted (e.g. Ctrl+C).
            c.executemany("UPDATE files SET status='C', hash=? WHERE path=?", copied_cache)
            conn.commit()
    finally:
        conn.close()

    return result


if __name__ == "__main__":
    # Script entry point: expects exactly three command-line arguments.
    if len(argv) != 4:
        print('Use: download.py fromDir toDir copyId')
    else:
        source, target, job = argv[1:]
        # Re-run the job until a pass completes without a single failure.
        while not download(source, target, job):
            pass
        print('No errors!')
	from sys import argv
	from hashlib import md5
	import time
	import shutil
	import os
	import sqlite3
	import progressbar # https://pypi.org/project/progressbar2/3.18.1/


	def _mb_per_second(byteCount, seconds):
	    """Return the throughput in MiB/s, or +inf when no time elapsed."""
	    if seconds == 0:
	        return float('inf')
	    return (byteCount / 1024 / 1024) / seconds


	def _list_pending_files(conn, fromDir):
	    """Return ``(pending relative paths, already copied count)`` for the job.

	    If the ``files`` table does not exist yet, ``fromDir`` is scanned and the
	    table is created and filled in one transaction.
	    """
	    c = conn.cursor()
	    # Look for the table instead of the database file: a stale or empty
	    # file left by an interrupted run must trigger a fresh scan, not a
	    # "no such table" crash.
	    c.execute("SELECT 1 FROM sqlite_master WHERE type='table' AND name='files'")
	    if c.fetchone() is not None:
	        files = [row[0] for row in c.execute("SELECT path FROM files WHERE status != 'C'")]
	        copiedCount = c.execute("SELECT COUNT(*) FROM files WHERE status == 'C'").fetchone()[0]
	        return files, copiedCount

	    print('Searching for files...')
	    files = []
	    lastPrint = time.time()
	    for dirPath, _dirNames, fileNames in os.walk(fromDir):
	        relDir = os.path.relpath(dirPath, fromDir)
	        for name in fileNames:
	            relPath = name if relDir == os.curdir else os.path.join(relDir, name)
	            files.append(relPath)
	            # Report scan progress at most once every two seconds.
	            if time.time() - lastPrint > 2:
	                print(len(files), relPath)
	                lastPrint = time.time()

	    print('Filling database with {} files'.format(len(files)))
	    # Explicit transaction: the table only becomes visible together with the
	    # complete file list. If anything below fails, the caller closes the
	    # connection without committing and the next run simply rescans.
	    c.execute('BEGIN')
	    c.execute('CREATE TABLE files (path TEXT UNIQUE, status CHAR(1), hash TEXT)')
	    c.executemany("INSERT INTO files VALUES (?, 'N', NULL)", [(path,) for path in files])
	    c.execute('CREATE UNIQUE INDEX path_idx ON files(path)')
	    conn.commit()
	    return files, 0


	def _copy_and_verify(fromPath, toPath, exists, length):
	    """Copy ``fromPath`` to ``toPath`` unless they already match, then verify.

	    Args:
	        fromPath: source file path.
	        toPath: destination file path.
	        exists: whether ``toPath`` existed before this call; if so it is
	            first compared with the source and left alone when identical.
	        length: chunk size in bytes for every read.

	    Returns:
	        ``(source MD5 hex digest, bytes written, write MiB/s,
	        hash-check MiB/s, read MiB/s)``. A speed of ``-inf`` marks a phase
	        that was skipped.

	    Raises:
	        Exception: "Hash mismatch!" when the written file does not hash to
	            the same MD5 as the data read from the source.
	    """
	    match = exists
	    copiedSize = 0
	    speed = hashSpeed = readSpeed = float('-inf')
	    orig = md5()
	    with open(fromPath, 'rb') as fsrc:
	        if exists:
	            # Compare chunk by chunk, stopping at the first difference.
	            readedSize = 0
	            with open(toPath, 'rb') as fdst:
	                with progressbar.ProgressBar(max_value=progressbar.UnknownLength) as bar:
	                    start = time.time()
	                    bar.update(readedSize)
	                    while True:
	                        buf = fsrc.read(length)
	                        buf2 = fdst.read(length)
	                        readedSize += len(buf)
	                        bar.update(readedSize)
	                        if buf != buf2:
	                            match = False
	                            break
	                        if not buf:
	                            break
	                        orig.update(buf)
	                    # Computed after the loop so that two empty files (which
	                    # leave the loop on the first pass) still get a speed.
	                    readSpeed = _mb_per_second(readedSize, time.time() - start)

	        if match:
	            print('Files matches!')
	        else:
	            # (Re)write the destination from the start of the source.
	            orig = md5()
	            fsrc.seek(0)
	            with open(toPath, 'wb') as fdst:
	                with progressbar.ProgressBar(max_value=progressbar.UnknownLength) as bar:
	                    start = time.time()
	                    bar.update(copiedSize)
	                    while True:
	                        buf = fsrc.read(length)
	                        if not buf:
	                            break
	                        orig.update(buf)
	                        fdst.write(buf)
	                        copiedSize += len(buf)
	                        bar.update(copiedSize)
	                    speed = _mb_per_second(copiedSize, time.time() - start)

	    if not match:
	        # Read the written file back and check it against the source hash.
	        copied = md5()
	        hashProgress = 0
	        with open(toPath, 'rb') as fdst:
	            with progressbar.ProgressBar(max_value=copiedSize) as bar:
	                start = time.time()
	                bar.update(hashProgress)
	                while True:
	                    buf = fdst.read(length)
	                    if not buf:
	                        break
	                    copied.update(buf)
	                    hashProgress += len(buf)
	                    # Clamp so an oversized read-back is reported as a hash
	                    # mismatch below, not as a progress-bar range error.
	                    bar.update(min(hashProgress, copiedSize))
	                hashSpeed = _mb_per_second(hashProgress, time.time() - start)
	        if orig.digest() != copied.digest():
	            raise Exception("Hash mismatch!")

	    return orig.hexdigest(), copiedSize, speed, hashSpeed, readSpeed


	def download(fromDir, toDir, copyId):
	    """Copy every file under ``fromDir`` into ``toDir`` with MD5 verification.

	    Job state is kept in the SQLite database ``copyId + '.db'``, in a table
	    ``files(path, status, hash)``. Status is ``'N'`` for not copied yet,
	    ``'C'`` for copied and verified, and ``'E'`` for failed on the last
	    attempt. Calling the function again with the same ``copyId`` processes
	    only the rows whose status is not ``'C'``.

	    Args:
	        fromDir: directory to copy from.
	        toDir: directory to copy into; sub-directories are created as needed.
	        copyId: job identifier, used as the database path without ``.db``.

	    Returns:
	        True if no file failed during this call, False otherwise.
	    """
	    save_period = 1024 * 1024 * 100  # checkpoint statuses every 100 MiB written
	    length = 4 * 1024 * 1024  # 4 MiB chunk
	    result = True

	    conn = sqlite3.connect(copyId + '.db')
	    try:
	        files, copiedCount = _list_pending_files(conn, fromDir)
	        conn.execute('PRAGMA synchronous=OFF')
	        # PRAGMA journal_mode=OFF would be faster still, but is unsafe.
	        c = conn.cursor()

	        copied_bytes = 0
	        copied_cache = []
	        filesCount = len(files)
	        totalFiles = filesCount + copiedCount
	        try:
	            for i in files:
	                fromPath = os.path.join(fromDir, i)
	                toPath = os.path.join(toDir, i)
	                try:
	                    destDir = os.path.dirname(toPath)
	                    if destDir:  # makedirs('') would raise
	                        os.makedirs(destDir, exist_ok=True)

	                    exists = os.path.exists(toPath)
	                    if exists:
	                        print('Overwriting {fr} -> {to}...'.format(fr=fromPath, to=toPath))
	                    else:
	                        print('Copying {fr} -> {to}...'.format(fr=fromPath, to=toPath))

	                    if os.path.isdir(fromPath):
	                        print('Skipping dir!')
	                        continue

	                    digest, copiedSize, speed, hashSpeed, readSpeed = _copy_and_verify(
	                        fromPath, toPath, exists, length)
	                    shutil.copystat(fromPath, toPath)
	                except Exception as e:
	                    # Record the failure and carry on with the next file.
	                    result = False
	                    c.execute("UPDATE files SET status='E' WHERE path=?", (i,))
	                    conn.commit()
	                    print(e)
	                    print('ERROR: {e}\n'.format(e=str(e)))
	                else:
	                    copied_bytes += copiedSize
	                    copied_cache.append((digest, i))
	                    copiedCount += 1
	                    filesCount -= 1
	                    if copied_bytes > save_period:
	                        copied_bytes = 0
	                        c.executemany("UPDATE files SET status='C', hash=? WHERE path=?", copied_cache)
	                        conn.commit()
	                        copied_cache = []
	                    print('Done. {s:.3f} Mb/s write; {hs:.3f} Mb/s hash check; {rs:.3f} Mb/s read; ({c}/{t}: {f})\n'.format(
	                        s=speed,
	                        hs=hashSpeed,
	                        rs=readSpeed,
	                        c=copiedCount,
	                        t=totalFiles,
	                        f=filesCount
	                    ))
	            print('All copied!')
	        finally:
	            # Persist files that were verified but not yet checkpointed,
	            # including when the loop is interrupted (e.g. Ctrl+C).
	            c.executemany("UPDATE files SET status='C', hash=? WHERE path=?", copied_cache)
	            conn.commit()
	    finally:
	        conn.close()

	    return result


	if __name__ == "__main__":
	    # Script entry point: expects exactly three command-line arguments.
	    if len(argv) != 4:
	        print('Use: download.py fromDir toDir copyId')
	    else:
	        source, target, job = argv[1:]
	        # Re-run the job until a pass completes without a single failure.
	        while not download(source, target, job):
	            pass
	        print('No errors!')