Skip to content

Instantly share code, notes, and snippets.

@jvperrin
Created July 14, 2016 23:43
Show Gist options
  • Save jvperrin/4ea345868976b2f2d2fab5f7180b230b to your computer and use it in GitHub Desktop.
Save jvperrin/4ea345868976b2f2d2fab5f7180b230b to your computer and use it in GitHub Desktop.
Box.com recursive backup script
#!/usr/bin/env python3
import argparse
import getpass
import json
import os
import subprocess
import sys
import threading
import time
from io import BytesIO
from shutil import get_terminal_size
# pycurl is the only third-party dependency of this script. If it is missing,
# print short installation hints and stop with exit status 1.
try:
    import pycurl
except ImportError:
    for hint in (
        'Error: Pycurl does not appear to be installed',
        ' Please install it from a package (usually named python3-pycurl)',
        ' or by using pip (pip install pycurl)',
    ):
        print(hint)
    sys.exit(1)

# Base WebDAV URL that every remote folder/file path is appended to
BOX_DAV = 'https://dav.box.com/dav/'

# Number of curl handles created up front, i.e. the upload concurrency limit
MAX_CONCURR_UPLOADS = 10
class BoxUpload:
    """State and curl configuration for a single file upload.

    One instance is attached to a curl handle (as ``conn.upload``) for the
    duration of a transfer; it records progress reported by curl so that it
    can be rendered as a progress bar.
    """

    def __init__(self, filename, curl_auth):
        """Store the file name and the ``user:password`` string given to curl."""
        self.filename = filename
        self.auth = curl_auth
        self.percentage = 0.0
        self.bytes_uploaded = 0

    @staticmethod
    def _remote_url(base, upload_folder, filename):
        """Build the percent-encoded destination URL for a file.

        Spaces and other characters that are not valid in a URL path (for
        example ``#`` or ``?``) are escaped; ``/`` separators are preserved.
        A trailing slash on ``upload_folder`` is dropped so the result never
        contains a doubled separator before the file name.
        """
        from urllib.parse import quote

        remote_path = '{}/{}'.format(upload_folder.rstrip('/'), filename)
        return base + quote(remote_path)

    def start(self, folder, upload_folder, conn):
        """Set curl params for the file to upload

        ``folder`` is the local directory containing the file,
        ``upload_folder`` the remote folder path relative to ``BOX_DAV`` and
        ``conn`` the curl handle to configure. The opened file is stored on
        ``conn.fp`` and its size on ``conn.file_bytes``; this method does not
        close the file itself.
        """
        file_path = os.path.join(folder, self.filename)

        conn.upload = self
        conn.setopt(conn.PROGRESSFUNCTION, self.progress)
        conn.setopt(conn.NOPROGRESS, False)

        conn.file_bytes = os.path.getsize(file_path)
        conn.fp = open(file_path, 'rb')
        conn.setopt(conn.INFILESIZE, conn.file_bytes)
        conn.setopt(conn.READFUNCTION, conn.fp.read)
        conn.setopt(conn.UPLOAD, True)

        conn.setopt(
            conn.URL,
            self._remote_url(BOX_DAV, upload_folder, self.filename)
        )
        conn.setopt(conn.USERPWD, self.auth)

    def progress(self, download_t, download_d, upload_t, upload_d):
        """Store the upload progress percentage for periodic printing.

        Passed to pycurl as a progress function for each upload
        """
        self.bytes_uploaded = upload_d
        # upload_t is 0 until the total is known; avoid dividing by zero
        if upload_t > 0:
            self.percentage = 100 * (upload_d / upload_t)

    def get_progress(self, terminal_width):
        """Get a nice progress bar including the file and percent complete

        The progress bar has a dynamic width, so it must be constructed by
        figuring out the correct width based on other data shown. File names
        too long to fit are truncated so that the line does not exceed
        ``terminal_width`` (as long as the terminal is wide enough for the
        brackets and the percentage).
        """
        second_half = '] {:.2f}%'.format(self.percentage)

        # Room left for the name once ' [' and the percentage are accounted for
        name_room = max(terminal_width - len(' [') - len(second_half), 0)
        first_half = '{} ['.format(self.filename[:name_room])

        width = max(terminal_width - len(first_half) - len(second_half), 0)
        filled = min(width, int(width * self.percentage / 100))

        return '{}{}{}{}'.format(
            first_half,
            filled * '#',
            (width - filled) * ' ',
            second_half
        )
def make_box_folder(folder, auth):
    """Make a folder on Box.com using the MKCOL HTTP request method

    ``folder`` is the remote path relative to ``BOX_DAV`` (unescaped; it is
    percent-encoded here) and ``auth`` is the ``user:password`` string for
    curl.

    An HTTP 405 response is treated as "folder already exists" and is not an
    error. The process exits with status 2 on HTTP 401 and with status 3 on
    any other status that is not 201.
    """
    from urllib.parse import quote

    print('Creating folder "' + folder + '"')

    # Capture the response body instead of letting curl write it to stdout,
    # where it would be mixed into the script's own output
    response_body = BytesIO()

    conn = pycurl.Curl()
    try:
        conn.setopt(conn.CUSTOMREQUEST, 'MKCOL')
        conn.setopt(conn.USERPWD, auth)
        # Escape spaces and other URL-unsafe characters, keeping '/' intact
        conn.setopt(conn.URL, BOX_DAV + quote(folder))
        conn.setopt(conn.WRITEFUNCTION, response_body.write)
        conn.perform()
        http_status = conn.getinfo(pycurl.HTTP_CODE)
    finally:
        # Release the handle even when perform() raises
        conn.close()

    if http_status == 405:
        print('Could not create a folder, it already exists, continuing...')
    elif http_status == 401:
        print('401 error trying to create a folder, check your authentication!')
        sys.exit(2)
    elif http_status != 201:
        print('Error creating folder, HTTP ' + str(http_status))
        print('Folder name was ' + folder)
        sys.exit(3)
def print_progress(multi, stats, prev_total):
    """Show an updating progress bar for all ongoing uploads

    ``multi`` must expose ``handles`` (all curl handles) and ``free`` (idle
    handles); ``stats`` is the shared statistics dict and ``prev_total`` the
    byte total computed by the previous call, used to derive the speed.

    The function re-schedules itself on a one-second daemon timer until
    every file counted in ``stats`` has finished.
    """
    width = get_terminal_size((80, 20)).columns  # Default size as a fallback

    # Move cursor to the top left using ANSI magic to overwrite previous lines
    sys.stdout.write('\033[1;1H')

    bytes_partial = 0
    for conn in multi.handles:
        # Read the attribute once: the uploading thread may reset it to None
        # between a truthiness check and a later attribute access
        upload = conn.upload
        if upload:
            print(upload.get_progress(width))
            bytes_partial += upload.bytes_uploaded
    bytes_total = stats['bytes_uploaded'] + bytes_partial

    # Print statistics on upload time and current speed. The speed is the
    # number of bytes since the previous call converted to megabits, which
    # equals Mb/s because calls are one second apart.
    time_stats = 'Time so far: {} Speed: {:.2f} Mb/s'.format(
        friendly_time(time.time() - stats['start_time']),
        (bytes_total - prev_total) * 0.000008
    )
    print(time_stats + (width - len(time_stats)) * ' ')

    # With nothing to transfer (only empty files) report 100% rather than
    # dividing by zero
    if stats['bytes_total']:
        percent_done = 100 * bytes_total / stats['bytes_total']
    else:
        percent_done = 100.0

    # Print more stats, this time on amount uploaded
    upload_stats = 'Uploaded {}/{} files and {}/{} ({:.2f}%) in total'.format(
        stats['num_finished'],
        stats['num_total'],
        friendly_file_size(bytes_total),
        friendly_file_size(stats['bytes_total']),
        percent_done
    )
    print(upload_stats + (width - len(upload_stats)) * ' ')

    # If there are less progress bars now, add blank rows to overwrite the
    # bottom rows as necessary (to get rid of lines with outdated information)
    for _ in range(len(multi.free)):
        print(' ' * width)

    # Nothing left to report once every file is done; stop re-scheduling so
    # a late redraw cannot overwrite whatever is printed afterwards
    if stats['num_finished'] >= stats['num_total']:
        return

    # Print the next update in a second, allow interrupts
    thread = threading.Timer(1, print_progress, [multi, stats, bytes_total])
    thread.daemon = True
    thread.start()
def reset_connection(multi, conn):
    """Return a finished curl handle to the pool of free handles.

    Closes the file attached to the handle, detaches the handle from the
    multi object, clears its per-upload attributes and appends it to
    ``multi.free`` so it can be used for another upload.
    """
    open_file = conn.fp
    open_file.close()

    multi.remove_handle(conn)

    # Back to the same blank state the handle had before its first upload
    conn.upload, conn.fp, conn.file_bytes = None, None, 0

    multi.free.append(conn)
def friendly_file_size(num_bytes):
    """To convert bytes to human-readable units

    Returns a string such as ``'1.50 KB'`` using 1024-based steps. Values
    below 1024 are reported in ``B``; values too large for the biggest unit
    are still reported in ``PB`` instead of falling through without a result.

    Adapted from http://stackoverflow.com/a/1094933/1979001
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    size = float(num_bytes)

    for unit in units[:-1]:
        if size < 1024.0:
            return '{:3.2f} {}'.format(size, unit)
        size /= 1024.0

    # Whatever is left is expressed in the largest known unit
    return '{:3.2f} {}'.format(size, units[-1])
def friendly_time(seconds):
    """Show seconds in terms of larger units for easier reading

    Returns a string such as ``'1.50 minutes'``. Durations of 365 days or
    more are reported in years rather than producing no result.
    """
    times = [('seconds', 60), ('minutes', 60), ('hours', 24), ('days', 365)]

    for unit, factor in times:
        if seconds < factor:
            return '{:.2f} {}'.format(seconds, unit)
        seconds /= factor

    # Every smaller unit overflowed, so the remaining value is in years
    return '{:.2f} {}'.format(seconds, 'years')
def main(args):
    """Interactively back up a local directory tree to Box.com over WebDAV.

    Prompts for the local directory, the Box.com credentials and the remote
    destination folder, recreates the directory structure remotely and
    uploads the files using up to ``MAX_CONCURR_UPLOADS`` concurrent curl
    transfers. ``args.quiet`` suppresses the progress display and the final
    summary.

    Exits with status 1 when the local directory cannot be read and with
    status 4 when a transfer is aborted from a curl callback (which is how
    an interrupt surfaces). Folder creation may exit with other statuses.
    """
    from collections import deque

    archive = input('Where do you want to upload files from? ')

    # os.walk() is lazy and ignores listing errors by default, so a missing
    # or unreadable directory would silently look like an empty one; check
    # explicitly before asking for credentials
    if not os.path.isdir(archive) or not os.access(archive, os.R_OK | os.X_OK):
        print('Could not access files! Make sure the directory exists or try again as root!')
        sys.exit(1)

    # Get authentication for Box.com and destination folder
    email = input('Enter your Box.com email (Berkeley email): ')
    password = getpass.getpass('Enter your Box.com password: ')
    folder = input('What folder do you want to put the files in on Box.com (will be created)? ')
    auth = email + ':' + password

    # Collect one (relative folder, file names) entry per directory, parents
    # before children, plus the full local path of every file
    files = []
    file_paths = []
    for dirpath, _dirnames, filenames in os.walk(archive):
        # relpath copes with trailing slashes in the entered path, which
        # slicing by len(archive) does not
        relative = os.path.relpath(dirpath, archive)
        folder_name = '' if relative == os.curdir else relative.replace(os.sep, '/')
        files.append((folder_name, filenames))
        file_paths.extend(os.path.join(dirpath, name) for name in filenames)

    # Make the new root folder for the backup
    if not args.quiet:
        print('Creating folder ' + folder + ' on Box.com...')
    make_box_folder(folder, auth)

    # Start timer for upload speed and get total upload size. getsize() is
    # used because it is also what sizes each individual upload.
    file_bytes = sum(os.path.getsize(p) for p in file_paths if os.path.exists(p))
    if not args.quiet:
        print('Uploading {}...'.format(friendly_file_size(file_bytes)))
        time.sleep(3)

    start_time = time.time()
    multi = pycurl.CurlMulti()
    multi.handles = []
    multi.free = []

    stats = {
        'num_finished': 0,
        'num_total': len(file_paths),
        'bytes_uploaded': 0,
        'bytes_total': file_bytes,
        'start_time': start_time,
    }

    # Create free connection handlers that can be reused
    for _ in range(MAX_CONCURR_UPLOADS):
        conn = pycurl.Curl()
        conn.fp = None
        conn.upload = None
        conn.file_bytes = 0
        multi.handles.append(conn)
        multi.free.append(conn)

    # Make a queue of all folders to be created with files to be uploaded
    queue = deque(files)

    # Files that are safe to upload without having to create a new folder
    # for them, as (relative folder, file name) pairs
    safe_queue = deque()

    # Clear the screen and start printing the progress bar every second
    if not args.quiet:
        subprocess.call(('clear',))
        print_progress(multi, stats, 0)

    while stats['num_finished'] < stats['num_total']:
        # Create the next folder(s) once all files queued so far have been
        # handed to curl; folders without files are passed through at once
        while not safe_queue and queue:
            folder_name, filenames = queue.popleft()
            if folder_name:
                make_box_folder(folder + '/' + folder_name, auth)
            # Add all files to the safe queue for uploading
            safe_queue.extend((folder_name, name) for name in filenames)

        # Add uploads if there are connections free to use
        while safe_queue and multi.free:
            parent_folder, filename = safe_queue.popleft()
            local_dir = os.path.join(archive, parent_folder) if parent_folder else archive
            remote_dir = folder + '/' + parent_folder if parent_folder else folder

            # Files that disappeared since the scan count as done
            if not os.path.exists(os.path.join(local_dir, filename)):
                stats['num_finished'] += 1
                continue

            conn = multi.free.pop()
            upload = BoxUpload(filename, auth)
            upload.start(local_dir, remote_dir, conn)
            # Remember where the file came from so it can be re-queued
            conn.queue_item = (parent_folder, filename)
            multi.add_handle(conn)

        # Run curl's internal state machine
        while True:
            ret, num_handles = multi.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        # Check for successful or failed curl connections and free them again
        while True:
            num_queued, ok_list, err_list = multi.info_read()

            # Free up connections from files that have finished
            # NOTE(review): a transfer that ends with an HTTP error status is
            # presumably still reported here as OK - confirm and consider
            # checking the response code
            for conn in ok_list:
                stats['bytes_uploaded'] += conn.file_bytes
                stats['num_finished'] += 1
                reset_connection(multi, conn)

            # Retry files that have errored in some way
            for conn, errno, errmsg in err_list:
                print('Error uploading {}: Curl error #{} {}'.format(
                    conn.upload.filename,
                    errno,
                    errmsg
                ))

                # Allow exits by SIGINT
                if errno == pycurl.E_ABORTED_BY_CALLBACK:
                    sys.exit(4)

                # Retry this file: put the same (folder, file) pair back at
                # the front of the upload queue
                print('Retrying {}'.format(conn.upload.filename))
                safe_queue.appendleft(conn.queue_item)
                reset_connection(multi, conn)

            # Move on if no more connections need freeing
            if num_queued == 0:
                break

        # Wait for more data to be available
        multi.select(1.0)

    # Clean up any open files and connections
    for conn in multi.handles:
        if conn.fp:
            conn.fp.close()
        conn.close()
    multi.close()

    # Print statistics about the backup time, size, and speed
    if not args.quiet:
        subprocess.call(('clear',))
        total_time = time.time() - start_time
        transfer_rate = file_bytes / (1000000 * total_time)
        print('Box.com upload complete!')
        print('Took {} to transfer {} files with combined size {}'.format(
            friendly_time(total_time),
            stats['num_total'],
            friendly_file_size(file_bytes)
        ))
        print('Average rate of {:.2f} Mb/s ({:.2f} MB/s)'.format(
            transfer_rate * 8,
            transfer_rate
        ))
# Script entry point: parse the command line and run the backup
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Back up files to Box.com')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='do not output progress to stdout')
    args = parser.parse_args()
    # sys.exit rather than the bare exit() helper, which is provided by the
    # site module and is not guaranteed to exist in every interpreter setup
    sys.exit(main(args))
@wylliec
Copy link

wylliec commented Sep 18, 2016

conn.setopt(conn.CAINFO, certifi.where())

@wylliec
Copy link

wylliec commented Sep 18, 2016

also fix spaces edge case

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment