Skip to content

Instantly share code, notes, and snippets.

@telenieko
Last active September 26, 2017 07:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save telenieko/6a98b99e33f62a5fbb9a9c188c4575da to your computer and use it in GitHub Desktop.
Save telenieko/6a98b99e33f62a5fbb9a9c188c4575da to your computer and use it in GitHub Desktop.
Script to merge two similar Google Drive directory structures
# -*- coding: utf-8 -*-
# pylint: disable=missing-docstring,wrong-import-position,invalid-name,superfluous-parens
""" google_drive_fix_migration.py <user@domain.com>
License: This code is released to the Public Domain under the "CC0 1.0 Universal" License
Author: Marc Fargas <telenieko@telenieko.com>
Background:
Our office did a real-life testdrive of Office 365 after 11 years on Google Apps,
after 3 months we decided to rollback the affected users which went smoothly
except for the OneDrive -> Google Drive migration.
We missed that the proprietary migration tool we used put all OneDrive documents
inside a folder called "Documents" on the user's Google Drive thus resulting
that a user had:
- Documents before going Office365 inside 'root' (/) *and* inside 'Documents' (/Documents)
- Documents modified in Office365 inside /Documents
- Documents modified in Google (due to being shared with non testdrive users) in /
- Documents created in Office365 inside /Documents
- Two folder trees / and /Documents "somehow identical".
This script is the quickfix we applied.
This is not a one size-fits-all thing, it is published to the Public Domain
for reference and example for anyone stumbling with something similar in the future.
Code was written in September 2017 using Google Drive v3 API.
Details:
This tool uses a Service Account to impersonate the users on Google G Suite,
if you got here you're likely to know what that means or have the means to find
out and set it up. If you can't ... well, then maybe you should not attempt to
use this code!!
The tool will first build an in-memory representation of both folder
structures / and /Documents via load_trees() then compare both with
compare_trees() and finally apply the changes with apply_actions().
During comparison the "BREAKUP_DATE" is the Day we moved those users to Office.
For convenience, we store the load_trees result in a shelve file and reread
it on each call, you can purge this "cache" by deleting the shelve file.
Rules:
- If a folder is not owned by the user it will not be traversed, it might be moved.
- If a file or folder has to be moved a) the former parent is removed,
b) the new parent is inserted.
That is: if there are more parents those remain unchanged.
"""
import sys
import os
import logging
import shelve
from datetime import date
# For convenience I put all dependencies inside a lib/ folder so I do not need a virtualenv
# pip install -t lib/ google-api-python-client attrs python-dateutil
BASE = os.path.dirname(__file__)
sys.path.insert(0, os.path.join(BASE, 'lib'))
BREAKUP_DATE = date(2017, 6, 1)
from oauth2client.service_account import ServiceAccountCredentials
import dateutil.parser
import httplib2
import attr
import apiclient.discovery
import apiclient.http
import apiclient.errors
# Service-account key file, expected to live in the same directory as this script.
KEYFILE = os.path.join(BASE, "google-service-account-key.json")
# The root logger only reports errors; the script's own 'gdrive' logger is raised to INFO.
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger('gdrive') # pylint: disable=invalid-name
logger.setLevel(logging.INFO)
# NOTE(review): not referenced anywhere in the visible code - possibly a leftover; confirm before removing.
FOLDER_ID_TO_ITEM = {}
@attr.s
class DriveItem(object):
    """ One node of the in-memory Drive tree, wrapping a single item from the API. """

    # Raw item data exactly as received from Google.
    source = attr.ib()
    # DriveItem that contains this one (callers pass None for a tree root).
    parent = attr.ib()
    # Whether the impersonated user owns the item.
    owned_by_me = attr.ib()
    # Pending action code: '>' reparent, 'x' trash, '-' nothing, 'E' undecided/error.
    action = attr.ib(default='E')
    # Id of the folder the item is to be moved into when action is '>'.
    new_parent = attr.ib(default=None)
    # Child DriveItem instances, i.e. those whose parent is this item.
    children = attr.ib(default=attr.Factory(list))
def get_service_creds(email, scope):
    """ Build delegated service-account credentials for a user.

    Loads the service account from KEYFILE for the requested scope(s),
    delegates it to `email`, authorizes a fresh httplib2.Http object and
    refreshes the credentials through it.

    Returns a (credentials, http) tuple.
    """
    delegated = ServiceAccountCredentials.from_json_keyfile_name(
        KEYFILE, scopes=scope).create_delegated(email)
    authorized_http = delegated.authorize(httplib2.Http())
    delegated.refresh(authorized_http)
    return (delegated, authorized_http)
def get_drive_service(email):
    """ Get a service instance for Google Drive API v3, acting as `email`.

    The credentials are obtained through get_service_creds() with the full
    'drive' scope; only the authorized http object is needed to build the
    service.
    """
    scope = 'https://www.googleapis.com/auth/drive'
    _creds, http = get_service_creds(email, scope=scope)
    return apiclient.discovery.build('drive', 'v3', http=http)
def am_i_owner(item):
    """ Is the currently authenticated user the (or an) owner of the item?

    `item` is the raw file resource dict returned by the Drive API.
    The API does not always populate 'ownedByMe' (for instance for items
    living in shared drives); a missing field is treated as "not owned"
    instead of raising KeyError.
    """
    return bool(item.get('ownedByMe', False))
def build_tree(service, this, ignore=None, pad=0):
    """ Build a tree of DriveItem instances below `this`.

    service is a service from get_drive_service().
    this is the current DriveItem instance; found children are appended
    to this.children.
    ignore is a name; direct children with exactly that name are skipped.
    It is NOT propagated to sub-folders.
    pad is the left padding used by the progress print calls.

    Folders owned by the user are traversed recursively. Folders not owned
    by the user are neither traversed nor recorded.
    Trashed items are excluded from the listing, so that a copy sitting in
    the trash is never mistaken for the surviving duplicate of a live file.
    On an HTTP error the listing of this folder stops and whatever was
    gathered so far is kept.
    """
    folder_mime = 'application/vnd.google-apps.folder'
    fields = ('nextPageToken, files(id, name, mimeType, ownedByMe, kind, '
              'md5Checksum, modifiedTime)')
    indent = ' ' * pad
    folder_id = this.source['id']
    print('{}Gathering item listings for {}...'.format(indent, folder_id))
    query = "'{}' in parents and trashed = false".format(folder_id)
    page_token = None
    while True:
        # Only the API call itself can raise HttpError here; recursive calls
        # handle their own errors.
        try:
            listing = service.files().list(q=query, pageToken=page_token,
                                           fields=fields).execute()
        except apiclient.errors.HttpError as e:
            print('An error occurred: {}'.format(e))
            break
        for item in listing.get('files', []):
            if ignore and item['name'] == ignore:
                print(u'{}Ignore: {}'.format(indent, item['name']))
                continue
            sub = DriveItem(source=item, parent=this, owned_by_me=am_i_owner(item))
            if item['mimeType'] == folder_mime:
                print(u'{}Tree: {}'.format(indent, item['name']))
                if not sub.owned_by_me:
                    print(u'{}Skip not owned: {}'.format(indent, item['name']))
                    continue
                build_tree(service, sub, ignore=None, pad=pad+4)
            if item['kind'] == 'drive#file':
                print(u'{}File: {} ({})'.format(indent, item['name'], item['id']))
            this.children.append(sub)
        page_token = listing.get('nextPageToken')
        if not page_token:
            break
def load_trees(usuario):
    """ Read both tree structures for that user from Google.

    usuario is the e-mail address of the user to impersonate.

    Returns (root, onedrive_trees):
    root is the DriveItem tree of the user's Drive root, built while
    ignoring top-level items named 'Documents'.
    onedrive_trees is a list with one DriveItem tree per 'Documents'
    folder found directly under the root and owned by the user.

    The search is restricted to non-trashed folders that sit directly in
    the root: a file, or a nested folder elsewhere in the Drive, that merely
    happens to be called 'Documents' must not be merged into the root.
    All result pages are read.
    """
    fields = ('nextPageToken, files(id, name, mimeType, ownedByMe, kind, '
              'md5Checksum, modifiedTime)')
    query = ("name = 'Documents' and 'root' in parents "
             "and mimeType = 'application/vnd.google-apps.folder' "
             "and trashed = false")
    service = get_drive_service(usuario)
    root_item = service.files().get(fileId='root').execute()
    root = DriveItem(source=root_item, parent=None, owned_by_me=True)
    build_tree(service, root, ignore='Documents')

    onedrive_trees = []
    page_token = None
    while True:
        search = service.files().list(q=query, pageToken=page_token,
                                      fields=fields).execute()
        for onedrive_item in search.get('files', []):
            # 'ownedByMe' is not always populated; missing means not owned.
            if not onedrive_item.get('ownedByMe'):
                continue
            onedrive_root = DriveItem(source=onedrive_item, parent=None,
                                      owned_by_me=True)
            build_tree(service, onedrive_root)
            onedrive_trees.append(onedrive_root)
        page_token = search.get('nextPageToken')
        if not page_token:
            break
    return root, onedrive_trees
def load_trees_from_shelve(dbname):
    """ Load the trees from our cache.

    dbname is the cache file name, normally ending in '.db'.
    The shelf is opened under the same base name save_trees() writes to
    (the name without its trailing '.db'), so both functions always address
    the same database.
    It is opened read-only so that a missing cache is reported as an error
    instead of silently creating an empty database.

    Returns the (root, onedrive_trees) tuple that was stored.
    Raises KeyError if the shelf does not hold both entries.
    """
    base = dbname[:-3] if dbname.endswith('.db') else dbname
    db = shelve.open(base, flag='r')
    try:
        return db['root'], db['onedrive']
    finally:
        # Release the database even when an entry is missing.
        db.close()
def save_trees(dbname, root, onedrive_trees):
    """ Save the trees to our cache.

    dbname is the cache file name, normally ending in '.db'.
    root and onedrive_trees are stored under the 'root' and 'onedrive' keys.

    A trailing '.db' is removed before opening the shelf; any other name is
    used unchanged (it used to lose its last three characters).
    NOTE(review): presumably the suffix is stripped because the dbm backend
    in use appends '.db' by itself - confirm on the target platform.
    """
    print("Saving tree data on '%s'" % dbname)
    base = dbname[:-3] if dbname.endswith('.db') else dbname
    db = shelve.open(base)
    try:
        db['root'] = root
        db['onedrive'] = onedrive_trees
    finally:
        # Always flush and release the database, even if pickling fails.
        db.close()
_FOLDER_MIME = 'application/vnd.google-apps.folder'
_GOOGLE_MIME_PREFIX = 'application/vnd.google-apps.'
_ROW_FORMAT = "{:<15}{:>10} {} {} {:>10}"


def _display_name(item):
    """ Return the item's name in a form that is safe to pass to str.format(). """
    name = item.source['name']
    if str is bytes and not isinstance(name, str):
        # Python 2: names are unicode; encode them so that formatting into a
        # byte string does not fail on non-ASCII characters.
        return name.encode('utf-8', 'replace')
    return name


def _move_into(item, target):
    """ Mark item to be reparented ('>') under the target folder. """
    item.action = '>'
    item.new_parent = target.source['id']


def _compare_pair(lval, rval, right, path):
    """ Decide the actions for one left/right pair of a sorted merge.

    right is the folder that owns rval (the destination for moved items).
    Returns (advance_left, advance_right): which side is now settled.
    """
    # Neither item has been decided yet in this pass; start from the
    # "something is wrong" default.
    lval.action = rval.action = 'E'
    lname = lval.source['name']
    rname = rval.source['name']

    if lval.source['id'] == rval.source['id']:
        # EXACTLY the same item on both sides.
        lval.action = rval.action = '-'
        return True, True
    if lname > rname:
        # The right item has no counterpart on the left; leave it alone.
        rval.action = '-'
        return False, True

    right_is_folder = rval.source['mimeType'] == _FOLDER_MIME
    if lval.source['mimeType'] == _FOLDER_MIME:
        if right_is_folder and lname == rname:
            # Folders of the same name: keep both folders where they are and
            # merge their contents. The right folder must be '-' as well,
            # otherwise apply_actions() never descends into it and the
            # decisions taken for its children are not applied.
            lval.action = rval.action = '-'
            compare_trees(lval, rval, path + _display_name(lval) + '/')
            return True, True
        # Left and right are unrelated, so we keep ALL of left.
        _move_into(lval, right)
        return True, False

    if lname != rname:
        # Left file sorts before the right item: it only exists on the left.
        _move_into(lval, right)
        return True, False

    # From here on: left is a file and both names are equal.
    if right_is_folder:
        # Strange (file on the left, folder on the right): keep both.
        _move_into(lval, right)
        rval.action = '-'
        return True, True

    left_md5 = lval.source.get('md5Checksum')
    if (_GOOGLE_MIME_PREFIX not in rval.source['mimeType']
            and left_md5 is not None
            and left_md5 == rval.source.get('md5Checksum')):
        # Files are identical: the left copy is redundant.
        lval.action = 'x'
        rval.action = '-'
        return True, True

    # Same name, different (or unknown) content: the left copy always moves
    # over, the dates decide what happens to the right one.
    lmodtime = dateutil.parser.parse(lval.source['modifiedTime'])
    rmodtime = dateutil.parser.parse(rval.source['modifiedTime'])
    _move_into(lval, right)
    if lmodtime != rmodtime and rmodtime.date() < BREAKUP_DATE:
        # Right was not touched after the breakup, so the left one supersedes it.
        rval.action = 'x'
    else:
        # Same dates but different content, or right modified after the
        # breakup: better be safe... keep both.
        rval.action = '-'
    return True, True


def compare_trees(left, right, path):
    """ Compare the two trees merging left to right.

    left and right are DriveItem folders; their children are walked in name
    order like a sorted merge and every visited child gets an action.
    path is only used as a prefix in the printed report.

    Actions:
    >: Reparent the item (new_parent is always set to right's id)
    x: Trash the item
    -: Do nothing
    E: Something wrong... the default action.

    Right children that are never reached (left ran out first) keep whatever
    action they already had.
    """
    lsorted = sorted(left.children, key=lambda item: item.source['name'])
    rsorted = sorted(right.children, key=lambda item: item.source['name'])
    print("{:<15}----- Comparing '{}' with '{}' -----------".format(
        path, _display_name(left), _display_name(right)))

    lpos = rpos = 0
    while lpos < len(lsorted):
        lval = lsorted[lpos]
        if rpos < len(rsorted):
            rval = rsorted[rpos]
            advance_left, advance_right = _compare_pair(lval, rval, right, path)
            rtitle, raction = _display_name(rval), rval.action
        else:
            # Nothing left on the right to check against, so now all is to
            # be moved over.
            _move_into(lval, right)
            advance_left, advance_right = True, False
            rtitle, raction = 'NONE', None
        print(_ROW_FORMAT.format(path, _display_name(lval), lval.action,
                                 raction, rtitle))
        if advance_left:
            lpos += 1
        if advance_right:
            rpos += 1

    print('{:<15} --------- end compare_trees({}, {})'.format(
        path, _display_name(left), _display_name(right)))
def apply_actions(service, tree, path):
    """ Apply the actions from the tree.

    service is a service from get_drive_service().
    tree is the DriveItem whose children are processed.
    path is only used as a prefix in the log messages.

    Actions:
    >: Reparent the item (tree's id is removed, item.new_parent is added)
    x: Trash the item
    -: Do nothing on the item itself, but descend into its children
    E: Something wrong... the default action; logged and left untouched.

    An item marked '>' without a new_parent is NOT moved: removing its
    current parent without adding another one would orphan it.
    """
    logger.info("%s ----- Apply '%s' -----------", path, tree.source['name'])
    deletes = moves = 0
    for item in tree.children:
        name = item.source['name']
        itemid = item.source['id']
        logger.info("%s: %s/%s", item.action, path, name)
        if item.action == '-':
            # Not touching the item itself; its children may still need work.
            if item.children:
                apply_actions(service, item, path+'/'+name)
        elif item.action == '>':
            if item.new_parent is None:
                logger.error("%s/%s is marked to move but has no destination; skipped",
                             path, name)
                continue
            # Change parents, remove tree.source['id'] and add item.new_parent.
            service.files().update(fileId=itemid,
                                   addParents=item.new_parent,
                                   removeParents=tree.source['id'],
                                   fields='id, parents').execute()
            moves += 1
        elif item.action == 'x':
            service.files().update(fileId=itemid,
                                   body={'trashed': True}).execute()
            deletes += 1
        else:
            logger.warning("Unhandled action %r on %s/%s; item left untouched",
                           item.action, path, name)
    logger.info("%s ----- END Apply deletes=%d moves=%d '%s' -----------",
                path, deletes, moves, tree.source['name'])
if __name__ == '__main__':
    # Usage: google_drive_fix_migration.py <user@domain.com>
    if len(sys.argv) < 2:
        sys.exit("Usage: %s <user@domain.com>" % sys.argv[0])
    USER = sys.argv[1]
    # The cache file is named after the local part of the address.
    DBNAME = '%s.db' % USER.split('@')[0]
    if os.path.exists(DBNAME):
        print("Loading from Cache '%s'" % DBNAME)
        root, onedrive_trees = load_trees_from_shelve(DBNAME)
    else:
        print("Loading from Google into '%s'" % DBNAME)
        root, onedrive_trees = load_trees(USER)
        save_trees(DBNAME, root, onedrive_trees)
    # First pass only decides and prints what would be done.
    for tree in onedrive_trees:
        compare_trees(tree, root, '/')
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3
    cont = ask('Apply? (y/n)')
    if cont == 'y':
        svc = get_drive_service(USER)
        for tree in onedrive_trees:
            print("#### BEGIN APPLY ACTIONS ONEDRIVE TREE")
            apply_actions(svc, tree, '/')
        print("#### BEGIN APPLY ACTIONS GOOGLE DRIVE TREE")
        apply_actions(svc, root, '/')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment