Skip to content

Instantly share code, notes, and snippets.

@motchy869
Last active November 17, 2023 13:01
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save motchy869/3ad7141b6efd8decbd13807418c4e79a to your computer and use it in GitHub Desktop.
Save motchy869/3ad7141b6efd8decbd13807418c4e79a to your computer and use it in GitHub Desktop.
get whole item tree of Google Drive
from enum import Enum
import time
import os
import httplib2
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
import argparse
flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Drive API Python Quickstart'
def get_credentials():
home_FOLDER = os.path.expanduser('~')
credential_FOLDER = os.path.join(home_FOLDER, '.credentials')
if not os.path.exists(credential_FOLDER):
os.makeFOLDERs(credential_FOLDER)
credential_path = os.path.join(credential_FOLDER,
'drive-python-quickstart.json')
store = Storage(credential_path)
credentials = store.get()
if not credentials or credentials.invalid:
flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
flow.user_agent = APPLICATION_NAME
credentials = tools.run_flow(flow, store, flags)
print('Storing credentials to ' + credential_path)
return credentials
#class for GoogleDrive item such as file/folder
class Node():
class FileType(Enum):
FILE = 0
FOLDER = 1
def __init__(self, path, depth, file_type, file_id):
self.path = path
self.depth = depth
self.file_type = file_type
self.file_id = file_id
self.children = []
def print_children(self):
print('{0}{1} (ID: {2} file_type: {3})'.format(' '*self.depth, os.path.basename(self.path), self.file_id, self.file_type))
for child in self.children:
child.print_children()
def count_children(self):
num_files = 0; num_folders = 0
for child in self.children:
if child.file_type == Node.FileType.FILE:
num_files += 1
if child.file_type == Node.FileType.FOLDER:
num_folders += 1
a,b = child.count_children()
num_files += a; num_folders += b
return (num_files, num_folders)
#search child items recursively
def search(self, drive_service):
MAX_PAGE_SIZE_PER_REQUEST = 100
if self.file_type == Node.FileType.FILE:
return
self.children.clear()
page_token = None
while True: #Sometimes we cannot get all childs at one time when items are too many. We must be able to handle such a case.
response = drive_service.files().list(
q="'%s' in parents and trashed=false" % self.file_id,
corpus='user', includeTeamDriveItems=False, orderBy='name', pageSize=MAX_PAGE_SIZE_PER_REQUEST, spaces='drive',
fields='nextPageToken, files(id, name, mimeType)', supportsTeamDrives=False,
pageToken=page_token
).execute()
for file in response.get('files', []):
print('Found file: %s (%s), mimeType: %s' % (file['name'], file['id'], file['mimeType']))
if file['mimeType'] == 'application/vnd.google-apps.folder':
file_type = Node.FileType.FOLDER
else:
file_type = Node.FileType.FILE
#new child
self.children.append(Node(path=self.path+'/'+file['name'], depth=self.depth+1, file_type=file_type, file_id=file['id']))
page_token = response.get('nextPageToken', None)
if page_token is None:
break;
#do the same for childs (recursive!)
for child in self.children:
child.search(drive_service)
def main():
credentials = get_credentials()
http = credentials.authorize(httplib2.Http())
service = discovery.build('drive', 'v3', http=http)
start_time = time.time()
root = Node(path='root', depth=0, file_type=Node.FileType.FOLDER, file_id='root')
root.search(service)
root.print_children()
print('\nThere are {0} files and {1} folders.'.format(*root.count_children()))
print('Search took {0} [sec].'.format(time.time()-start_time))
if __name__ == '__main__':
main()
from enum import Enum
from queue import Queue
import time
import os
import httplib2
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
import argparse
flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Drive API Python Quickstart'
def get_credentials():
home_FOLDER = os.path.expanduser('~')
credential_FOLDER = os.path.join(home_FOLDER, '.credentials')
if not os.path.exists(credential_FOLDER):
os.makeFOLDERs(credential_FOLDER)
credential_path = os.path.join(credential_FOLDER,
'drive-python-quickstart.json')
store = Storage(credential_path)
credentials = store.get()
if not credentials or credentials.invalid:
flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
flow.user_agent = APPLICATION_NAME
credentials = tools.run_flow(flow, store, flags)
print('Storing credentials to ' + credential_path)
return credentials
#class for GoogleDrive item such as file/folder
class Node():
class FileType(Enum):
FILE = 0
FOLDER = 1
def __init__(self, path, depth, file_type, file_id):
self.path = path
self.depth = depth
self.file_type = file_type
self.file_id = file_id
self.children = []
def print_children(self):
print('{0}{1} (ID: {2} file_type: {3})'.format(' '*self.depth, os.path.basename(self.path), self.file_id, self.file_type))
for child in self.children:
child.print_children()
def count_children(self):
num_files = 0; num_folders = 0
for child in self.children:
if child.file_type == Node.FileType.FILE:
num_files += 1
if child.file_type == Node.FileType.FOLDER:
num_folders += 1
a,b = child.count_children()
num_files += a; num_folders += b
return (num_files, num_folders)
#search child items using BFS-serach
@classmethod
def search(cls, start_node, drive_service):
MAX_PAGE_SIZE_PER_REQUEST = 100
#initialize node queue
queue = Queue()
queue.put(start_node)
while not queue.empty():
node = queue.get()
if node.file_type == Node.FileType.FILE: #Obviously, file has no child items
continue
#get child item list
node.children.clear()
page_token = None
while True: #Sometimes we cannot get all childs at one time when items are too many. We must be able to handle such a case.
response = drive_service.files().list(
q="'%s' in parents and trashed=false" % node.file_id,
corpus='user', includeTeamDriveItems=False, orderBy='name', pageSize=MAX_PAGE_SIZE_PER_REQUEST, spaces='drive',
fields='nextPageToken, files(id, name, mimeType)', supportsTeamDrives=False,
pageToken=page_token
).execute()
for file in response.get('files', []):
print('Found file: %s (%s), mimeType: %s' % (file['name'], file['id'], file['mimeType']))
if file['mimeType'] == 'application/vnd.google-apps.folder':
file_type = Node.FileType.FOLDER
else:
file_type = Node.FileType.FILE
child = Node(path=node.path+'/'+file['name'], depth=node.depth+1, file_type=file_type, file_id=file['id']) #new child
node.children.append(child)
queue.put(child) #add child to search queue (BFS!)
page_token = response.get('nextPageToken', None)
if page_token is None:
break;
def main():
credentials = get_credentials()
http = credentials.authorize(httplib2.Http())
service = discovery.build('drive', 'v3', http=http)
start_time = time.time()
root = Node(path='root', depth=0, file_type=Node.FileType.FOLDER, file_id='root')
root.search(root, service)
root.print_children()
print('\nThere are {0} files and {1} folders.'.format(*root.count_children()))
print('Search took {0} [sec].'.format(time.time()-start_time))
if __name__ == '__main__':
main()
from enum import Enum
from queue import Queue
import os
import time
import httplib2
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
import argparse
flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Drive API Python Quickstart'
def get_credentials():
home_FOLDER = os.path.expanduser('~')
credential_FOLDER = os.path.join(home_FOLDER, '.credentials')
if not os.path.exists(credential_FOLDER):
os.makeFOLDERs(credential_FOLDER)
credential_path = os.path.join(credential_FOLDER,
'drive-python-quickstart.json')
store = Storage(credential_path)
credentials = store.get()
if not credentials or credentials.invalid:
flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
flow.user_agent = APPLICATION_NAME
credentials = tools.run_flow(flow, store, flags)
print('Storing credentials to ' + credential_path)
return credentials
#class for GoogleDrive item such as file/folder
class Node():
class FileType(Enum):
FILE = 0
FOLDER = 1
def __init__(self, path, depth, file_type, file_id):
self.path = path
self.depth = depth
self.file_type = file_type
self.file_id = file_id
self.children = []
def print_children(self):
print('{0}{1} (ID: {2} file_type: {3})'.format(' '*self.depth, os.path.basename(self.path), self.file_id, self.file_type))
for child in self.children:
child.print_children()
def count_children(self):
num_files = 0; num_folders = 0
for child in self.children:
if child.file_type == Node.FileType.FILE:
num_files += 1
if child.file_type == Node.FileType.FOLDER:
num_folders += 1
a,b = child.count_children()
num_files += a; num_folders += b
return (num_files, num_folders)
#search child items using combined method of BFS-serach and batch-processing
@classmethod
def search(cls, start_node, drive_service):
MAX_API_CALLS_PER_BATCH = 100 #Google says this is the largest value we can use
MAX_PAGE_SIZE_PER_REQUEST = 10
#class for single request. We pack these requests, as many as possible (up to 100), into every batch.
class SingleRequest():
def __init__(self, node, page_token):
self.node = node
self.page_token = page_token
#callback function for batch proessing.
#In every batch process, the argument 'request_id' starts at 1 and increases by 1 for each callback.
#When a batch is completed and another batch starts, 'request_id' returns to 1.
def callback(request_id, response, exception):
parent_node, page_token = req_id_vs_single_request_correspondence_table[request_id] #specify parent node associated with this request, using correspondence table
print('{0}in callback, request_id: {1}'.format(' '*parent_node.depth, request_id))
print('{0}parent node: {1}'.format(' '*parent_node.depth, os.path.basename(parent_node.path)))
if exception:
if exception._get_reason() == 'User Rate Limit Exceeded' or exception._get_reason() == 'Rate Limit Exceeded':
print('Rate Limit Exceeded !!!')
queue.put(SingleRequest(node=parent_node, page_token=page_token)) #re-queue this failed request
print('{0}re-queueing: {1}'.format(' '*parent_node.depth, os.path.basename(parent_node.path)))
print('{0}-> queue size: {1}'.format(' '*parent_node.depth, queue.qsize()))
nonlocal reached_rate_limit
reached_rate_limit = True #tell event to batch maker
else:
print(exception)
raise Exception('Unknown Google Drive REST API Error !!!')
return
for file in response.get('files', []):
print('{0}Found file: {1} ({2}), mimeType: {3}'.format(' '*(parent_node.depth+1), file['name'], file['id'], file['mimeType']))
if file['mimeType'] == 'application/vnd.google-apps.folder':
file_type = Node.FileType.FOLDER
else:
file_type = Node.FileType.FILE
child = Node(path=parent_node.path+'/'+file['name'], depth=parent_node.depth+1, file_type=file_type, file_id=file['id'])
parent_node.children.append(child)
queue.put(SingleRequest(node=child, page_token=None)) #add child node to search queue
print('{0}queueing: {1}'.format(' '*child.depth, file['name']))
page_token = response.get('nextPageToken', None)
if page_token is not None: #When this response has next chunk, we must put the chunk to request so that it will be handled immediately after currently queued requests.
print('{0}next page: {1}'.format(' '*parent_node.depth, page_token))
queue.put(SingleRequest(node=parent_node, page_token=page_token))
print('{0}queueing: {1}'.format(' '*parent_node.depth, os.path.basename(parent_node.path)))
print('{0}-> queue size: {1}'.format(' '*(parent_node.depth+1), queue.qsize()))
start_node.children.clear()
#initialize request queue
queue = Queue()
queue.put(SingleRequest(node=start_node, page_token=None))
reached_rate_limit = False
print('{0}queueing: {1}'.format(' '*start_node.depth, os.path.basename(start_node.path)))
while not queue.empty():
#---------- make batch
if reached_rate_limit == True: #When we reached '(User) Rate Limit Exceeded', we have to wait for a seconds, otherwise next batch will fail.
time.sleep(2)
reached_rate_limit = False
batch = drive_service.new_batch_http_request(callback=callback)
batch_size = 0
#correspondence table for 'request_id(str)'(an argument for callback function) and associated (node, page_token) pair.
#This is a dict {request_id: (node, page_token)}.
#Sometimes we may encounter '(User) Rate Limit Exceeded' exception because there may be too many items to handle at once, and need to re-queue the failed request.
#In this time we need (node, page_token) pair information.
#Since it is difficult to specify, in the callback function, the (node, page_token) pair associated with the request,
#we make correspondence table, and refer it in callback function.
req_id_vs_single_request_correspondence_table = {}
while (not queue.empty()) and (batch_size < MAX_API_CALLS_PER_BATCH): #pack requests as many as possible
single_request = queue.get()
node = single_request.node
print('{0}pop {1} from queue and adding to batch'.format(' '*node.depth, os.path.basename(node.path)))
if node.file_type == Node.FileType.FILE:
continue
batch.add(
drive_service.files().list(
q="'%s' in parents and trashed=false" % node.file_id,
corpus='user', includeTeamDriveItems=False, orderBy='name', pageSize=100, spaces='drive',
fields='nextPageToken, files(id, name, mimeType)', supportsTeamDrives=False,
pageToken=single_request.page_token
)
)
batch_size += 1
req_id_vs_single_request_correspondence_table[str(batch_size)] = (node, single_request.page_token)
#----------
print('{0}-> queue size: {1}'.format(' '*node.depth, queue.qsize()))
print('executing batch')
batch.execute()
print('----------\n')
def main():
credentials = get_credentials()
http = credentials.authorize(httplib2.Http())
service = discovery.build('drive', 'v3', http=http)
start_time = time.time()
root = Node(path='root', depth=0, file_type=Node.FileType.FOLDER, file_id='root')
root.search(root, service)
print('tree structure:\n')
root.print_children()
print('\nThere are {0} files and {1} folders.'.format(*root.count_children()))
print('Search took {0} [sec].'.format(time.time()-start_time))
if __name__ == '__main__':
main()
"""
test of search-parent method
"""
from enum import Enum
import argparse
import time
import logging
import os
import httplib2
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
#logging settings
logger = logging.getLogger(__name__); logger.setLevel(logging.DEBUG) #output DEBUG or higher level messages
fmt = logging.Formatter('%(asctime)s - %(name)s - %(threadName)s - %(levelname)s: %(message)s')
log_sh = logging.StreamHandler();\
log_sh.setLevel(logging.DEBUG);\
log_sh.setFormatter(fmt)
log_fh = logging.FileHandler('debug.log');\
log_fh.setLevel(logging.DEBUG);\
log_fh.setFormatter(fmt)
log_efh = logging.FileHandler('error.log');\
log_efh.setLevel(logging.ERROR);\
log_efh.setFormatter(fmt)
logger.addHandler(log_sh); logger.addHandler(log_fh); logger.addHandler(log_efh)
def get_credentials():
home_FOLDER = os.path.expanduser('~')
credential_folder = os.path.join(home_FOLDER, '.credentials')
if not os.path.exists(credential_folder):
os.makeFOLDERs(credential_folder)
credential_path = os.path.join(credential_folder, 'drive-python-quickstart.json')
store = Storage(credential_path)
credentials = store.get()
if not credentials or credentials.invalid:
flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
flow.user_agent = APPLICATION_NAME
flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
credentials = tools.run_flow(flow, store, flags)
print('Storing credentials to ' + credential_path)
return credentials
class FileType(Enum):
"""
file type
"""
FILE = 0
FOLDER = 1
class Node():
"""
class for Google Drive item
"""
def __init__(self, path, basename, depth, file_type, file_id):
self.path = path
self.basename = basename
self.depth = depth
self.file_type = file_type
self.file_id = file_id
self.children = []
def print_children(self):
"""
print all children
"""
print('{0}{1} (ID: {2} file_type: {3})'.format(' '*self.depth, os.path.basename(self.path), self.file_id, self.file_type))
for child in self.children:
child.print_children()
def count_children(self):
"""
count all children
"""
num_files = 0; num_folders = 0
for child in self.children:
if child.file_type == FileType.FILE:
num_files += 1
if child.file_type == FileType.FOLDER:
num_folders += 1
a,b = child.count_children()
num_files += a; num_folders += b
return (num_files, num_folders)
def complement_children_path_depth(self):
"""
generate children's path and depth information from basename
"""
for child in self.children:
child.path = '{0}/{1}'.format(self.path, child.basename)
child.depth = self.depth+1
child.complement_children_path_depth()
def get_whole_tree(drive_service):
"""
get whole Google Drive item tree
"""
MAX_PAGE_SIZE_PER_REQUEST = 100
email_address = drive_service.about().get(fields='user(emailAddress)').execute().get('user')['emailAddress']
root_id = drive_service.files().get(fileId='root', supportsTeamDrives=False, fields='id', ).execute().get('id')
root = Node(path='root', basename='root', depth=0, file_type=FileType.FOLDER, file_id=root_id)
#get all items
nodes = {root_id: (root, None)}
page_token = None
while True:
response = drive_service.files().list(corpus='user', includeTeamDriveItems=False, orderBy='name', pageSize=MAX_PAGE_SIZE_PER_REQUEST, pageToken=page_token, q="trashed=false and '{0}' in owners".format(email_address), spaces='drive', supportsTeamDrives=False, fields="nextPageToken, files(id, name, mimeType, parents)").execute()
items = response.get('files', [])
for item in items:
file_name = item['name']
file_id = item['id']
parent_id = item['parents'][0]
if item['mimeType'] == 'application/vnd.google-apps.folder':
file_type = FileType.FOLDER
else:
file_type = FileType.FILE
node = Node(path=None, basename=file_name, depth=None, file_type=file_type, file_id=file_id)
nodes[file_id] = (node, parent_id)
#logger.debug('file_name: {0}, file_id: {1}, parent_id: {2}'.format(file_name, file_id, parent_id))
page_token = response.get('nextPageToken', None)
if page_token is None:
break;
#connect to parent
for file_id, (node, parent_id) in nodes.items():
if parent_id is None: #root node
continue
nodes[parent_id][0].children.append(node)
root.complement_children_path_depth()
return root
if __name__ == '__main__':
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Drive API Python Quickstart'
credentials = get_credentials()
http = credentials.authorize(httplib2.Http())
service = discovery.build('drive', 'v3', http=http)
start_time = time.time()
root = get_whole_tree(drive_service=service)
root.print_children()
print('\nThere are {0} files and {1} folders.'.format(*root.count_children()))
print('Search took {0} [sec].'.format(time.time()-start_time))
@motchy869
Copy link
Author

motchy869 commented Nov 1, 2017

Recently, I needed to get whole item tree in Google Drive automatically. I found Python3 can handle Google Drive REST v3 API. I tried DFS-serarch firstly. Of course it worked, but very slow because each 'list' request makes its own HTTP connection. Then I tried BFS-search and extend it to use batch processing. It worked much faster than no-batch processing.

I tested the DFS-search and BFS+batch, and measured time consumption. no-batch-BFS-serach had similar result to DFS-search. I run these 2 codes in my Google Drive which has 1754 files and 1028 folders. DFS method took 409 sec. BFS+batch method took only 64 sec. BFS+batch method saves muth time.

@motchy869
Copy link
Author

motchy869 commented Nov 4, 2017

Next day I came up with more efficient way.

1.Get all nodes (include information: name, id, mimeType, parents) using files().get() method.
2.Refer each item's parent attribute, then connect to parent correctly.
3.Now you have a Spanning-Tree you wanted.

Applying this way (4_search-parent.py), It took only 17.3 sec to specify whole tree in my Google Drive which has 1754 files and 1028 folders (same condition as my previous comment).

@helpse
Copy link

helpse commented Dec 28, 2019

Hey motchy. What would you think about starting the frontend interface using React?

@motchy869
Copy link
Author

Hi

Thank you for comment. I'm so busy in these days, but I'll try it later.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment