Recursively migrate all media assets from Google Drive to Kaltura using Bulk Upload CSV
"""
Google Drive Media Extractor for Kaltura Upload:
This script processes a specified Google Drive folder to identify media files
(audio, video, and images) and produces a CSV file suitable for bulk upload
to the Kaltura platform. It recursively traverses through all subfolders,
captures metadata about the media files, and appends them to the CSV. Media
files are determined based on their MIME type.
Author Metadata:
- Name: Zohar Babin
- Date: October 11, 2023
- Contact: @zoharbabin on GitHub / @zohar on X.com
- License: MIT
Setup:
1. Obtain a service account key from the Google Cloud Console:
   a. Go to the Google Cloud Console (https://console.cloud.google.com/).
   b. Navigate to IAM & Admin > Service accounts.
   c. Create a new service account or select an existing one.
   d. Under "Keys", add a new JSON key.
   e. Save the downloaded JSON file as 'credentials.json' in the script's directory.
   f. Ensure the service account has permissions for Drive API access and can
      read metadata from the desired Google Drive folder.
2. Install required packages via pip:
   - pandas
   - google-auth, google-auth-oauthlib, google-auth-httplib2
   - google-api-python-client
   - halo
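   For example, one possible combined install command (assuming a standard pip setup):
       pip install pandas google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client halo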

Usage:
Run the script from the command line, providing the Google Drive folder ID
and, optionally, a root category name for the Kaltura CSV:

    python script_name.py [folder_id] [root_category_name]

Note: Ensure 'credentials.json' is present in the working directory and grants
the necessary permissions.
"""
import argparse

import pandas as pd
from google.oauth2 import service_account
from googleapiclient.discovery import build
from halo import Halo


# Recursively list files and folders from a Google Drive folder and collect media files.
def list_files(folder_id, folder_name, spinner, folder_path=''):
    """
    Recursively list files and folders from a Google Drive folder.

    Parameters:
    - folder_id (str): The ID of the Google Drive folder.
    - folder_name (str): Name of the current folder.
    - spinner (halo.Halo): Spinner instance for console output.
    - folder_path (str): Path of the current folder, default is empty.

    Returns:
    None. The function appends directly to the global rows_list.
    """
    # Query to fetch files and folders inside the current folder. The Drive API
    # pages its results, so follow nextPageToken until every item is fetched.
    query = f"'{folder_id}' in parents"
    items = []
    page_token = None
    while True:
        results = drive_service.files().list(
            q=query,
            fields="nextPageToken, files(id, name, mimeType, owners, description, kind, fileExtension)",
            pageToken=page_token
        ).execute()
        items.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    for item in items:
        mime_type = item['mimeType']
        file_id = item['id']
        file_name = item['name']
        # If the item is a folder, recursively call the function
        if mime_type == 'application/vnd.google-apps.folder':
            separator = '>' if folder_path else ''  # Avoid adding the separator before the first folder
            new_folder_path = f'{folder_path}{separator}{file_name}'
            list_files(file_id, file_name, spinner, new_folder_path)
        # Check if the item is an image, video, or audio file
        elif mime_type.startswith(('image/', 'video/', 'audio/')):
            spinner.text = f"Processing {file_name} ({mime_type}) in folder {folder_path or '/'}"
            media_type = mime_type.split('/')[0]  # Extracts "audio", "video", or "image" from mime_type
            user_name = item['owners'][0]['displayName']
            user_email = item['owners'][0]['emailAddress']
            file_extension = item.get('fileExtension', '')  # Fetched for reference; not currently written to the CSV
            description = item.get('description', '')
            download_url = f'https://drive.google.com/uc?export=download&id={file_id}'
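            # Each dict below becomes one row of the Kaltura bulk upload CSV.
            # Columns left blank here (tags, schedule dates, thumbnailUrl,
            # partnerData, entitled users) can be filled in manually before
            # uploading. Note: the uc?export=download link is only fetchable
            # by Kaltura if the file's Drive sharing settings allow access
            # (e.g., "anyone with the link").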
            row_dict = {
                'title': file_name,
                'description': f'By {user_name} in {folder_name}. \n{description}',
                'tags': '',
                'url': download_url,
                'contentType': media_type.capitalize(),
                'category': folder_path,
                'scheduleStartDate': '',
                'scheduleEndDate': '',
                'thumbnailUrl': '',
                'partnerData': '',
                'creatorId': user_email,
                'entitledUsersEdit': '',
                'entitledUsersPublish': '',
                'ownerId': user_email
            }
            rows_list.append(row_dict)


def main(folder_id, root_category_name):
    """
    Main function to process files from a Google Drive folder and save them in a CSV file.

    Parameters:
    - folder_id (str): The ID of the Google Drive folder.
    - root_category_name (str): Root category name to be prepended to all categories in the CSV.

    Returns:
    None. Writes results to 'kaltura_upload.csv'.
    """
    with Halo(text='Processing files', spinner='dots') as spinner:
        list_files(folder_id, 'Root', spinner, root_category_name)
        spinner.succeed('Processing completed')
    df = pd.DataFrame(rows_list)
    df.to_csv('kaltura_upload.csv', index=False, encoding='utf-8-sig')


if __name__ == "__main__":
    # Argument parsing to get folder_id and root_category_name from the command line
    parser = argparse.ArgumentParser(description='Process a Google Drive folder to create a Kaltura bulk upload CSV.')
    parser.add_argument('folder_id', help='The ID of the Google Drive folder to process.')
    parser.add_argument('root_category_name', nargs='?', default='',
                        help='If provided, will be prepended to all categories as the root in the Kaltura CSV.')
    args = parser.parse_args()

    # Load the service account key for the Google Drive API
    creds = service_account.Credentials.from_service_account_file(
        './credentials.json', scopes=['https://www.googleapis.com/auth/drive.metadata.readonly'])

    # Initialize the Google Drive API client
    drive_service = build('drive', 'v3', credentials=creds)

    # Global list to store rows for the CSV
    rows_list = []

    main(args.folder_id, args.root_category_name)
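
# Example run (the folder ID below is hypothetical; copy yours from the Drive
# folder's URL, the segment after 'folders/'):
#   python script_name.py 1a2B3cDeFgHiJkLmNoPqRsTuVwXyZ MyMediaLibrary
# The resulting kaltura_upload.csv can then be submitted through Kaltura's
# bulk upload tools (e.g., in the KMC) to migrate the media.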