bolaurent/migrate-google-docs.py

## migrate-google-docs.py
#!/usr/local/bin/python3

# Enable 'Create Audit Fields' [https://help.salesforce.com/articleView?id=000334139&type=1&mode=1]
# Create a custom field Original_Record_Id__c on ContentVersion (you must be in Lightning to do this)

# Find the ID of your ExternalDataSource for Google Drive and update EXTERNAL_SOURCE_ID

# Execute this query and save result to GoogleDocs.csv.
# SELECT
# Id,Name,Url,OwnerId,ParentId,CreatedById,CreatedDate,LastModifiedById,LastModifiedDate
# FROM GoogleDoc
# ORDER BY Id

# Then run this script; it will consume GoogleDocs.csv and write ContentVersion.csv.
# Check ContentVersion.csv; are there any rows with no ExternalDocumentInfo2?
# That is the Id in Google docs; it means that you probably need to add some
# regex patterns.
#
# Check the length of ExternalDocumentInfo2 (in the column named Length).
# If there are some rows that are extra long or short, check whether the
# GoogleDoc actually works. Salesforce did not impose any validation rules
# or sanity checks on the urls that users could save there.

# Some of the old format urls fail when migrated to ContentVersion;
# We put the old GoogleDoc record Id and the url into the Description,
# which might help people find their way.


import csv
import re

EXTERNAL_SOURCE_ID = '0XC3z000000GmbvGAC'

patterns = [
    '.*/d/(.+)/.*',
    '.+/d/(.+)',
    '.*key=(.+)#.*',
    '.+folders/(.+)',
    '.+\?docid=(.+)',
    '.+/document/d/(.+)',
    '.+&key=(.+)',
    '.+\?key=(.+)',
    '.+/folderview\?id=(.+)&.+',
    '.+/open\?id=(.+)',
    '.+&id=(.+)'
]


cvsFieldNames = [
    'ContentLocation',
    'Origin',
    'OwnerId',
    'Title',
    'PathOnClient',
    'ExternalDataSourceId',
    'FirstPublishLocationId',
    'Original_Record_Id__c',
    'CreatedById',
    'CreatedDate',
    'LastModifiedById',
    'LastModifiedDate',
    'Description',
    'ExternalDocumentInfo1',
    'ExternalDocumentInfo2',
    'Pattern',
    'Length'
]


def main():
    patternsByName = {item:re.compile(item) for item in patterns}


    with open('ContentVersion.csv', 'w', newline='') as csvOutfile:
        writer = csv.DictWriter(csvOutfile, fieldnames=cvsFieldNames)
        writer.writeheader()

        with open('GoogleDocs.csv', newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                url = row['Url']
                gdocid = ''
                matchedpattern = ''

                for patternname, pattern in patternsByName.items():
                    m = pattern.match(url)
                    if m:
                        gdocid = m.group(1)
                        matchedpattern = patternname
                        break

                writer.writerow({
                    'ContentLocation' : 'E',
                    'Origin' : 'H',
                    'OwnerId' : row['OwnerId'],
                    'Title' : row['Name'],
                    'PathOnClient' : row['Name'],
                    'ExternalDataSourceId' : EXTERNAL_SOURCE_ID,
                    'FirstPublishLocationId' : row['ParentId'],
                    'Original_Record_Id__c' : row['Id'],
                    'CreatedById' : row['CreatedById'],
                    'CreatedDate' : row['CreatedDate'],
                    'LastModifiedById' : row['LastModifiedById'],
                    'LastModifiedDate' : row['LastModifiedDate'],
                    'ExternalDocumentInfo1' : row['Url'],
                    'ExternalDocumentInfo2' : gdocid,
                    'Pattern' : matchedpattern,
                    'Length' : len(gdocid) if gdocid is not None else 0,
                    'Description' : 'Migrated from GoogleDoc Id {}, url {}'.format(row['Id'], row['Url']) +
                        "\n If the 'Open' link does not work, try pasting this link into your browser."
                })

if __name__ == "__main__":
    main()
    pass
	#!/usr/local/bin/python3

	# Enable 'Create Audit Fields' [https://help.salesforce.com/articleView?id=000334139&type=1&mode=1]
	# Create a custom field Original_Record_Id__c on ContentVersion (you must be in Lightning to do this)

	# Find the ID of your ExternalDataSource for Google Drive and update EXTERNAL_SOURCE_ID

	# Execute this query and save result to GoogleDocs.csv.
	# SELECT
	# Id,Name,Url,OwnerId,ParentId,CreatedById,CreatedDate,LastModifiedById,LastModifiedDate
	# FROM GoogleDoc
	# ORDER BY Id

	# Then run this script; it will consume GoogleDocs.csv and write ContentVersion.csv.
	# Check ContentVersion.csv; are there any rows with no ExternalDocumentInfo2?
	# That is the Id in Google docs; it means that you probably need to add some
	# regex patterns.
	#
	# Check the length of ExternalDocumentInfo2 (in the column named Length).
	# If there are some rows that are extra long or short, check whether the
	# GoogleDoc actually works. Salesforce did not impose any validation rules
	# or sanity checks on the urls that users could save there.

	# Some of the old format urls fail when migrated to ContentVersion;
	# We put the old GoogleDoc record Id and the url into the Description,
	# which might help people find their way.


	import csv
	import re

	EXTERNAL_SOURCE_ID = '0XC3z000000GmbvGAC'

	patterns = [
	'./d/(.+)/.',
	'.+/d/(.+)',
	'.key=(.+)#.',
	'.+folders/(.+)',
	'.+\?docid=(.+)',
	'.+/document/d/(.+)',
	'.+&key=(.+)',
	'.+\?key=(.+)',
	'.+/folderview\?id=(.+)&.+',
	'.+/open\?id=(.+)',
	'.+&id=(.+)'
	]


	cvsFieldNames = [
	'ContentLocation',
	'Origin',
	'OwnerId',
	'Title',
	'PathOnClient',
	'ExternalDataSourceId',
	'FirstPublishLocationId',
	'Original_Record_Id__c',
	'CreatedById',
	'CreatedDate',
	'LastModifiedById',
	'LastModifiedDate',
	'Description',
	'ExternalDocumentInfo1',
	'ExternalDocumentInfo2',
	'Pattern',
	'Length'
	]


	def main():
	patternsByName = {item:re.compile(item) for item in patterns}


	with open('ContentVersion.csv', 'w', newline='') as csvOutfile:
	writer = csv.DictWriter(csvOutfile, fieldnames=cvsFieldNames)
	writer.writeheader()

	with open('GoogleDocs.csv', newline='') as csvfile:
	reader = csv.DictReader(csvfile)
	for row in reader:
	url = row['Url']
	gdocid = ''
	matchedpattern = ''

	for patternname, pattern in patternsByName.items():
	m = pattern.match(url)
	if m:
	gdocid = m.group(1)
	matchedpattern = patternname
	break

	writer.writerow({
	'ContentLocation' : 'E',
	'Origin' : 'H',
	'OwnerId' : row['OwnerId'],
	'Title' : row['Name'],
	'PathOnClient' : row['Name'],
	'ExternalDataSourceId' : EXTERNAL_SOURCE_ID,
	'FirstPublishLocationId' : row['ParentId'],
	'Original_Record_Id__c' : row['Id'],
	'CreatedById' : row['CreatedById'],
	'CreatedDate' : row['CreatedDate'],
	'LastModifiedById' : row['LastModifiedById'],
	'LastModifiedDate' : row['LastModifiedDate'],
	'ExternalDocumentInfo1' : row['Url'],
	'ExternalDocumentInfo2' : gdocid,
	'Pattern' : matchedpattern,
	'Length' : len(gdocid) if gdocid is not None else 0,
	'Description' : 'Migrated from GoogleDoc Id {}, url {}'.format(row['Id'], row['Url']) +
	"\n If the 'Open' link does not work, try pasting this link into your browser."
	})

	if __name__ == "__main__":
	main()
	pass