phette23/uptaxo.py

## uptaxo.py
#!/usr/bin/env python2
# ################################# #
# UploadTaxonomy.py                 #
#                                   #
# added CLI by Eric Phetteplace     #
# California College of the Arts    #
# vault.cca.edu | libraries.cca.edu #
# 2014-07-32                        #
#                                   #
# Pearson's notes below             #
# ################################# #


# UploadTaxonomy.py
# Author: Jim Kurian, Pearson
# Date: November 15, 2012
#
# This script populates a taxonomy in EQUELLA 4.1 and higher.
# It requires Python 2.7.
#
# USAGE:
# It requires a CSV file to be formatted as follows:
#
# term0\term00\term000
# term0\term00\term001
# term1
# term1\term10
# term1\term11
# etc...
#
# If keys are required then following format should be used:
#
# term0\term00\term000, key1,      value1, key2, value2, key3, value3
# term0\term00\term001, key1,      value1,     ,       ,     ,
# term1               , key1,      value1, key2, value2,     ,
# term1\term10        , LONG_DATA, value1, key2, value2,     ,
# term1\term11        , LONG_DATA, value1,     ,       ,     ,
# etc...
#
# Any number of key/value pairs can be added to each row however
# each row of the csv must have the same number of columns. Value
# fields with commas or quotes in them should be surrounded by
# double quotes. MS Excel does this for you automatically.
# Use the key "LONG_DATA" to add an HTML fragment to a term for
# display in the pop-up view of the term selector control.
# Set the EQUELLA username, password, taxonomy ID and CSV location
# with the command-line arguments (--un, --pw, --tid, --csv); the
# remaining options are the variables in the SETTINGS section below.
#
# NOTE:
# This script requires the deleteTerm() method to be implemented
# in equellaSoap.py module. This function is not present in the
# equellaSoap.py module in the integration pack that ships with
# EQUELLA 5.0 or earlier. Use the module provided with this
# script.
import csv
import codecs
import sys
import traceback
import time
import equellasoap
import argparse

# Command-line interface: EQUELLA credentials, target taxonomy and CSV path.
parser = argparse.ArgumentParser(description='Upload a taxonomy to VAULT.')
# --un is left optional; its value is handed straight to EquellaSoap.
parser.add_argument('--un', nargs='?', action='store', help='EQUELLA Username')
# The required options take exactly one value. With nargs='?' a bare
# "--pw" (no value) was accepted and silently parsed to None.
parser.add_argument('--pw', required=True, help='EQUELLA Password')
parser.add_argument('--tid', required=True, help='Taxonomy ID')
# The CSV path is opened unconditionally further down, so it is required
# here; otherwise open(None) fails only after the script has already
# logged in and force-unlocked the taxonomy.
parser.add_argument('--csv', required=True,
                    help='Path to taxonomy CSV file')
parser.add_argument('--clear', action='store_true',
                    help='Clear the taxonomy (delete all prior terms)')
parser.add_argument('--start', default=1, type=int,
                    help='Row of the CSV to start at, e.g. use 2 to skip header row')

args = parser.parse_args()

csvFileName = args.csv  # relative path to CSV file to load

# Pearson's code below except for calls to args

# ---- SETTINGS ---- #

# EQUELLA connection: the institution URL is fixed, the credentials are
# taken from the command line.
institutionUrl = 'https://vault.cca.edu'
username, password = args.un, args.pw

# Unicode encoding of the CSV file. Usually "utf-8" or "latin1" (Excel
# often saves non-ASCII CSV files as "latin1").
encoding = "utf-8"

# ID of the taxonomy to update (look it up in the Admin Console).
taxonID = args.tid
# True clears the existing taxonomy before uploading.
clearTaxonomy = args.clear
# First CSV row to process; 1 processes the entire file.
startRow = args.start

# Mode switches, all off by default.
unlockOnly = False    # only unlock the taxonomy (no uploading)
clearOnly = False     # only clear the taxonomy (no uploading)
retryOnError = False  # retry after an error (useful for poor connections)

# Optional EQUELLA parameters.
proxyUrl = None
useTokens = False
tokenUser, sharedSecretId, sharedSecretValue = (
    'tokenUsername', 'tokenSecretId', 'tokenSecretValue')

# --- END SETTINGS --- #


def unicode_csv_reader(utf8_data, encoding, dialect=csv.excel, **kwargs):
    """Yield each CSV row as a list of decoded text cells.

    Args:
        utf8_data: iterable of CSV lines, e.g. an open file object.
        encoding: codec name used to decode byte-string cells.
        dialect: csv dialect handed to csv.reader.
        **kwargs: extra formatting parameters forwarded to csv.reader.

    Yields:
        One list of cells per row produced by csv.reader. A leading UTF-8
        byte order mark is removed from the first cell of the first row.
    """
    csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
    firstRow = True
    for row in csv_reader:
        if firstRow:
            firstRow = False
            # A blank first line yields an empty row, so guard the index.
            if row:
                head = row[0]
                if isinstance(head, bytes):
                    # Strip the BOM while the cell is still a byte string so
                    # that it is decoded exactly once, below, like every
                    # other cell. Decoding it here and again below breaks
                    # on non-ASCII text.
                    if head.startswith(codecs.BOM_UTF8):
                        row[0] = head[len(codecs.BOM_UTF8):]
                elif head.startswith(u"\ufeff"):
                    # Already-decoded text carries the BOM as U+FEFF.
                    row[0] = head[1:]

        # Byte-string cells are decoded; cells that are already text pass
        # through unchanged.
        yield [cell.decode(encoding) if isinstance(cell, bytes) else cell
               for cell in row]


# Pre-flight checks: exercise the login, the taxonomy ID and the CSV path
# once before the upload loop starts editing anything.
if not institutionUrl.endswith("/"):
    institutionUrl += "/"
# test logging in and logging out of EQUELLA
equella = equellasoap.EquellaSoap(institutionUrl, username, password, proxyUrl)
equella.logout()
# test unlocking taxonomy
try:
    equella.unlockTaxonomy(taxonID, force=1)
except Exception:
    # Raising a plain string is itself a TypeError on Python 2.6+, which
    # hid this hint from the user; raise a real exception instead.
    raise Exception("Unable to unlock taxonomy, check taxonomy ID")
# test opening and closing CSV
csvFile = open(csvFileName, "rb")
csvFile.close()

# State shared across upload attempts: rownum counts CSV rows within the
# current pass, while lastAttemptedRow/lastAttemptedTerm record where a
# failed pass stopped so that a retry can resume from there.
rownum = 0
lastAttemptedRow = 0
lastAttemptedTerm = ""
incomplete = True
retrying = False  # never read below; kept so the module-level name survives
while incomplete:
    csvFile = None
    try:

        # instantiate an EquellaSoap object and lock the taxonomy for editing
        print("Logging into EQUELLA...")
        equella = equellasoap.EquellaSoap(institutionUrl, username, password, proxyUrl)
        equella.unlockTaxonomy(taxonID, force=1)
        if unlockOnly:
            print("Taxonomy successfully unlocked.")
            # Leave the loop directly rather than raising SystemExit and
            # recognising it by its message in the error handler.
            incomplete = False
            break

        print("Locking taxonomy for editing...")
        equella.lockTaxonomyForEditing(taxonID)

        # clear the taxonomy in EQUELLA
        if clearTaxonomy or clearOnly:
            print("Clearing taxonomy...")
            for rootTerm in equella.listTerms(taxonID, ""):
                equella.deleteTerm(taxonID, rootTerm)
            if clearOnly:
                print("Taxonomy successfully cleared.")
                incomplete = False
                break

        # read CSV file
        print("Importing taxonomy from CSV starting from row %s..." % (startRow))

        # first pass over the file: row count for the progress messages
        with open(csvFileName, "rb") as countFile:
            rowcount = sum(1 for _ in csv.reader(countFile, dialect=csv.excel))

        # second pass: the rows themselves, decoded to unicode
        csvFile = open(csvFileName, "rbU")
        reader = unicode_csv_reader(csvFile, encoding)

        # remove last term in case it was only partially completed
        if lastAttemptedRow > 0:
            try:
                if len(equella.listTerms(taxonID, lastAttemptedTerm)) == 0:
                    print("Deleting " + lastAttemptedTerm + " to reimport...")
                    equella.deleteTerm(taxonID, lastAttemptedTerm)
            except Exception:
                # Best effort only: the term may not exist at all if the
                # previous attempt failed before inserting it.
                pass

        # iterate through csv rows
        for row in reader:
            rownum += 1
            if rownum < startRow:
                continue

            lastAttemptedRow = rownum
            print("Processing row %s of %s" % (rownum, rowcount))

            # Rows with no content at all (blank lines, all-empty cells)
            # name no term; skip them instead of failing on row[0] or
            # trying to insert a nameless term.
            if not any(cell.strip() for cell in row):
                print("  skipping blank row")
                continue

            terms = row[0].split("\\")
            lastIndex = len(terms) - 1
            parentTerm = ""
            termsAdded = False
            # iterate through terms of path
            for i, rawTerm in enumerate(terms):
                term = rawTerm.strip()

                if parentTerm == "":
                    fullTermPath = term
                else:
                    fullTermPath = parentTerm + "\\" + term

                lastAttemptedTerm = fullTermPath

                # if term doesn't exist add it
                siblingTerms = equella.listTerms(taxonID, parentTerm)
                if term not in siblingTerms:

                    termsAdded = True
                    print("  adding " + fullTermPath)
                    equella.insertTerm(taxonID, parentTerm, term)
                    # Add data to term if it is a leaf node
                    if i == lastIndex:
                        for j in range(1, len(row), 2):
                            key = row[j].strip()
                            if key == "":
                                continue
                            # A ragged row may end on a key that has no
                            # value column; store an empty value rather
                            # than fail with IndexError.
                            value = row[j + 1].strip() if j + 1 < len(row) else ""
                            print("    adding data key: " + key)
                            equella.setTermData(taxonID, fullTermPath, key, value)

                # set current term as parent for next row
                # (i.e. term) in the CSV
                parentTerm = fullTermPath

            if not termsAdded:
                print("  no new terms to be added")

        incomplete = False

    except Exception:
        # Only ordinary errors are handled here, so Ctrl-C and interpreter
        # exit are no longer swallowed or retried.
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        error = ''.join(traceback.format_exception(exceptionType, exceptionValue, exceptionTraceback))
        if retryOnError:
            # retry
            print("Error occured: " + str(exceptionValue))
            print("  Last Attempted Term: " + lastAttemptedTerm)
            print("  Last Attempted Row: " + str(lastAttemptedRow))
            print("  Retrying...")
            time.sleep(2)

            clearTaxonomy = False
            # Never move the start row backwards: a failure before any row
            # was processed leaves lastAttemptedRow at 0, which would pull
            # previously skipped rows (e.g. a header) into the import.
            startRow = max(startRow, lastAttemptedRow)
            rownum = 0
        else:
            raise Exception(error)

    finally:
        # Close this attempt's CSV handle on every path, retries included.
        if csvFile is not None:
            csvFile.close()


# unlock the taxonomy and logout of EQUELLA (the CSV file is already closed)
equella.unlockTaxonomy(taxonID)
equella.logout()

print("-- PROCESSING COMPLETE --")
	#!/usr/bin/env python2
	# ################################# #
	# UploadTaxonomy.py #
	# #
	# added CLI by Eric Phetteplace #
	# California College of the Arts #
	# vault.cca.edu | libraries.cca.edu #
	# 2014-07-32 #
	# #
	# Pearson's notes below #
	# ################################# #


	# UploadTaxonomy.py
	# Author: Jim Kurian, Pearson
	# Date: November 15, 2012
	#
	# This script populates a taxonomy in EQUELLA 4.1 and higher.
	# It requires Python 2.7.
	#
	# USAGE:
	# It requires a CSV file to be formatted as follows:
	#
	# term0\term00\term000
	# term0\term00\term001
	# term1
	# term1\term10
	# term1\term11
	# etc...
	#
	# If keys are required then following format should be used:
	#
	# term0\term00\term000, key1, value1, key2, value2, key3, value3
	# term0\term00\term001, key1, value1, , , ,
	# term1 , key1, value1, key2, value2, ,
	# term1\term10 , LONG_DATA, value1, key2, value2, ,
	# term1\term11 , LONG_DATA, value1, , , ,
	# etc...
	#
	# Any number of key/value pairs can be added to each row however
	# each row of the csv must have the same number of columns. Value
	# fields with commas or quotes in them should be surrounded by
	# double quotes. MS Excel does this for you automatically.
	# Use the key "LONG_DATA" to add an HTML fragment to a term for
	# display in the pop-up view of the term selector control.
	# Set the EQUELLA username, password, taxonomy ID and CSV location
	# with the command-line arguments (--un, --pw, --tid, --csv); the
	# remaining options are the variables in the SETTINGS section below.
	#
	# NOTE:
	# This script requires the deleteTerm() method to be implemented
	# in equellaSoap.py module. This function is not present in the
	# equellaSoap.py module in the integration pack that ships with
	# EQUELLA 5.0 or earlier. Use the module provided with this
	# script.
	import csv
	import codecs
	import sys
	import traceback
	import time
	import equellasoap
	import argparse

	# Command-line interface: EQUELLA credentials, target taxonomy and CSV path.
	parser = argparse.ArgumentParser(description='Upload a taxonomy to VAULT.')
	# --un is left optional; its value is handed straight to EquellaSoap.
	parser.add_argument('--un', nargs='?', action='store', help='EQUELLA Username')
	# The required options take exactly one value. With nargs='?' a bare
	# "--pw" (no value) was accepted and silently parsed to None.
	parser.add_argument('--pw', required=True, help='EQUELLA Password')
	parser.add_argument('--tid', required=True, help='Taxonomy ID')
	# The CSV path is opened unconditionally further down, so it is required
	# here; otherwise open(None) fails only after the script has already
	# logged in and force-unlocked the taxonomy.
	parser.add_argument('--csv', required=True,
	                    help='Path to taxonomy CSV file')
	parser.add_argument('--clear', action='store_true',
	                    help='Clear the taxonomy (delete all prior terms)')
	parser.add_argument('--start', default=1, type=int,
	                    help='Row of the CSV to start at, e.g. use 2 to skip header row')

	args = parser.parse_args()

	csvFileName = args.csv  # relative path to CSV file to load

	# Pearson's code below except for calls to args

	# ---- SETTINGS ---- #

	# EQUELLA connection: the institution URL is fixed, the credentials are
	# taken from the command line.
	institutionUrl = 'https://vault.cca.edu'
	username, password = args.un, args.pw

	# Unicode encoding of the CSV file. Usually "utf-8" or "latin1" (Excel
	# often saves non-ASCII CSV files as "latin1").
	encoding = "utf-8"

	# ID of the taxonomy to update (look it up in the Admin Console).
	taxonID = args.tid
	# True clears the existing taxonomy before uploading.
	clearTaxonomy = args.clear
	# First CSV row to process; 1 processes the entire file.
	startRow = args.start

	# Mode switches, all off by default.
	unlockOnly = False    # only unlock the taxonomy (no uploading)
	clearOnly = False     # only clear the taxonomy (no uploading)
	retryOnError = False  # retry after an error (useful for poor connections)

	# Optional EQUELLA parameters.
	proxyUrl = None
	useTokens = False
	tokenUser, sharedSecretId, sharedSecretValue = (
	    'tokenUsername', 'tokenSecretId', 'tokenSecretValue')

	# --- END SETTINGS --- #


	def unicode_csv_reader(utf8_data, encoding, dialect=csv.excel, **kwargs):
	    """Yield each CSV row as a list of decoded text cells.

	    Args:
	        utf8_data: iterable of CSV lines, e.g. an open file object.
	        encoding: codec name used to decode byte-string cells.
	        dialect: csv dialect handed to csv.reader.
	        **kwargs: extra formatting parameters forwarded to csv.reader.

	    Yields:
	        One list of cells per row produced by csv.reader. A leading UTF-8
	        byte order mark is removed from the first cell of the first row.
	    """
	    csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
	    firstRow = True
	    for row in csv_reader:
	        if firstRow:
	            firstRow = False
	            # A blank first line yields an empty row, so guard the index.
	            if row:
	                head = row[0]
	                if isinstance(head, bytes):
	                    # Strip the BOM while the cell is still a byte string so
	                    # that it is decoded exactly once, below, like every
	                    # other cell. Decoding it here and again below breaks
	                    # on non-ASCII text.
	                    if head.startswith(codecs.BOM_UTF8):
	                        row[0] = head[len(codecs.BOM_UTF8):]
	                elif head.startswith(u"\ufeff"):
	                    # Already-decoded text carries the BOM as U+FEFF.
	                    row[0] = head[1:]

	        # Byte-string cells are decoded; cells that are already text pass
	        # through unchanged.
	        yield [cell.decode(encoding) if isinstance(cell, bytes) else cell
	               for cell in row]


	# Pre-flight checks: exercise the login, the taxonomy ID and the CSV path
	# once before the upload loop starts editing anything.
	if not institutionUrl.endswith("/"):
	    institutionUrl += "/"
	# test logging in and logging out of EQUELLA
	equella = equellasoap.EquellaSoap(institutionUrl, username, password, proxyUrl)
	equella.logout()
	# test unlocking taxonomy
	try:
	    equella.unlockTaxonomy(taxonID, force=1)
	except Exception:
	    # Raising a plain string is itself a TypeError on Python 2.6+, which
	    # hid this hint from the user; raise a real exception instead.
	    raise Exception("Unable to unlock taxonomy, check taxonomy ID")
	# test opening and closing CSV
	csvFile = open(csvFileName, "rb")
	csvFile.close()

	# State shared across upload attempts: rownum counts CSV rows within the
	# current pass, while lastAttemptedRow/lastAttemptedTerm record where a
	# failed pass stopped so that a retry can resume from there.
	rownum = 0
	lastAttemptedRow = 0
	lastAttemptedTerm = ""
	incomplete = True
	retrying = False  # never read below; kept so the module-level name survives
	while incomplete:
	    csvFile = None
	    try:

	        # instantiate an EquellaSoap object and lock the taxonomy for editing
	        print("Logging into EQUELLA...")
	        equella = equellasoap.EquellaSoap(institutionUrl, username, password, proxyUrl)
	        equella.unlockTaxonomy(taxonID, force=1)
	        if unlockOnly:
	            print("Taxonomy successfully unlocked.")
	            # Leave the loop directly rather than raising SystemExit and
	            # recognising it by its message in the error handler.
	            incomplete = False
	            break

	        print("Locking taxonomy for editing...")
	        equella.lockTaxonomyForEditing(taxonID)

	        # clear the taxonomy in EQUELLA
	        if clearTaxonomy or clearOnly:
	            print("Clearing taxonomy...")
	            for rootTerm in equella.listTerms(taxonID, ""):
	                equella.deleteTerm(taxonID, rootTerm)
	            if clearOnly:
	                print("Taxonomy successfully cleared.")
	                incomplete = False
	                break

	        # read CSV file
	        print("Importing taxonomy from CSV starting from row %s..." % (startRow))

	        # first pass over the file: row count for the progress messages
	        with open(csvFileName, "rb") as countFile:
	            rowcount = sum(1 for _ in csv.reader(countFile, dialect=csv.excel))

	        # second pass: the rows themselves, decoded to unicode
	        csvFile = open(csvFileName, "rbU")
	        reader = unicode_csv_reader(csvFile, encoding)

	        # remove last term in case it was only partially completed
	        if lastAttemptedRow > 0:
	            try:
	                if len(equella.listTerms(taxonID, lastAttemptedTerm)) == 0:
	                    print("Deleting " + lastAttemptedTerm + " to reimport...")
	                    equella.deleteTerm(taxonID, lastAttemptedTerm)
	            except Exception:
	                # Best effort only: the term may not exist at all if the
	                # previous attempt failed before inserting it.
	                pass

	        # iterate through csv rows
	        for row in reader:
	            rownum += 1
	            if rownum < startRow:
	                continue

	            lastAttemptedRow = rownum
	            print("Processing row %s of %s" % (rownum, rowcount))

	            # Rows with no content at all (blank lines, all-empty cells)
	            # name no term; skip them instead of failing on row[0] or
	            # trying to insert a nameless term.
	            if not any(cell.strip() for cell in row):
	                print(" skipping blank row")
	                continue

	            terms = row[0].split("\\")
	            lastIndex = len(terms) - 1
	            parentTerm = ""
	            termsAdded = False
	            # iterate through terms of path
	            for i, rawTerm in enumerate(terms):
	                term = rawTerm.strip()

	                if parentTerm == "":
	                    fullTermPath = term
	                else:
	                    fullTermPath = parentTerm + "\\" + term

	                lastAttemptedTerm = fullTermPath

	                # if term doesn't exist add it
	                siblingTerms = equella.listTerms(taxonID, parentTerm)
	                if term not in siblingTerms:

	                    termsAdded = True
	                    print(" adding " + fullTermPath)
	                    equella.insertTerm(taxonID, parentTerm, term)
	                    # Add data to term if it is a leaf node
	                    if i == lastIndex:
	                        for j in range(1, len(row), 2):
	                            key = row[j].strip()
	                            if key == "":
	                                continue
	                            # A ragged row may end on a key that has no
	                            # value column; store an empty value rather
	                            # than fail with IndexError.
	                            value = row[j + 1].strip() if j + 1 < len(row) else ""
	                            print(" adding data key: " + key)
	                            equella.setTermData(taxonID, fullTermPath, key, value)

	                # set current term as parent for next row
	                # (i.e. term) in the CSV
	                parentTerm = fullTermPath

	            if not termsAdded:
	                print(" no new terms to be added")

	        incomplete = False

	    except Exception:
	        # Only ordinary errors are handled here, so Ctrl-C and interpreter
	        # exit are no longer swallowed or retried.
	        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
	        error = ''.join(traceback.format_exception(exceptionType, exceptionValue, exceptionTraceback))
	        if retryOnError:
	            # retry
	            print("Error occured: " + str(exceptionValue))
	            print(" Last Attempted Term: " + lastAttemptedTerm)
	            print(" Last Attempted Row: " + str(lastAttemptedRow))
	            print(" Retrying...")
	            time.sleep(2)

	            clearTaxonomy = False
	            # Never move the start row backwards: a failure before any row
	            # was processed leaves lastAttemptedRow at 0, which would pull
	            # previously skipped rows (e.g. a header) into the import.
	            startRow = max(startRow, lastAttemptedRow)
	            rownum = 0
	        else:
	            raise Exception(error)

	    finally:
	        # Close this attempt's CSV handle on every path, retries included.
	        if csvFile is not None:
	            csvFile.close()


	# unlock the taxonomy and logout of EQUELLA (the CSV file is already closed)
	equella.unlockTaxonomy(taxonID)
	equella.logout()

	print("-- PROCESSING COMPLETE --")