@saverkamp
Created February 7, 2013 17:53
Sample script to harvest metadata through the CONTENTdm v6 API and format it as CSV for upload into the ui-libraries fork of Omeka/Scripto. See ui-libraries/plugin-Scripto for documentation. Uses pycdm, a Python library for working with the CONTENTdm v6 API (saverkamp/pycdm). A short sketch of the pycdm objects the script relies on follows the script.
import codecs
import csv
import datetime
import pycdm
from HTMLParser import HTMLParser
#get input: alias + items to retrieve
alias = raw_input('collection alias: ')
items = raw_input('item identifiers (separate by commas): ')
ptrs = items.split(',')
#current date-time for output filenames
today = datetime.datetime.now().strftime('%Y-%m-%d--%H-%M')
#create file-level metadata csv file
fileOutput = alias + today + '_File.csv'
fFile = codecs.open(fileOutput, 'wb', encoding='utf_8')
wtrFile = csv.writer(fFile, delimiter=',')
#header row for file-level csv file
fileHeaderRow = ['filename', 'title', 'identifier', 'source', 'status', 'transcription', 'Omeka file order']
wtrFile.writerow(fileHeaderRow)
#create item-level metadata csv file
itemOutput = alias + today + '_Item.csv'
fItem = codecs.open(itemOutput, 'wb', encoding='utf_8')
wtrItem = csv.writer(fItem, delimiter=',')
#header row for item-level csv file
itemHeaderRow = ['title', 'identifier', 'source', 'ispartof', 'relation', 'audience', 'files']
wtrItem.writerow(itemHeaderRow)
#get data for each item
for ptr in ptrs:
    #call api for item metadata
    item = pycdm.item(alias, ptr, 'on')
    #set item-level metadata
    #create unique item id for use outside CDM
    itemID = alias + '_' + ptr
    source = item.refurl
    itemtitle = item.info['title']
    #digital collection url
    ispartof = item.collection.url
    #default sorting number, maps to dc:Audience in Omeka
    sort = '000000'
    #collection guide url (empty if the item has neither field)
    relation = ''
    if ('findin' in item.info):
        relation = item.info['findin']
    elif ('collea' in item.info):
        relation = item.info['collea']
    #list for file locations
    files = []
    #set counter for file order
    order = 1
    #set file-level metadata
    for page in item.pages:
        #create unique page id for use outside CDM
        fileID = itemID + '_' + page.id
        pagelabel = page.label
        pageRefURL = page.refurl
        #set transcription, if available
        #assumes you have a field for full text with nickname 'full' or 'fula'
        if (('full' in page.info) and page.info['full']):
            transcription = page.info['full'].encode('ascii', 'ignore')
            transcription = HTMLParser().unescape(transcription)
        elif (('fula' in page.info) and page.info['fula']):
            transcription = page.info['fula'].encode('ascii', 'ignore')
            transcription = HTMLParser().unescape(transcription)
        else:
            transcription = ''
        #set transcription status
        if (transcription == ''):
            status = 'Not Started'
        else:
            status = 'Needs Review'
        url = page.fileurl
        files.append(url)
        #write file metadata to file-level csv file
        filerow = [url, pagelabel, fileID, pageRefURL, status, transcription.encode('ascii', 'ignore'), order]
        wtrFile.writerow(filerow)
        order += 1
    #write item metadata to item-level csv file
    files = ','.join(files)
    itemrow = [itemtitle, itemID, source, ispartof, relation, sort, files]
    wtrItem.writerow(itemrow)
    print ptr
fItem.close()
fFile.close()
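For reference, here is a minimal sketch of the pycdm objects the script relies on, assuming pycdm is installed and pointed at your CONTENTdm instance. The collection alias and item pointer are hypothetical placeholders, and field nicknames (such as 'title', 'full', or 'fula') vary by collection.

#minimal pycdm sketch (hypothetical alias and item pointer)
import pycdm
item = pycdm.item('samplecoll', '20', 'on')  #alias, item pointer, page info flag
print item.info['title']      #descriptive metadata keyed by field nickname
print item.refurl             #reference URL for the item
print item.collection.url     #digital collection URL
for page in item.pages:       #one page object per file in the compound object
    print page.id, page.label, page.refurl, page.fileurl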