Skip to content

Instantly share code, notes, and snippets.

@jeffgerhard
Created May 27, 2016 16:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeffgerhard/e4938f61dcd400c8cb862f2c5dc8fa1f to your computer and use it in GitHub Desktop.
Save jeffgerhard/e4938f61dcd400c8cb862f2c5dc8fa1f to your computer and use it in GitHub Desktop.
prepare PDFs and matching MARC records for uploading to the Internet Archive
# ##################################################
#
# python3 program to prepare for Internet Archive upload of PDFs and MARC records
# this will do the following:
# 1. create a list of identifiers from a folder of PDFs;
# 2. make sure MARC records exist with the same identifiers (stopping on mismatches)
# 3. pull MARC records into the same directory and rename them to IA convention
# 4. generate a spreadsheet called upload.csv to use with IA command-line tool
#
# TO DO:
# - query database for volume information and also update database on completion
# - move to pure Python uploading to eliminate extra steps (no spreadsheet even needed),
# and enable file cleanup when done
# - generally clean up and make more robust (support XML for MARC, etc)
#
# ##################################################
import csv
import os
import shutil
from tkinter.filedialog import askdirectory
# Ask the user which folder holds the PDFs to upload. The rest of the script
# reads the PDFs from this folder and writes its output files into it.
dirname = askdirectory(
    title='Choose a directory of PDFs',
    initialdir='Z:\\holding\\books\\IA_upload\\',
)
# The dialog yields an empty value when it is cancelled; stop with a clear
# message here rather than failing later when the folder is listed.
if not dirname:
    raise SystemExit('No directory selected; nothing to do.')
def listPDFs(dirname):
    """Return the identifiers of the PDFs found directly inside *dirname*.

    An identifier is the file name with its ``.pdf`` extension removed.
    The match is case-sensitive, so a name ending in ``.PDF`` is ignored.

    Args:
        dirname: Path of the directory to scan (subfolders are not searched).

    Returns:
        A list of identifier strings, sorted so that anything generated
        from it comes out in the same order on every run.

    Raises:
        OSError: If *dirname* cannot be listed (propagated from os.listdir).
    """
    suffix = '.pdf'
    return sorted(
        name[:-len(suffix)]
        for name in os.listdir(dirname)
        if name.endswith(suffix)
    )
def checkmrcs(list, marcdir='Z:\\holding\\books\\MARC_records\\'):
    """Check that every identifier has a matching ``.mrc`` file.

    Prints one line per identifier that has no ``<identifier>.mrc`` file in
    *marcdir*, so all mismatches are reported in a single pass.

    Args:
        list: Iterable of identifier strings (file names without extension).
        marcdir: Directory holding the MARC records. Defaults to the shared
            MARC_records folder the script has always used.

    Returns:
        True when every identifier has a MARC record (also for an empty
        input), False when at least one is missing.
    """
    missing = [
        ident for ident in list
        if not os.path.isfile(os.path.join(marcdir, ident + '.mrc'))
    ]
    for ident in missing:
        print('No matching MARC record for ' + ident)
    return not missing
def prepareupload(list, destdir=None,
                  marcdir='Z:\\holding\\books\\MARC_records\\'):
    """Copy the MARC records next to the PDFs and write ``upload.csv``.

    For each identifier, ``<identifier>.mrc`` is copied from *marcdir* to
    ``<identifier>_meta.mrc`` in *destdir*, and two rows are written to
    ``upload.csv``: one for the PDF carrying the item metadata, and one for
    the MARC file with a blank identifier and blank metadata columns.

    Rows are produced with the csv module so that an identifier containing
    a comma or a quote is quoted instead of shifting the columns.

    Args:
        list: Iterable of identifier strings (file names without extension).
        destdir: Folder that holds the PDFs and receives the output files.
            When omitted, the module-level ``dirname`` is used.
        marcdir: Directory holding the source MARC records. Defaults to the
            shared MARC_records folder the script has always used.

    Raises:
        OSError: If a MARC record cannot be copied or the CSV cannot be
            written (propagated from shutil.copy / open).
    """
    if destdir is None:
        destdir = dirname  # folder chosen by the user at start-up
    description = 'Digitized at Georgetown University Law Library'
    collection = 'georgetown-university-law-library-rr'
    filename = os.path.join(destdir, 'upload.csv')
    # newline='' hands line endings to the csv writer; os.linesep keeps the
    # platform-native endings that plain text-mode writes used to produce.
    with open(filename, mode='w', encoding='utf-8', newline='') as fh:
        writer = csv.writer(fh, lineterminator=os.linesep)
        writer.writerow(['identifier', 'file', 'volume', 'description',
                         'collection', 'noindex', 'mediatype'])
        for ident in list:
            meta_name = ident + '_meta.mrc'
            shutil.copy(os.path.join(marcdir, ident + '.mrc'),
                        os.path.join(destdir, meta_name))
            # The volume column is left blank on purpose; it is filled in
            # by hand where needed before the upload is run.
            writer.writerow([ident, ident + '.pdf', '', description,
                             collection, 'true', 'texts'])
            writer.writerow(['', meta_name, '', '', '', '', ''])
# Driver: collect the identifiers, verify that every PDF has a MARC record,
# then write _filelist.txt and upload.csv into the chosen folder.
# The name "identifiers" is used instead of "list" so the builtin is not
# clobbered at module level.
identifiers = listPDFs(dirname)
if not identifiers:
    # An empty folder would otherwise pass the MARC check trivially and be
    # reported as ready to upload.
    print('No PDFs found in ' + dirname)
elif checkmrcs(identifiers):
    # One identifier per line, for loading into Access.
    accessfile = os.path.join(dirname, '_filelist.txt')
    with open(accessfile, mode='w') as fh:
        fh.writelines(ident + '\n' for ident in identifiers)
    prepareupload(identifiers)
    print('Ready to upload. Load _filelist.txt into Access and manually add any volume numbers to the spreadsheet before running uploader.\n\nWhen ready, the IA command-line tool should work (if installed) by running this command: ia upload --spreadsheet=upload.csv')
else:
    print('Fix all MARC record/PDF mismatches before continuing')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment