Created
May 27, 2016 16:41
-
-
Save jeffgerhard/e4938f61dcd400c8cb862f2c5dc8fa1f to your computer and use it in GitHub Desktop.
prepare PDFs and matching MARC records for uploading to the Internet Archive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ##################################################
#
# python3 program to prepare for Internet Archive upload of PDFs and MARC records
# this will do the following:
# 1. create a list of identifiers from a folder of PDFs;
# 2. make sure MARC records exist with the same identifiers (stopping on mismatches)
# 3. pull MARC records into the same directory and rename them to IA convention
# 4. generate a spreadsheet called upload.csv to use with IA command-line tool
#
# TO DO:
# - query database for volume information and also update database on completion
# - move to pure Python uploading to eliminate extra steps (no spreadsheet even needed),
#   and enable file cleanup when done
# - generally clean up and make more robust (support XML for MARC, etc)
#
# ##################################################
import csv
import os
import shutil
from tkinter.filedialog import askdirectory
# Ask the user which folder of PDFs to process. The chosen path is kept in
# the module-level name ``dirname``, which the rest of the script reads.
dirname = askdirectory(
    title='Choose a directory of PDFs',
    initialdir='Z:\\holding\\books\\IA_upload\\',
)
if not dirname:
    # The dialog yields an empty value when it is cancelled; stop here with a
    # clear message instead of failing later when the folder is listed.
    raise SystemExit('No directory selected; nothing to do.')
def listPDFs(dirname):
    """Return the identifiers of the PDFs found directly inside *dirname*.

    An identifier is the entry name with its trailing ``.pdf`` removed.
    Only names ending in lowercase ``.pdf`` are considered, and
    subdirectories are not searched.

    Args:
        dirname: Path of the directory to scan.

    Returns:
        A sorted list of identifier strings (empty if nothing matches).
    """
    suffix = '.pdf'
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # generated file list and spreadsheet stable from run to run.
    return sorted(
        name[:-len(suffix)]
        for name in os.listdir(dirname)
        if name.endswith(suffix)
    )
def checkmrcs(list, marcdir='Z:\\holding\\books\\MARC_records\\'):
    """Check that every identifier has a matching MARC record on disk.

    Every identifier is checked (there is no early exit) so that all
    missing records are reported in a single run.

    Args:
        list: Iterable of identifiers (PDF names without ``.pdf``). The
            name shadows the builtin but is kept so that existing callers
            passing it by keyword keep working.
        marcdir: Directory expected to contain ``<identifier>.mrc`` files.
            Defaults to the shared MARC records folder.

    Returns:
        True if a ``.mrc`` file exists for every identifier (including
        when there are no identifiers at all), False otherwise.
    """
    missing = [
        ident for ident in list
        if not os.path.isfile(os.path.join(marcdir, ident + '.mrc'))
    ]
    for ident in missing:
        print('No matching MARC record for ' + ident)
    return not missing
def prepareupload(list, destdir=None,
                  marcdir='Z:\\holding\\books\\MARC_records\\'):
    """Copy MARC records next to the PDFs and write ``upload.csv``.

    For each identifier, ``<marcdir>/<identifier>.mrc`` is copied to
    ``<destdir>/<identifier>_meta.mrc`` and two rows are written to the
    spreadsheet: one describing the PDF, and one with an empty identifier
    that lists the copied MARC file.

    Args:
        list: Iterable of identifiers (PDF names without ``.pdf``). The
            name shadows the builtin but is kept for backward
            compatibility with existing callers.
        destdir: Directory that receives ``upload.csv`` and the copied
            MARC records. When omitted, the module-level ``dirname``
            chosen at start-up is used.
        marcdir: Directory containing the source ``.mrc`` files.

    Raises:
        OSError: If a MARC record cannot be copied or the spreadsheet
            cannot be written.
    """
    if destdir is None:
        # Keep the original behaviour of writing into the chosen folder.
        destdir = dirname
    csvpath = os.path.join(destdir, 'upload.csv')
    # newline='' lets the csv module control line endings itself, as the
    # csv documentation requires for files passed to csv.writer.
    with open(csvpath, mode='w', encoding='utf-8', newline='') as fh:
        # csv.writer quotes fields when needed, so identifiers containing
        # commas or quotes no longer corrupt the spreadsheet.
        writer = csv.writer(fh)
        writer.writerow([
            'identifier', 'file', 'volume', 'description',
            'collection', 'noindex', 'mediatype',
        ])
        for ident in list:
            shutil.copy(
                os.path.join(marcdir, ident + '.mrc'),
                os.path.join(destdir, ident + '_meta.mrc'),
            )
            writer.writerow([
                ident,
                ident + '.pdf',
                '',  # volume: left blank, to be filled in by hand
                'Digitized at Georgetown University Law Library',
                'georgetown-university-law-library-rr',
                'true',
                'texts',
            ])
            writer.writerow(['', ident + '_meta.mrc', '', '', '', '', ''])
# Main script: collect identifiers from the chosen folder, verify that each
# has a MARC record, then write the file list and the upload spreadsheet.
identifiers = listPDFs(dirname)  # not named ``list``: that would hide the builtin
if not identifiers:
    # Nothing to upload; avoid writing empty output files and claiming success.
    print('No PDFs found in ' + dirname)
elif checkmrcs(identifiers):
    accessfile = dirname + '\\_filelist.txt'
    with open(accessfile, mode='w') as fh:
        fh.writelines(ident + '\n' for ident in identifiers)
    prepareupload(identifiers)
    print('Ready to upload. Load _filelist.txt into Access and manually add any volume numbers to the spreadsheet before running uploader.\n\nWhen ready, the IA command-line tool should work (if installed) by running this command: ia upload --spreadsheet=upload.csv')
else:
    print('Fix all MARC record/PDF mismatches before continuing')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment