Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Scripts for Python Programs to Start Digging Into JSTOR’s Early Journals
# PROGRAM NAME: journal_list.py
# Displays the title of every journal in the JSTOR Early Journal Content Data Bundle.
# Highlights those with "Psych" or "Philos" in the title (+ two general science journals).
# Displays the filename of the first article for each journal.
# Written by Christopher D Green
# January 2014
# Modified by (if you modify this program, put your name here.)
# Date Modified (if you modify this program, put the date of the modification here.)
#START OF PROGRAM
import os  # You need to import this in order to read the filenames from the disk below.

# Below is the directory path to where the original author keeps the files.
# You will need to change it to suit your own system.
dirname = '/Users/chriso/Desktop/JSTOR-journals/bundle/'

# A journal is highlighted when its title contains one of these fragments
# (or when the title is exactly "Science").
HIGHLIGHT_FRAGMENTS = ('Psych', 'Philos', 'Monist', 'Scientific Monthly')


def journal_title(contents):
    """Return the text between <journaltitle> and </journaltitle>.

    Returns None when the file has no complete journal-title element, i.e.
    when it does not come from a journal.
    """
    open_tag = '<journaltitle>'
    start = contents.find(open_tag)
    if start == -1:
        return None
    start += len(open_tag)
    end = contents.find('</journaltitle>', start)
    if end == -1:
        return None
    return contents[start:end]


def is_highlighted(title):
    """Return True for the journals in which we are particularly interested."""
    if title == 'Science':
        return True
    return any(fragment in title for fragment in HIGHLIGHT_FRAGMENTS)


def list_journals(directory=dirname):
    """Print every journal title found in *directory*, once per journal.

    Next to each title the name of the first file seen for that journal is
    shown, minus its first 8 characters (presumably a fixed filename prefix
    in the bundle -- confirm against your copy of the data).
    """
    # Remember every title already shown. os.listdir() returns names in
    # arbitrary order, so comparing against only the last few titles would
    # list a journal again whenever its files are not next to each other.
    seen_titles = set()
    # Sorting makes "the first article of each journal" repeatable.
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        # Skip hidden files such as .DS_Store (not text, so reading them
        # crashes) and anything that is not a regular file.
        if name.startswith('.') or not os.path.isfile(path):
            continue
        # "with" closes the file even if reading or decoding fails.
        with open(path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
        title = journal_title(contents)
        if title is None or title in seen_titles:
            continue
        seen_titles.add(title)
        if is_highlighted(title):
            print()
            print('***', title, name[8:])  # Highlight the title on the screen.
            print()
        else:
            print(title, name[8:])  # Display other journal titles too.


if __name__ == '__main__':
    list_journals()
#END OF PROGRAM
#PROGRAM NAME: move_psychphil.py (ver 1.0)
# Extracts article titles, author names, volume numbers, and page numbers
# for Psych, Philos, and some science journals in the JSTOR EARLY JOURNAL
# CONTENT DATA BUNDLE. Displays these on screen and writes a copy of each
# article to its own folder under a newly created file name.
# Also creates .csv lists of the articles from The Monist, The Journal of
# Philosophy, Psychology and Scientific Methods, and The Philosophical Review.
# Written by Christopher D Green
# January 2014
# Modified April 2014 to not move files, but just list their original file names in the Excel file.
# START PROGRAM
import os  # needed to navigate directory tree of resident harddisk

# Set from and to directories. Change these to suit your own system.
fromdirname = '/Users/chriso/Desktop/JSTOR-journals/bundle/'
todirname = '/Users/chriso/Desktop/JSTOR-journals/temp/'

# List of desired journals and the abbreviations to be used in filenames.
# Other interesting journals: The American Journal of Psychology, Science,
# The Scientific Monthly, The Journal of Speculative Philosophy
journals = [['The Journal of Philosophy, Psychology and Scientific Methods',
             'The Philosophical Review', 'The Monist'],
            ['JPPSM', 'PhilRev', 'Monist']]

# Column names of the .csv lists: one name for each field written per article.
CSV_HEADER = 'file,year,vol,page,author,title,outfile\n'


def tag_text(contents, tag, default=''):
    """Return the text between the first <tag> and the </tag> that follows it.

    *default* is returned when either tag is missing, so a file lacking the
    element never yields an arbitrary slice of its contents.
    """
    open_tag = '<' + tag + '>'
    start = contents.find(open_tag)
    if start == -1:
        return default
    start += len(open_tag)
    end = contents.find('</' + tag + '>', start)
    if end == -1:
        return default
    return contents[start:end]


def first_author(contents):
    """Return the first name listed inside <authors>, or '' if there is none.

    Ideally we would like to list all authors. Haven't done that here.
    """
    section = tag_text(contents, 'authors', None)
    if section is None:
        return ''
    marker = '<list-item>'
    start = section.find(marker)
    if start == -1:
        return ''
    start += len(marker)
    # The next < marks the </list-item> tag at the end of the 1st author's
    # name. Searching for the tags themselves (rather than a fixed character
    # offset) works whatever whitespace separates <authors> and <list-item>.
    end = section.find('<', start)
    if end == -1:
        end = len(section)
    return section[start:end].strip()


def tidy_case(word):
    """Turn an ALL-CAPS word into Title case; leave everything else alone.

    Single letters (initials such as "J.") and mixed-case words are kept.
    """
    letter_count = sum(1 for ch in word if ch.isalpha())
    if letter_count > 1 and word.isupper():
        return word.title()
    return word


def reversed_author(author):
    """Return *author* as LastnameFirstname, without spaces, dots or commas.

    Names shorter than 3 characters give 'NoAuthor'.
    This process still fails on names with Jr. at the end.
    These must be corrected by hand at present.
    """
    if len(author) < 3:
        return 'NoAuthor'
    # str methods return new strings, so the result has to be kept.
    tidied = ' '.join(tidy_case(word) for word in author.split(' '))
    # The last space separates the first name(s) from the last name.
    firstname, _, lastname = tidied.rpartition(' ')
    # A comma left in the name would add a column to the .csv row.
    return (lastname + firstname).translate(str.maketrans('', '', ' .,'))


def copy_articles(source_dir=fromdirname, target_dir=todirname):
    """Copy the full articles of the desired journals and list them in .csv files.

    Every file in *source_dir* that belongs to one of the journals in
    ``journals`` and whose <type> starts with 'fla' is written to *target_dir*
    as <abbreviation>.<volume>.<first page>.txt. One line per article is added
    to <abbreviation>.csv in the same directory.
    """
    abbreviations = dict(zip(journals[0], journals[1]))

    # Start each .csv list afresh with a header row. Otherwise the lists
    # would grow with duplicate rows every time the program is run.
    for abbreviation in journals[1]:
        list_path = os.path.join(target_dir, abbreviation + '.csv')
        with open(list_path, 'w', encoding='utf-8') as listfile:
            listfile.write(CSV_HEADER)

    for name in sorted(os.listdir(source_dir)):
        path = os.path.join(source_dir, name)
        # Skip hidden files such as .DS_Store and anything that is not a file.
        if name.startswith('.') or not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as infile:
            contents = infile.read()

        jtitle = tag_text(contents, 'journaltitle', None)
        if jtitle not in abbreviations:  # is it one of the journals I want?
            continue
        if not tag_text(contents, 'type').startswith('fla'):  # is it a full article?
            continue
        print(jtitle)

        # '0' stands in for a missing volume or page number so that the
        # output file name stays well formed.
        volnum = tag_text(contents, 'volume', '0')
        pnum = tag_text(contents, 'fpage', '0')
        year = tag_text(contents, 'year')[:4]  # first four characters of the year
        # Commas in the article title would break the .csv; replace with `
        atitle = tag_text(contents, 'title').replace(',', '`')
        print(atitle)

        author = first_author(contents)
        if author == 'P. C.':
            author = 'Paul Carus'
        revauthor = reversed_author(author)
        print(author)

        # assemble name of file to which article will be saved
        abbreviation = abbreviations[jtitle]
        outfilename = abbreviation + '.' + volnum + '.' + pnum + '.txt'
        print()
        # writes files to a new directory
        with open(os.path.join(target_dir, outfilename), 'w',
                  encoding='utf-8') as outfile:
            outfile.write(contents)

        # add the article to the .csv list of its journal
        row = [name, year, volnum, pnum, revauthor, atitle, outfilename]
        with open(os.path.join(target_dir, abbreviation + '.csv'), 'a',
                  encoding='utf-8') as listfile:
            listfile.write(','.join(row) + '\n')


if __name__ == '__main__':
    copy_articles()
# END PROGRAM
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment