# rlskoeser/genizabibliography_sources_ris2csv.py

## genizabibliography_sources_ris2csv.py
# pip install pandas rispy
import pandas as pd
import rispy

# download RIS file: https://www.repository.cam.ac.uk/handle/1810/256117

# parse RIS file into entries
with open('genizahbibliography20160203.txt') as bibfile:
    # NOTE(review): the file is decoded with the platform default encoding;
    # pass encoding=... explicitly if the RIS export is UTF-8 -- confirm.
    entries = rispy.load(bibfile)

print('Loaded %d entries' % len(entries))
# load as dataframe: one row per parsed entry
df = pd.DataFrame(data=entries)

# convert author list into string; an entry without an authors field shows
# up as NaN (not a list) in this column, so map it to an empty string
# instead of letting str.join raise TypeError
df['author_list'] = df.authors.apply(
    lambda x: '; '.join(x) if isinstance(x, list) else '')

# create copy dataframe, limited to fields consistent per source
source_df = df[['author_list', 'year', 'short_title', 'title', 'volume', 'place_published']].copy()

# drop duplicates to get the set of unique sources
uniq_sources = source_df.drop_duplicates().copy()
print('%d unique sources' % len(uniq_sources))

# get a list of shelfmarks from the original dataframe for each source
def shelfmarks_for_source(row):
    """Return the distinct shelfmarks cited for one source, joined by '; '.

    Selects every row of the module-level ``df`` whose ``author_list``,
    ``title`` and ``year`` match *row* and collects their ``label`` values
    in order of first appearance.

    :param row: object exposing ``author_list``, ``title`` and ``year``
        attributes (e.g. a row passed by ``DataFrame.apply(axis=1)``)
    :returns: the unique labels as one ``'; '``-separated string; empty
        when no row matches
    """
    # NOTE(review): only author/title/year are compared, although the
    # unique-source rows also differ by short_title, volume and
    # place_published -- confirm this coarser key is intended.
    mask = pd.Series(True, index=df.index, dtype=bool)
    for field in ('author_list', 'title', 'year'):
        value = getattr(row, field)
        column = df[field]
        # NaN never compares equal to itself, so a missing value has to be
        # matched with isna() or sources lacking e.g. a year find no rows
        mask &= column.isna() if pd.isna(value) else (column == value)
    # drop missing labels: str.join raises TypeError on a NaN item
    return '; '.join(df.loc[mask, 'label'].dropna().unique())

print('Aggregating shelfmarks ...')
# build the shelfmark string for each unique source, one row at a time
shelfmark_column = uniq_sources.apply(shelfmarks_for_source, axis=1)
uniq_sources['shelfmarks'] = shelfmark_column

# expose the joined author string under the name "authors", then write the
# result to csv without the dataframe index
export_df = uniq_sources.rename(columns={'author_list': 'authors'})
export_df.to_csv('genizahbibliography20160203_sources_shelfmarksv2.csv', index=False)
	# pip install pandas rispy
	import pandas as pd
	import rispy

	# download RIS file: https://www.repository.cam.ac.uk/handle/1810/256117

	# parse RIS file into entries
	with open('genizahbibliography20160203.txt') as bibfile:
	    # NOTE(review): the file is decoded with the platform default encoding;
	    # pass encoding=... explicitly if the RIS export is UTF-8 -- confirm.
	    entries = rispy.load(bibfile)

	print('Loaded %d entries' % len(entries))
	# load as dataframe: one row per parsed entry
	df = pd.DataFrame(data=entries)

	# convert author list into string; an entry without an authors field shows
	# up as NaN (not a list) in this column, so map it to an empty string
	# instead of letting str.join raise TypeError
	df['author_list'] = df.authors.apply(
	    lambda x: '; '.join(x) if isinstance(x, list) else '')

	# create copy dataframe, limited to fields consistent per source
	source_df = df[['author_list', 'year', 'short_title', 'title', 'volume', 'place_published']].copy()

	# drop duplicates to get the set of unique sources
	uniq_sources = source_df.drop_duplicates().copy()
	print('%d unique sources' % len(uniq_sources))

	# get a list of shelfmarks from the original dataframe for each source
	def shelfmarks_for_source(row):
	    """Return the distinct shelfmarks cited for one source, joined by '; '.

	    Selects every row of the module-level ``df`` whose ``author_list``,
	    ``title`` and ``year`` match *row* and collects their ``label`` values
	    in order of first appearance.

	    :param row: object exposing ``author_list``, ``title`` and ``year``
	        attributes (e.g. a row passed by ``DataFrame.apply(axis=1)``)
	    :returns: the unique labels as one ``'; '``-separated string; empty
	        when no row matches
	    """
	    # NOTE(review): only author/title/year are compared, although the
	    # unique-source rows also differ by short_title, volume and
	    # place_published -- confirm this coarser key is intended.
	    mask = pd.Series(True, index=df.index, dtype=bool)
	    for field in ('author_list', 'title', 'year'):
	        value = getattr(row, field)
	        column = df[field]
	        # NaN never compares equal to itself, so a missing value has to be
	        # matched with isna() or sources lacking e.g. a year find no rows
	        mask &= column.isna() if pd.isna(value) else (column == value)
	    # drop missing labels: str.join raises TypeError on a NaN item
	    return '; '.join(df.loc[mask, 'label'].dropna().unique())

	print('Aggregating shelfmarks ...')
	# build the shelfmark string for each unique source, one row at a time
	shelfmark_column = uniq_sources.apply(shelfmarks_for_source, axis=1)
	uniq_sources['shelfmarks'] = shelfmark_column

	# expose the joined author string under the name "authors", then write the
	# result to csv without the dataframe index
	export_df = uniq_sources.rename(columns={'author_list': 'authors'})
	export_df.to_csv('genizahbibliography20160203_sources_shelfmarksv2.csv', index=False)