Skip to content

Instantly share code, notes, and snippets.

@rlskoeser
Last active May 11, 2021 20:02
Show Gist options
  • Save rlskoeser/abdfa74c9f06c065b70dc147e7cd3ef6 to your computer and use it in GitHub Desktop.
Save rlskoeser/abdfa74c9f06c065b70dc147e7cd3ef6 to your computer and use it in GitHub Desktop.
Quick script to get the unique sources from the Cambridge Geniza bibliography.
# pip install pandas rispy
import pandas as pd
import rispy
# Download the RIS file first: https://www.repository.cam.ac.uk/handle/1810/256117
# Parse the RIS file into a list of entries (one dict per RIS record).
# NOTE(review): the encoding is pinned so the result does not depend on the
# platform's locale default; UTF-8 is assumed here -- confirm against the
# downloaded file and adjust if it was exported in another encoding.
with open('genizahbibliography20160203.txt', encoding='utf-8') as bibfile:
    entries = rispy.load(bibfile)
print('Loaded %d entries' % len(entries))

# Load the entries as a dataframe; keys that are absent from an entry
# become NaN in that entry's row.
df = pd.DataFrame(data=entries)

# Convert each author list into a single '; '-separated string.
# Entries without any author have NaN (a float) in this column, which
# str.join cannot iterate, so those rows get an empty string instead.
df['author_list'] = df.authors.apply(
    lambda names: '; '.join(names) if isinstance(names, list) else ''
)

# Create a copy of the dataframe, limited to the fields that are
# consistent per source.
source_fields = [
    'author_list',
    'year',
    'short_title',
    'title',
    'volume',
    'place_published',
]
source_df = df[source_fields].copy()

# Drop duplicate rows to get the set of unique sources.
uniq_sources = source_df.drop_duplicates().copy()
print('%d unique sources' % len(uniq_sources))
# get a list of shelfmarks from the original dataframe for each source
def shelfmarks_for_source(row, entries_df=None):
    """Return the unique labels (shelfmarks) of all entries matching a source.

    An entry matches when its ``author_list``, ``title`` and ``year`` all
    agree with ``row``. Missing values are treated as matching other missing
    values; a plain ``==`` comparison would never match them (NaN != NaN),
    which left sources without a year or title with no shelfmarks at all.

    NOTE(review): ``volume``, ``short_title`` and ``place_published`` are not
    part of the match, so sources that differ only in those fields receive
    the same shelfmark list -- confirm this is intended.

    :param row: a source row providing ``author_list``, ``title`` and ``year``.
    :param entries_df: dataframe of all entries, with those three columns
        plus ``label``; defaults to the module-level ``df``.
    :returns: the matching labels in order of first appearance, joined
        with ``'; '``; an empty string when nothing matches.
    """
    if entries_df is None:
        # Fall back to the module-level dataframe of all parsed entries.
        entries_df = df

    matches = pd.Series(True, index=entries_df.index)
    for field in ('author_list', 'title', 'year'):
        value = row[field]
        if pd.isna(value):
            # A missing value on the source only matches missing values.
            matches &= entries_df[field].isna()
        else:
            matches &= entries_df[field] == value

    # Skip entries that have no label; str.join would fail on NaN.
    labels = entries_df.loc[matches, 'label'].dropna().unique()
    return '; '.join(str(label) for label in labels)
print('Aggregating shelfmarks ...')
# Compute the shelfmark string for every unique source (one call per row),
# then attach the result as a new column.
shelfmark_column = uniq_sources.apply(shelfmarks_for_source, axis=1)
uniq_sources['shelfmarks'] = shelfmark_column

# Expose the joined author string under the name 'authors' and write the
# result to CSV without the dataframe index.
csv_ready = uniq_sources.rename(columns={'author_list': 'authors'})
csv_ready.to_csv(
    'genizahbibliography20160203_sources_shelfmarksv2.csv',
    index=False,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment