Skip to content

Instantly share code, notes, and snippets.

@tshrinivasan
Created October 10, 2023 19:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tshrinivasan/e5844e69b42222f71ae56309afe261f2 to your computer and use it in GitHub Desktop.
Save tshrinivasan/e5844e69b42222f71ae56309afe261f2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import json
# In[2]:
# Read the JSON Lines file "tamil.only": with lines=True every input line is
# parsed as one JSON object.
jsonl_metadata = pd.read_json("tamil.only", lines=True)
df = pd.DataFrame(data=jsonl_metadata)
# In[3]:
# Notebook preview of the first two rows (shows nothing when run as a script).
df.head(2)
# In[4]:
# Build a second frame from the values of the "metadata" column
# (presumably one dict per row -- confirm against the input file).
metadata_records = df["metadata"].to_list()
df2 = pd.DataFrame(data=metadata_records)
# In[5]:
df2.head(2)
# In[6]:
# We are interested in the actual metadata records held in the "record"
# column; the other (administrative) metadata columns are ignored.
metadata_records = df2["record"].to_list()
df3 = pd.DataFrame(data=metadata_records)
# In[7]:
df3.head(2)
# In[8]:
# Write the metadata records to CSV, leaving out the row index.
df3.to_csv("tamil_worldcat_metadata.csv", index=False)
# In[10]:
## Basic normalization of publication date
# Placeholder year used whenever a publication date is missing or cannot be
# reduced to a four-character year. Always stored as an int so the resulting
# list has a single type.
_UNKNOWN_YEAR = 9999

# Translation table that deletes square brackets, hyphens and the copyright
# sign in one pass (e.g. "[1923]" -> "1923", "©1950" -> "1950").
_DATE_NOISE = str.maketrans("", "", "[]-©")


def _normalize_publication_date(raw_date):
    """Return the year in *raw_date* as an int, or ``_UNKNOWN_YEAR``.

    Square brackets, hyphens and the copyright sign are removed first.
    ``_UNKNOWN_YEAR`` is returned when *raw_date* is not a string (for
    example a missing value), when the cleaned text is not exactly four
    characters long, or when ``int`` cannot parse it.
    """
    if not isinstance(raw_date, str):
        return _UNKNOWN_YEAR
    cleaned = raw_date.translate(_DATE_NOISE)
    if len(cleaned) != 4:
        return _UNKNOWN_YEAR
    try:
        return int(cleaned)
    except ValueError:
        # Four characters, but not a number (e.g. "19uu").
        return _UNKNOWN_YEAR


# One normalized year per row of df3, in row order.
new_date_list = [
    _normalize_publication_date(current_date)
    for current_date in df3["publicationDate"]
]
# In[11]:
# Store the normalized years in a "date" column and convert it with
# pd.to_numeric so it can be sorted and compared as numbers.
df3["date"] = new_date_list
df3["date"] = pd.to_numeric(df3["date"])
# In[12]:
# Order the records by date, earliest first.
sorted_df = df3.sort_values("date")
# In[13]:
# Notebook preview of the first two rows (shows nothing when run as a script).
sorted_df.head(2)
# In[14]:
# Keep rows with 1900 <= date < 1951, then keep only the first row (in sorted
# order) for each oclcNumber.
in_range = (sorted_df["date"] >= 1900) & (sorted_df["date"] < 1951)
df4 = sorted_df[in_range]
df4 = df4.drop_duplicates(subset="oclcNumber")
# In[15]:
# Write the filtered, de-duplicated records to CSV, leaving out the row index.
df4.to_csv("tamil_metadata_1900_1950s.csv", index=False)
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment