Skip to content

Instantly share code, notes, and snippets.

@rodrigomorales1
Last active March 5, 2022 17:59
Show Gist options
  • Save rodrigomorales1/da590935c1d80bed586901d895745000 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import csv
import json
import os.path
# This file contains the hymnary.org identifiers that we are
# interested in retrieving their data.
input_filename = 'input.txt'
# This variable will store all the data that will be saved in the file output.csv
hymns = []
# Read the identifiers from the input file, one per line.
# NOTE: the original bound this to the name 'file', shadowing a builtin and
# then rebinding the same name to the open file object; renamed for clarity.
with open(input_filename, mode='r') as infile:
    for line in infile:
        hymnary_id = line.strip()
        # Skip blank lines: an empty id would later produce a bogus URL
        # ('https://hymnary.org/tune/') and an empty '.html' filename.
        if hymnary_id:
            hymns.append({'hymnary_id': hymnary_id})
# Download the websites of the hymnaries as HTML files
for hymn in hymns:
    filename = hymn['hymnary_id'] + '.html'
    # Don't try to download if the HTML already exists
    if os.path.isfile(filename):
        continue
    url = 'https://hymnary.org/tune/' + hymn['hymnary_id']
    print('url: ', url)
    # A timeout keeps one stalled request from hanging the whole batch.
    r = requests.get(url, allow_redirects=True, timeout=30)
    print(r.status_code)
    if r.status_code == 200:
        with open(filename, 'wb') as html_file:
            html_file.write(r.content)
    else:
        # Create an empty placeholder so the failed download is not retried
        # on the next run.  BUG FIX: the original wrote the str '' to a file
        # opened in binary mode, which raises TypeError; an empty bytes
        # literal is required here.
        with open(filename, 'wb') as html_file:
            html_file.write(b'')
# Retrieve the data from the downloaded HTML files
for hymn in hymns:
    filename = hymn['hymnary_id'] + '.html'
    with open(filename, 'r') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'lxml')
    tune_info = soup.find('div', {'id': 'at_tuneinfo'})
    # Failed downloads leave an empty placeholder file (see the download
    # loop); the original crashed here with AttributeError because
    # find() returns None when the section is absent.  Skip such files.
    if tune_info is None:
        continue
    tune_info_data = tune_info.find_all('tr', {'class': 'result-row'})
    # A loop that will iterate through all the data that is stored in
    # the "Tune information" section of each tune.
    for row in tune_info_data:
        label = row.find('span', {'class': 'hy_infoLabel'}).text
        # Drop the trailing colon and surrounding whitespace, keeping just
        # the label text (e.g. 'Composer: ' -> 'Composer').
        label = label[:label.find(':')].strip()
        data = row.find('span', {'class': 'hy_infoItem'}).text
        hymn[label] = data.strip()
    # The second <span> inside the above-the-fold block holds the general
    # info, including the '#instances' link whose text is the number of
    # hymnals the tune was published in.
    general_info = soup.find('div', {'id': 'authority_above_fold'}).find_all('span')[1]
    instances = general_info.find('a', {'href': '#instances'}).text
    hymn['Published in'] = instances
# Collect the keys in all dictionaries, so that all the data can be
# dumped to a CSV file with the correct format.
# Sorting makes the column order deterministic across runs — iterating a
# bare set yields an arbitrary order, so the original produced output.csv
# files whose columns could differ from run to run.
keys = sorted(set().union(*(d.keys() for d in hymns)))
# Write the data to the CSV file
with open('output.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(hymns)
@rodrigomorales1
Copy link
Author

This code is released under CC0.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment