Skip to content

Instantly share code, notes, and snippets.

@rdrg109
Last active March 5, 2022 17:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save rdrg109/da590935c1d80bed586901d895745000 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import csv
import json
import os.path
# Path to the file that lists the hymnary.org tune identifiers we want
# to retrieve, one identifier per line.
INPUT_FILE = 'input.txt'

# Accumulates one dict per hymn; the scraping loop below adds fields to
# each dict, and everything is finally dumped to output.csv.
hymns = []

# Read the identifiers from the input file.
# FIX: the original bound both the path and the open file handle to the
# name `file`, shadowing the builtin; also skip blank lines so trailing
# newlines in input.txt do not create bogus empty-id entries.
with open(INPUT_FILE, mode='r') as input_fh:
    for line in input_fh:
        hymnary_id = line.strip()
        if not hymnary_id:
            continue
        hymns.append({'hymnary_id': hymnary_id})
# Download each tune's page as a local HTML file so the scrape can be
# re-run later without hitting the network again.
for hymn in hymns:
    filename = hymn['hymnary_id'] + '.html'
    # Don't re-download a page we already have on disk.
    if os.path.isfile(filename):
        continue
    url = 'https://hymnary.org/tune/' + hymn['hymnary_id']
    print('url: ', url)
    r = requests.get(url, allow_redirects=True)
    print(r.status_code)
    if r.status_code == 200:
        with open(filename, 'wb') as out:
            out.write(r.content)
    else:
        # Leave an empty placeholder so the download is not retried on
        # the next run.
        # BUG FIX: the file is opened in binary mode, so the payload
        # must be bytes -- the original wrote the str '' which raises
        # TypeError. (Also renamed the handle away from the builtin
        # `file`.)
        with open(filename, 'wb') as out:
            out.write(b'')
# Extract the "Tune information" table and the published-instances
# count from each downloaded HTML file, storing them on the hymn dict.
for hymn in hymns:
    filename = hymn['hymnary_id'] + '.html'
    with open(filename, 'r') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'lxml')
    tune_info = soup.find('div', {'id': 'at_tuneinfo'})
    # BUG FIX: placeholder files written after a failed download are
    # empty, so the expected markup is absent and this lookup returns
    # None; the original then crashed with AttributeError on find_all.
    # Skip such pages instead.
    if tune_info is None:
        continue
    tune_info_data = tune_info.find_all('tr', {'class': 'result-row'})
    # Each row of the "Tune information" section is a label/value pair,
    # e.g. "Composer: ..." -- store it under the label (colon stripped).
    for row in tune_info_data:
        label = row.find('span', {'class': 'hy_infoLabel'}).text
        label = label[:label.find(':')]
        label = label.strip()
        data = row.find('span', {'class': 'hy_infoItem'}).text
        data = data.strip()
        hymn[label] = data
    # The second <span> in the above-the-fold block holds general info,
    # including a link to the "#instances" anchor whose text is the
    # number of hymnals the tune was published in. Guard each lookup:
    # pages without this markup should be skipped, not crash the run.
    above_fold = soup.find('div', {'id': 'authority_above_fold'})
    if above_fold is None:
        continue
    spans = above_fold.find_all('span')
    if len(spans) < 2:
        continue
    instances_link = spans[1].find('a', {'href': '#instances'})
    if instances_link is not None:
        hymn['Published in'] = instances_link.text
# Collect the union of keys across all hymn dicts so DictWriter can
# emit a header covering every field that appeared on any page.
# FIX: sort the fieldnames -- a bare set gives a nondeterministic
# column order that changes between runs.
fieldnames = sorted(set().union(*(d.keys() for d in hymns)))

# Write the data to the CSV file. Dicts missing a field get an empty
# cell (DictWriter's default restval).
with open('output.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames)
    dict_writer.writeheader()
    dict_writer.writerows(hymns)
@rdrg109
Copy link
Author

rdrg109 commented Mar 4, 2022

This code is released under CC0.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment