Skip to content

Instantly share code, notes, and snippets.

@SYZYGY-DEV333
Created January 4, 2021 17:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SYZYGY-DEV333/56ae34f08c6e3c0e8018d5e80ec3d80d to your computer and use it in GitHub Desktop.
Save SYZYGY-DEV333/56ae34f08c6e3c0e8018d5e80ec3d80d to your computer and use it in GitHub Desktop.
A really crude scraper for pipeorgandatabase.org
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Pipe Organ Database Stoplist Scraper
license = '''
Copyright 2021 Joshua Sobel
Permission is hereby granted, free of charge, to any person obtaining a copy of this software
and associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial
portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
'''
import argparse
import sys
from lxml import html
import html as h
import requests
import yaml
from random import randint
### Command-line interface.
# Mode switches (--max, --id, --random, --rand-with-info) choose what the script
# does; the dispatch code at the bottom of the file checks them in that order and
# runs only the first one that is set.
parser = argparse.ArgumentParser(description='Grab data from the OHS Pipe Organ Database.')
parser.add_argument("--max", action='store_true', help="Display the maximum ID number")
parser.add_argument("-i", "--id", type=int, help="Get organ information about the organ with this ID.")
parser.add_argument("-r", "--random", type=int, help="Get information about N random organs.")
parser.add_argument("-f", "--rand-with-info", type=int, help="Get information about N random organs; only choose organs with specified data fields satisfied.")
# Output controls: --all selects every field, --csv switches the output from
# YAML to tab-separated rows.
parser.add_argument("-a", "--all", action='store_true', help="Output all available info for this organ.")
parser.add_argument("-c", "--csv", action='store_true', help="Display output info in CSV format")
# Per-field switches: each one adds a single field to the output. With
# --rand-with-info they also act as filters (the field must be non-empty).
# --id-out is stored as args.id_out so it does not clash with -i/--id.
parser.add_argument("--id-out", action='store_true', help="Output the ID for this organ.")
parser.add_argument("--url", action='store_true', help="Output the URL for this organ.")
parser.add_argument("--name", action='store_true', help="Output the location name for this organ.")
parser.add_argument("--type", action='store_true', help="Output the location type for this organ.")
parser.add_argument("--address", action='store_true', help="Output the location address for this organ.")
parser.add_argument("--city", action='store_true', help="Output the city name for this organ.")
parser.add_argument("--state", action='store_true', help="Output the state/province this organ is in")
parser.add_argument("--country", action='store_true', help="Output the country this organ is in.")
parser.add_argument("--builder", action='store_true', help="Output the builder info for this organ.")
parser.add_argument("--opus", action='store_true', help="Output the opus number for this organ.")
parser.add_argument("--year", action='store_true', help="Output the build year for this organ.")
parser.add_argument("--ranks", action='store_true', help="Output the number of ranks for this organ.")
parser.add_argument("--stops", action='store_true', help="Output the number of stops for this organ.")
parser.add_argument("--manuals", action='store_true', help="Output the number of manuals for this organ.")
parser.add_argument("--divisions", action='store_true', help="Output the number of divisions for this organ.")
parser.add_argument("--registers", action='store_true', help="Output the number of registers for this organ.")
parser.add_argument("--stoplist", action='store_true', help="Output the stoplist for this organ.")
parser.add_argument("--lowest-pitch", action='store_true', help="Output the lowest pitched stop of this organ.")
parser.add_argument("--altered", action='store_true', help="Display whether this organ has been altered")
# Parsed once at module level; the functions below read this global directly.
args = parser.parse_args()
_ORGAN_URL = 'https://pipeorgandatabase.org/organ/'
_STOPLIST_URL = 'https://pipeorgandatabase.org/stoplist/'


def _parse_location(tree, content):
    """Return the ``location`` sub-dictionary scraped from an organ page."""
    location = {
        "name": None,            # String
        "type": None,            # String, but all churches are "Church"
        "address": None,         # String
        "city": None,            # String
        "state_province": None,  # String
        "country": None,         # String
    }

    # The first three text nodes of the card are name, [address], place line.
    lines = [' '.join(text.split()) for text in tree.xpath('//p[@class="card-text"]/text()')[:3]]
    # When no address is given the third node is unrelated text; the place
    # line is recognised by its comma ("City, State Country"), so a trailing
    # line without one (including an empty line) is dropped.
    if lines and ',' not in lines[-1]:
        lines = lines[:-1]

    if lines:
        place = lines[-1]
        location["name"] = lines[0]
        location["city"] = place.split(",")[0]
        after_comma = place.split(", ")
        if len(after_comma) > 1:
            location["state_province"] = after_comma[1].split(" ")[0]
        location["country"] = place.split(" ")[-1]
    if len(lines) == 3:
        location["address"] = lines[1]

    type_marker = "location type is: "
    if type_marker in content:
        raw_type = content.split(type_marker)[1].split("</li>")[0].split("\n")[0]
        raw_type = h.unescape(' '.join(raw_type.split()))
        # Every kind of church is collapsed to "Church"; assign raw_type
        # unconditionally to keep the specific church type instead.
        location["type"] = "Church" if 'Church' in raw_type else raw_type
    return location


def _parse_builder(tree, content):
    """Return ``(builder, opus, year)`` parsed from the organ title link."""
    if 'a href="/builder/' not in content:
        return None, None, None
    titles = tree.xpath('//a[@class="organ-title text-dark"]/text()')
    if not titles:
        return None, None, None

    # The parsing below assumes titles shaped like "Builder (Opus N, YEAR)",
    # "Builder (Opus N)" or "Builder (YEAR)".
    raw = titles[0].strip()
    builder = h.unescape(raw.split(" (")[0])
    opus = year = None
    if "Opus " in raw:
        # Cut at the closing parenthesis first so an opus without a year
        # does not keep the trailing ")".
        inside = raw.split("Opus ")[1].split(")")[0]
        opus = h.unescape(inside.split(",")[0])
        parts = inside.split(", ")
        if len(parts) > 1:
            year = h.unescape(parts[1])
    elif '(' in raw:
        year = h.unescape(raw.split("(")[1].split(")")[0])
    return builder, opus, year


def _parse_details(content):
    """Return ``{label: count}`` from the last "Technical Details" list item."""
    try:
        section = content.split("Technical Details:</h4>")[1].split("</div>")[0]
        item = section.rsplit("<li>", 1)[1].split("</li>", 1)[0]
    except IndexError:
        # The page has no technical-details section or no list item in it.
        return {}

    counts = {}
    # The item is a run of sentences such as "3 manuals. 5 divisions.".
    for sentence in ' '.join(item.split()).split("."):
        words = sentence.split()
        if len(words) < 2:
            continue
        try:
            counts[words[1]] = int(words[0])
        except ValueError:
            # Not a "<number> <label>" sentence; ignore it.
            continue
    return counts


def _fetch_stoplist(content):
    """Download the first linked stoplist and return its text, or ``None``."""
    if 'a href="/stoplist/' not in content:
        return None
    stoplist_number = content.split('href="/stoplist/')[1].split('"')[0]
    stoplist_page = requests.get(_STOPLIST_URL + stoplist_number)
    try:
        pre_text = html.fromstring(stoplist_page.content).xpath('//pre/text()')
    except Exception:
        # Best effort: an unparsable stoplist page must not abort the scrape.
        return None
    if not pre_text:
        return None
    return "\n" + h.unescape(pre_text[0]).replace("\u2019", "'") + "\n"


def _lowest_pitch(stoplist):
    """Guess the lowest pitch (in feet) mentioned in a stoplist, or ``None``."""
    if not stoplist:
        return None
    # Crude heuristic: 64' and 32' need the foot mark, while a bare "16" or
    # "8" anywhere in the text is accepted for stoplists written without it.
    for marker, feet in (("64'", 64), ("32'", 32), ("16", 16), ("8", 8)):
        if marker in stoplist:
            return feet
    return None


def getOrganData(organ_id):
    """Scrape one organ page of pipeorgandatabase.org into a dictionary.

    Args:
        organ_id: Database ID of the organ (a string of digits or an int).

    Returns:
        A dict with the keys ``organ_id``, ``database_url``, ``location``
        (name, type, address, city, state_province, country), ``builder``,
        ``opus``, ``year``, ``ranks``, ``stops``, ``manuals``, ``divisions``,
        ``registers``, ``stoplist`` and ``misc`` (lowest_pitch, altered).
        Fields that cannot be found on the page are ``None``; ``altered`` is
        the string "True" or "False". Key order is significant: it defines
        the YAML and CSV output order.

    Raises:
        ValueError: if ``organ_id`` is not an integer.
        requests.RequestException: if a page cannot be downloaded.
    """
    organ_id = str(organ_id)
    database_url = _ORGAN_URL + organ_id

    ### Get webpage and build element tree ###
    page = requests.get(database_url)
    tree = html.fromstring(page.content)
    # Search the decoded markup rather than str(bytes): the bytes repr escapes
    # quotes, newlines and every non-ASCII character, which corrupts scraped
    # text. Assumes the site serves UTF-8; undecodable bytes are replaced.
    content = page.content.decode("utf-8", errors="replace")

    builder, opus, year = _parse_builder(tree, content)
    details = _parse_details(content)
    stoplist = _fetch_stoplist(content)

    # Altered when the subtitle heading exists and the page says "Originally",
    # or when the page states the alteration outright.
    altered = (
        ('<h5 class="organ-subtitle text-secondary">' in content and "Originally" in content)
        or "The organ has been altered from its original state." in content
    )

    return {
        "organ_id": int(organ_id),            # Number
        "database_url": database_url,         # String
        "location": _parse_location(tree, content),
        "builder": builder,                   # String
        "opus": opus,                         # String
        "year": year,                         # String
        "ranks": details.get("ranks"),        # Number
        "stops": details.get("stops"),        # Number
        "manuals": details.get("manuals"),    # Number
        "divisions": details.get("divisions"),  # Number
        "registers": details.get("registers"),  # Number
        "stoplist": stoplist,                 # Formatted String
        "misc": {
            "lowest_pitch": _lowest_pitch(stoplist),  # Number
            # Kept as a string so the field is never falsy for --rand-with-info.
            "altered": "True" if altered else "False",  # Boolean-string
        },
    }
### Helper Function to get the max id for an organ
def getMaxID():
    """Return the ID of the most recently added organ in the database.

    Downloads the organ list sorted newest-first and reads the ID out of the
    first organ link that follows the second ``<tr>`` tag in the markup.
    """
    response = requests.get("https://pipeorgandatabase.org/organs?sort=addedNewest&extant=False&listView=True")
    markup = str(response.content)
    # Everything after the second "<tr>", cut off at the end of that row.
    after_second_tr = markup.split("<tr>", 2)[2]
    newest_row = after_second_tr.split("</tr>", 1)[0]
    link_target = newest_row.split('<a href="/organ/')[1]
    return int(link_target.split('"', 1)[0])
### Helper Function to flatten dictionary
def flatten(d):
    """Collapse nested dictionaries into a single-level dictionary.

    A nested dict (or each dict of a list of dicts) is merged into the result
    with its keys prefixed by the parent key and an underscore, recursively.
    Any other value is copied unchanged. Empty dicts and lists contribute
    nothing.
    """
    flat = {}
    for name, value in d.items():
        # Treat a single nested dict like a one-element list of dicts.
        children = [value] if isinstance(value, dict) else value
        if not isinstance(children, list):
            flat[name] = value
            continue
        for child in children:
            for sub_name, sub_value in flatten(child).items():
                flat[name + '_' + sub_name] = sub_value
    return flat
### Print in default (YAML) or CSV formats.
def dataPrintInit(options=None):
    """Print the tab-separated CSV header row; do nothing in YAML mode.

    Args:
        options: Namespace carrying the boolean output flags (``csv``, ``all``
            and the per-field switches). Defaults to the module-level ``args``.

    The header column order must match the order in which ``dataPrint`` emits
    row values: ``flatten()`` order of the full record for ``--all``, and
    top-level fields followed by location and misc fields otherwise.
    """
    opts = args if options is None else options
    if not opts.csv:
        return

    if opts.all:
        # The full record has "location" right after "database_url", so the
        # location columns must come before the builder columns.
        columns = [
            "organ_id", "database_url",
            "loc_name", "loc_type", "loc_address", "loc_city", "loc_state_province", "loc_country",
            "builder", "opus", "year", "ranks", "stops", "manuals", "divisions", "registers", "stoplist",
            "misc_lowest_pitch", "misc_altered",
        ]
    else:
        # (flag attribute, column name) in row order for per-field output.
        flag_columns = (
            ("id_out", "organ_id"),
            ("url", "database_url"),
            ("builder", "builder"),
            ("opus", "opus"),
            ("year", "year"),
            ("ranks", "ranks"),
            ("stops", "stops"),
            ("manuals", "manuals"),
            ("divisions", "divisions"),
            ("registers", "registers"),
            ("stoplist", "stoplist"),
            ("name", "loc_name"),
            ("type", "loc_type"),
            ("address", "loc_address"),
            ("city", "loc_city"),
            ("state", "loc_state_province"),
            ("country", "loc_country"),
            ("lowest_pitch", "misc_lowest_pitch"),
            ("altered", "misc_altered"),
        )
        columns = [column for flag, column in flag_columns if getattr(opts, flag)]
    print("\t".join(columns))
def dataPrint(d):
    """Print one organ record as YAML (default) or as a tab-separated row.

    Args:
        d: Organ data dictionary, possibly containing nested dictionaries.

    In CSV mode the record is flattened and its values are printed in
    dictionary order, separated by tabs. String values are wrapped in double
    quotes, with embedded double quotes doubled; other values are printed
    with ``str()``.
    """
    if not args.csv:
        dumped = yaml.dump(d, width=20000, allow_unicode=True, sort_keys=False, default_flow_style=False)
        # Turn literal backslash-n / backslash-t sequences in the dump into
        # real newlines and tabs (presumably so multi-line stoplists print
        # readably).
        print(dumped.replace("\\n", "\n").replace("\\t", "\t"))
        return

    fields = []
    for value in flatten(d).values():
        if isinstance(value, str):
            value = '"' + value.replace('"', '""') + '"'
        fields.append(str(value))
    print("\t".join(fields))
### Prints whatever data you tell it to print
def printOrganData(org_id, display):
    """Fetch an organ and return (optionally printing) the requested fields.

    Args:
        org_id: Database ID of the organ.
        display: When true, the selected data is also printed via
            ``dataPrint``.

    Returns:
        The complete record when ``--all`` is set. Otherwise a dict holding
        only the fields whose command-line flag is set, followed by
        ``location`` and ``misc`` sub-dicts (each possibly empty).
    """
    info = getOrganData(str(org_id))

    # --all returns the whole record whether or not it is displayed.
    if args.all:
        if display:
            dataPrint(info)
        return info

    # (flag attribute, data key) pairs; insertion order defines output order.
    top_level = (
        ("id_out", "organ_id"),
        ("url", "database_url"),
        ("builder", "builder"),
        ("opus", "opus"),
        ("year", "year"),
        ("ranks", "ranks"),
        ("stops", "stops"),
        ("manuals", "manuals"),
        ("divisions", "divisions"),
        ("registers", "registers"),
        ("stoplist", "stoplist"),
    )
    location = (
        ("name", "name"),
        ("type", "type"),
        ("address", "address"),
        ("city", "city"),
        ("state", "state_province"),
        ("country", "country"),
    )
    misc = (
        ("lowest_pitch", "lowest_pitch"),
        ("altered", "altered"),
    )

    info_to_print = {key: info[key] for flag, key in top_level if getattr(args, flag)}
    info_to_print["location"] = {
        key: info["location"][key] for flag, key in location if getattr(args, flag)
    }
    info_to_print["misc"] = {
        key: info["misc"][key] for flag, key in misc if getattr(args, flag)
    }

    if display:
        dataPrint(info_to_print)
    return info_to_print
def _has_requested_data(data):
    """Return True when every field the user asked for is non-empty in *data*.

    With ``--all``, *data* is a complete record and every top-level, location
    and misc value must be truthy. Otherwise only the fields whose flag is set
    are checked; *data* is then the reduced dict from ``printOrganData``.
    """
    if args.all:
        return (
            all(data.values())
            and all(data["location"].values())
            and all(data["misc"].values())
        )

    top_level = (
        ("id_out", "organ_id"),
        ("url", "database_url"),
        ("builder", "builder"),
        ("opus", "opus"),
        ("year", "year"),
        ("ranks", "ranks"),
        ("stops", "stops"),
        ("manuals", "manuals"),
        ("divisions", "divisions"),
        ("registers", "registers"),
        ("stoplist", "stoplist"),
    )
    location = (
        ("name", "name"),
        ("type", "type"),
        ("address", "address"),
        ("city", "city"),
        ("state", "state_province"),
        ("country", "country"),
    )
    misc = (
        ("lowest_pitch", "lowest_pitch"),
        ("altered", "altered"),
    )
    return (
        all(data[key] for flag, key in top_level if getattr(args, flag))
        and all(data["location"][key] for flag, key in location if getattr(args, flag))
        and all(data["misc"][key] for flag, key in misc if getattr(args, flag))
    )


### Dispatch on the selected mode; only the first matching mode runs.
if args.max:
    print(getMaxID())
elif args.id is not None:
    dataPrintInit()
    printOrganData(args.id, True)
elif args.random:
    dataPrintInit()
    maxID = getMaxID()
    for _ in range(args.random):
        ID = randint(1, maxID)
        printOrganData(ID, True)
elif args.rand_with_info:
    dataPrintInit()
    maxID = getMaxID()
    for _ in range(args.rand_with_info):
        # Keep drawing random organs until one has every requested field.
        while True:
            ID = randint(1, maxID)
            # With --all the complete record is needed for the check, so it
            # is fetched directly rather than through the per-flag selection.
            data = getOrganData(str(ID)) if args.all else printOrganData(ID, False)
            if _has_requested_data(data):
                dataPrint(data)
                break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment