Created
January 4, 2021 17:56
-
-
Save SYZYGY-DEV333/56ae34f08c6e3c0e8018d5e80ec3d80d to your computer and use it in GitHub Desktop.
A really crude scraper for pipeorgandatabase.org
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# Pipe Organ Database Stoplist Scraper | |
license = ''' | |
Copyright 2021 Joshua Sobel | |
Permission is hereby granted, free of charge, to any person obtaining a copy of this software | |
and associated documentation files (the "Software"), to deal in the Software without restriction, | |
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, | |
subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all copies or substantial | |
portions of the Software. | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT | |
LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN | |
NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
''' | |
import argparse | |
import sys | |
from lxml import html | |
import html as h | |
import requests | |
import yaml | |
from random import randint | |
### Tell the program to do some stuff | |
# Command-line interface.  Options are registered in three groups: the mode
# selector flags/counts, the output-format switches, and one boolean switch
# per output field.  Registration order is the order shown by ``--help``.
parser = argparse.ArgumentParser(description='Grab data from the OHS Pipe Organ Database.')

parser.add_argument("--max", action='store_true', help="Display the maximum ID number")

# Options that take an integer value (an organ ID or a number of organs).
for _flags, _help in (
    (("-i", "--id"), "Get organ information about the organ with this ID."),
    (("-r", "--random"), "Get information about N random organs."),
    (("-f", "--rand-with-info"), "Get information about N random organs; only choose organs with specified data fields satisfied."),
):
    parser.add_argument(*_flags, type=int, help=_help)

# Plain on/off switches: output format first, then one switch per field.
for _flags, _help in (
    (("-a", "--all"), "Output all available info for this organ."),
    (("-c", "--csv"), "Display output info in CSV format"),
    (("--id-out",), "Output the ID for this organ."),
    (("--url",), "Output the URL for this organ."),
    (("--name",), "Output the location name for this organ."),
    (("--type",), "Output the location type for this organ."),
    (("--address",), "Output the location address for this organ."),
    (("--city",), "Output the city name for this organ."),
    (("--state",), "Output the state/province this organ is in"),
    (("--country",), "Output the country this organ is in."),
    (("--builder",), "Output the builder info for this organ."),
    (("--opus",), "Output the opus number for this organ."),
    (("--year",), "Output the build year for this organ."),
    (("--ranks",), "Output the number of ranks for this organ."),
    (("--stops",), "Output the number of stops for this organ."),
    (("--manuals",), "Output the number of manuals for this organ."),
    (("--divisions",), "Output the number of divisions for this organ."),
    (("--registers",), "Output the number of registers for this organ."),
    (("--stoplist",), "Output the stoplist for this organ."),
    (("--lowest-pitch",), "Output the lowest pitched stop of this organ."),
    (("--altered",), "Display whether this organ has been altered"),
):
    parser.add_argument(*_flags, action='store_true', help=_help)

# Drop the loop temporaries so the module namespace is unchanged.
del _flags, _help

# Parsed once at import time; the functions below read this global.
args = parser.parse_args()
def getOrganData(organ_id):
    """Scrape one organ entry from pipeorgandatabase.org into a dictionary.

    Fetches ``https://pipeorgandatabase.org/organ/<organ_id>`` and, when the
    page links to a stoplist, the first linked stoplist page.  Any field that
    cannot be found is left as ``None``; ``misc["altered"]`` defaults to the
    string ``"False"``.

    Args:
        organ_id: Database ID of the organ, as a numeric string or an int.

    Returns:
        dict with the keys ``organ_id``, ``database_url``, ``location``
        (``name``, ``type``, ``address``, ``city``, ``state_province``,
        ``country``), ``builder``, ``opus``, ``year``, ``ranks``, ``stops``,
        ``manuals``, ``divisions``, ``registers``, ``stoplist`` and ``misc``
        (``lowest_pitch``, ``altered``), in that order.

    Raises:
        ValueError: if ``organ_id`` is not numeric.
        requests.RequestException: if a page cannot be fetched, including
            when the request times out.
    """
    organ_id = str(organ_id)
    organ_url = 'https://pipeorgandatabase.org/organ/' + organ_id
    timeout = 30  # seconds; without a timeout requests can block forever

    # Initialize data structure
    organ_data = {
        "organ_id": int(organ_id),      # Number
        "database_url": organ_url,      # String
        "location": {
            "name": None,               # String
            "type": None,               # String, but all churches are "Church"
            "address": None,            # String
            "city": None,               # String
            "state_province": None,     # String
            "country": None,            # String
        },
        "builder": None,                # String
        "opus": None,                   # String
        "year": None,                   # String
        "ranks": None,                  # Number
        "stops": None,                  # Number
        "manuals": None,                # Number
        "divisions": None,              # Number
        "registers": None,              # Number
        "stoplist": None,               # Formatted String
        "misc": {
            "lowest_pitch": None,       # Number
            "altered": "False",         # Boolean-string
        },
    }

    ### Get webpage and build element tree ###
    page = requests.get(organ_url, timeout=timeout)
    tree = html.fromstring(page.content)
    # Decode the bytes instead of taking their repr(): the repr escapes
    # apostrophes and every non-ASCII character, which corrupted scraped text.
    # NOTE(review): assumes the site serves UTF-8 -- confirm.
    page_source = page.content.decode("utf-8", errors="replace")

    # location: up to three whitespace-normalised "card-text" lines.  The last
    # line is only kept when it looks like "City, State Country".
    loc_raw = [' '.join(text.split())
               for text in tree.xpath('//p[@class="card-text"]/text()')[:3]]
    if loc_raw and ',' not in loc_raw[-1]:
        loc_raw = loc_raw[:-1]

    location = organ_data["location"]
    if loc_raw:
        place_line = loc_raw[-1]
        location["name"] = loc_raw[0]
        location["city"] = place_line.split(",")[0]
        after_comma = place_line.split(", ")
        if len(after_comma) > 1:
            location["state_province"] = after_comma[1].split(" ")[0]
        location["country"] = place_line.split(" ")[-1]
    if len(loc_raw) == 3:
        location["address"] = loc_raw[1]

    # type
    type_marker = "location type is: "
    if type_marker in page_source:
        type_text = page_source.split(type_marker)[1].split("</li>")[0].split("\n")[0]
        loc_type_raw = h.unescape(' '.join(type_text.split()))
        if 'Church' in loc_type_raw:            # Comment this stuff out if you want to know the particular type of church
            location["type"] = "Church"         # |
        else:                                   # |
            location["type"] = loc_type_raw     # <--

    # builder / opus / year, all taken from the title link, which looks like
    # "Builder (Opus 123, 1900)" or "Builder (1900)".
    if 'a href="/builder/' in page_source:
        titles = tree.xpath('//a[@class="organ-title text-dark"]/text()')
        if titles:
            raw = titles[0].strip()
            organ_data["builder"] = h.unescape(raw.split(" (")[0])
            if 'Opus' in raw:
                opus_parts = raw.split("Opus ")
                if len(opus_parts) > 1:
                    organ_data["opus"] = h.unescape(opus_parts[1].split(",")[0])
                    year_parts = raw.split(")")[0].split("Opus ")[1].split(", ")
                    if len(year_parts) > 1:
                        organ_data["year"] = h.unescape(year_parts[1])
            elif '(' in raw:
                organ_data["year"] = h.unescape(raw.split("(")[1].split(")")[0])

    # ranks / stops / manuals / divisions / registers: the last <li> of the
    # "Technical Details" card holds sentences such as "3 manuals. 40 ranks."
    try:
        details_tree = (page_source.split("Technical Details:</h4>")[1]
                        .split("</div>")[0]
                        .rsplit("<li>", 1)[1]
                        .split("</li>", 1)[0])
    except IndexError:
        details = []
    else:
        details = ' '.join(details_tree.split()).split(".")[:-1]

    details_bank = {}
    for sentence in details:
        words = sentence.split()
        if len(words) >= 2:             # ignore fragments without "<n> <unit>"
            details_bank[words[1]] = words[0]

    for unit in ("ranks", "stops", "manuals", "divisions", "registers"):
        if unit in details_bank:
            try:
                organ_data[unit] = int(details_bank[unit])
            except ValueError:
                pass                    # count is not a plain number; leave None

    # stoplist
    if 'a href="/stoplist/' in page_source:
        stoplist_number = page_source.split('href="/stoplist/')[1].split('"')[0]
        stoplist_page = requests.get("https://pipeorgandatabase.org/stoplist/" + stoplist_number,
                                     timeout=timeout)
        try:
            stoplist_tree = html.fromstring(stoplist_page.content)
            stoplist_text = h.unescape(stoplist_tree.xpath('//pre/text()')[0])
        except Exception:
            # Best effort: an empty or unparsable stoplist page, or one
            # without a <pre> block, simply leaves the stoplist unset.
            pass
        else:
            organ_data["stoplist"] = "\n" + stoplist_text.replace("\u2019", "'") + "\n"

    # lowest_pitch: first match wins, checked from the longest pipe down.
    stoplist = organ_data["stoplist"]
    if stoplist:
        for pitch, markers in ((64, ("64'",)), (32, ("32'",)), (16, ("16'", "16")), (8, ("8'", "8"))):
            if any(marker in stoplist for marker in markers):
                organ_data["misc"]["lowest_pitch"] = pitch
                break

    # altered: both markers must be present.  The previous
    # `'<h5 ...>' and "Originally" in page` only tested the second operand.
    has_subtitle = '<h5 class="organ-subtitle text-secondary">' in page_source
    if has_subtitle and "Originally" in page_source:
        organ_data["misc"]["altered"] = "True"
    elif "The organ has been altered from its original state." in page_source:
        organ_data["misc"]["altered"] = "True"

    return organ_data
### Helper Function to get the max id for an organ | |
def getMaxID():
    """Return the organ ID found in the first row of the newest-first listing.

    Requests the organ list sorted by ``addedNewest`` and reads the ID out of
    the first ``/organ/<id>`` link in the first data row (the second ``<tr>``
    of the page).  Presumably this is the largest ID in the database --
    verify against the site.

    Returns:
        int: the organ ID from that row.

    Raises:
        requests.RequestException: if the listing cannot be fetched,
            including when the request times out.
        RuntimeError: if the listing does not have the expected layout.
    """
    list_page = requests.get(
        "https://pipeorgandatabase.org/organs?sort=addedNewest&extant=False&listView=True",
        timeout=30,  # seconds; without a timeout requests can block forever
    )
    # Decode rather than repr() the bytes so the markup is searched as text.
    listing = list_page.content.decode("utf-8", errors="replace")
    try:
        first_row = listing.split("<tr>", 2)[2].split("</tr>", 1)[0]
        return int(first_row.split('<a href="/organ/')[1].split('"', 1)[0])
    except (IndexError, ValueError) as err:
        raise RuntimeError("Could not find an organ ID in the organ listing page.") from err
### Helper Function to flatten dictionary | |
def flatten(d):
    """Collapse a nested dictionary into a single-level one.

    Keys of nested dictionaries are joined to their parent key with ``_``
    (``{"a": {"b": 1}}`` becomes ``{"a_b": 1}``).  A list value is treated as
    a sequence of dictionaries that are each flattened under the same parent
    key, later entries overwriting earlier ones on a key clash.  Key order
    follows the order in which values are encountered.
    """
    flat = {}
    for name, value in d.items():
        if isinstance(value, dict):
            children = [value]
        elif isinstance(value, list):
            children = value
        else:
            # Leaf value: copy through unchanged.
            flat[name] = value
            continue
        for child in children:
            for sub_name, sub_value in flatten(child).items():
                flat[name + '_' + sub_name] = sub_value
    return flat
### Print in default (YAML) or CSV formats. | |
def dataPrintInit():
    """Print the tab-separated header row when CSV output was requested.

    Does nothing unless ``args.csv`` is set.  With ``args.all`` the header
    lists every column in the key order of the dictionary built by
    ``getOrganData`` (location columns right after the URL), because that is
    the order in which the full rows are flattened and printed.  Otherwise
    only the columns whose command-line switch is set are listed, with the
    location and misc columns last.
    """
    if not args.csv:
        return

    location_columns = (
        ("name", "loc_name"),
        ("type", "loc_type"),
        ("address", "loc_address"),
        ("city", "loc_city"),
        ("state", "loc_state_province"),
        ("country", "loc_country"),
    )
    leading_columns = (
        ("id_out", "organ_id"),
        ("url", "database_url"),
    )
    detail_columns = (
        ("builder", "builder"),
        ("opus", "opus"),
        ("year", "year"),
        ("ranks", "ranks"),
        ("stops", "stops"),
        ("manuals", "manuals"),
        ("divisions", "divisions"),     # was misspelled "divisiona"
        ("registers", "registers"),
        ("stoplist", "stoplist"),
    )
    misc_columns = (
        ("lowest_pitch", "misc_lowest_pitch"),
        ("altered", "misc_altered"),
    )

    if args.all:
        # Full rows keep the scraped dictionary's own key order.
        ordered = leading_columns + location_columns + detail_columns + misc_columns
        columns = [title for _, title in ordered]
    else:
        # Selective rows put the nested location/misc sections at the end.
        ordered = leading_columns + detail_columns + location_columns + misc_columns
        columns = [title for flag, title in ordered if getattr(args, flag)]
    print("\t".join(columns))
def dataPrint(d):
    """Print one organ record as YAML (default) or as a tab-separated row.

    Args:
        d: A (possibly nested) dictionary of organ data.

    In YAML mode the dump is post-processed so that escaped ``\\n`` and
    ``\\t`` sequences become real newlines and tabs.  In CSV mode
    (``args.csv``) the dictionary is flattened and its values are joined with
    tabs; string values are wrapped in double quotes, with embedded double
    quotes doubled, so multi-line text such as a stoplist stays one field.
    """
    if not args.csv:
        dumped = yaml.dump(d, width=20000, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print(dumped.replace("\\n", "\n").replace("\\t", "\t"))
        return

    cells = []
    for value in flatten(d).values():
        if isinstance(value, str):
            # The quoted form used to be computed and then discarded, so
            # strings were emitted bare; append the quoted value.
            value = '"' + value.replace('"', '""') + '"'
        cells.append(str(value))
    print("\t".join(cells))
### Prints whatever data you tell it to print | |
def printOrganData(org_id, display):
    """Fetch one organ and return (optionally printing) the requested fields.

    Args:
        org_id: Database ID of the organ; converted with ``str`` before it is
            passed to ``getOrganData``.
        display: When true, the selected data is printed with ``dataPrint``.

    Returns:
        dict: With ``args.all`` the complete record from ``getOrganData``.
        Otherwise a new dictionary holding only the fields whose command-line
        switch is set, followed by ``location`` and ``misc`` sub-dictionaries
        (which are present even when empty).
    """
    info = getOrganData(str(org_id))

    if args.all:
        # Return the full record whether or not it is displayed.  Previously
        # `--all` with display=False fell through to the selective branch and
        # returned an almost empty dictionary.
        if display:
            dataPrint(info)
        return info

    top_level_fields = (
        ("id_out", "organ_id"),
        ("url", "database_url"),
        ("builder", "builder"),
        ("opus", "opus"),
        ("year", "year"),
        ("ranks", "ranks"),
        ("stops", "stops"),
        ("manuals", "manuals"),
        ("divisions", "divisions"),
        ("registers", "registers"),
        ("stoplist", "stoplist"),
    )
    location_fields = (
        ("name", "name"),
        ("type", "type"),
        ("address", "address"),
        ("city", "city"),
        ("state", "state_province"),
        ("country", "country"),
    )
    misc_fields = (
        ("lowest_pitch", "lowest_pitch"),
        ("altered", "altered"),
    )

    info_to_print = {key: info[key] for flag, key in top_level_fields if getattr(args, flag)}
    # The nested sections go last so rows line up with the CSV header.
    info_to_print["location"] = {
        key: info["location"][key] for flag, key in location_fields if getattr(args, flag)
    }
    info_to_print["misc"] = {
        key: info["misc"][key] for flag, key in misc_fields if getattr(args, flag)
    }

    if display:
        dataPrint(info_to_print)
    return info_to_print
# (command-line flag, sub-dictionary or None for top level, key) for every
# field that can be required by --rand-with-info.
_REQUESTED_FIELDS = (
    ("id_out", None, "organ_id"),
    ("url", None, "database_url"),
    ("name", "location", "name"),
    ("type", "location", "type"),
    ("address", "location", "address"),
    ("city", "location", "city"),
    ("state", "location", "state_province"),
    ("country", "location", "country"),
    ("builder", None, "builder"),
    ("opus", None, "opus"),
    ("year", None, "year"),
    ("ranks", None, "ranks"),
    ("stops", None, "stops"),
    ("manuals", None, "manuals"),
    ("divisions", None, "divisions"),
    ("registers", None, "registers"),
    ("stoplist", None, "stoplist"),
    ("lowest_pitch", "misc", "lowest_pitch"),
    ("altered", "misc", "altered"),
)


def _hasRequestedData(data):
    """Return True when every field the user asked for has a truthy value.

    With ``args.all`` every top-level, ``location`` and ``misc`` value must be
    truthy; otherwise only the fields whose switch is set are checked.
    """
    if args.all:
        # The "misc" key was previously looked up through an undefined name
        # (`data[misc]`), which raised NameError.
        sections = (data, data["location"], data["misc"])
        return all(all(section.values()) for section in sections)
    for flag, section, key in _REQUESTED_FIELDS:
        if not getattr(args, flag):
            continue
        holder = data if section is None else data[section]
        if not holder[key]:
            return False
    return True


### Run the mode selected on the command line (first match wins).
if args.max:
    print(getMaxID())
elif args.id:
    dataPrintInit()
    printOrganData(args.id, True)
elif args.random:
    dataPrintInit()
    maxID = getMaxID()
    for _ in range(args.random):
        ID = randint(1, maxID)
        printOrganData(ID, True)
elif args.rand_with_info:
    dataPrintInit()
    maxID = getMaxID()
    for _ in range(args.rand_with_info):
        # Keep drawing random organs until one has all the requested data.
        while True:
            ID = randint(1, maxID)
            data = printOrganData(ID, False)
            if _hasRequestedData(data):
                dataPrint(data)
                break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment