"""
Script to download "shelf" lists from the SFPL Bibliocommons website
http://stackoverflow.com/questions/189555/how-to-use-python-to-login-to-a-webpage-and-retrieve-cookies-for-later-usage
http://stackoverflow.com/questions/14630288/unicodeencodeerror-charmap-codec-cant-encode-character-maps-to-undefined
"""
import datetime
import sys
from bs4 import BeautifulSoup
from requests import session
try:
    # should be a valid URL to the shelf
    base_url = sys.argv[1] + "?page="
except IndexError:
    base_url = "https://sfpl.bibliocommons.com/collection/show/378235992/library/for_later?page="
output_filename = "bibliocommons_shelf_" + str(datetime.datetime.now().date()) + ".txt"
shelf_contents = ""
# attempts to fix UnicodeEncodeError: character maps to <undefined>
def fix_enc(s):
    return str(s).encode(sys.stdout.encoding, errors='backslashreplace').decode(sys.stdout.encoding)
# start a new requests session
with session() as c:
    # index to store the current page on the website
    page_num = 1

    # keep checking through pages until we run out
    while True:
        # navigate to the next page in the shelf
        response = c.get(base_url + str(page_num))
        if response.status_code != 200:
            print(response)
            sys.exit(1)

        # move on to the next page
        page_num += 1

        # make a soup object with the response text
        soup = BeautifulSoup(response.text, 'html.parser')

        # find title/author/format information
        titles = soup.find_all("span", class_="title")
        formats = soup.find_all("span", class_="format")
        assert len(titles) == len(formats)

        # snapshot this page's titles so we can tell when pagination repeats
        current_title = str(titles)

        # stop checking pages if this page's titles are the same as
        # the last page's, meaning we're at the end of the list
        try:
            if current_title == last_title:
                break
        except NameError:
            pass

        # iterate over the titles on this page
        for n in range(len(titles)):
            try:
                # print title and store in a string for writing
                shelf_contents += titles[n].get_text().strip() + "\n"
                print(titles[n].get_text().strip())

                # idx for author search
                i = 0

                # check to see if there's an author for this title
                for e in titles[n].next_elements:
                    if "class=\"author" in str(e):
                        author = e.get_text().strip()
                        shelf_contents += author + "\n"
                        print(author)
                        break

                    # increment counter and give up if the search failed
                    i += 1
                    if i >= 7:
                        break

                # append and print format
                shelf_contents += formats[n].get_text().replace("\n", "").strip() + "\n"
                print(formats[n].get_text().replace("\n", "").strip())
            except (IndexError, UnicodeEncodeError):
                pass

            # add a blank line between entries
            shelf_contents += "\n"
            print()

        # make the current title the last title
        last_title = current_title

# write to file
with open(output_filename, 'w') as f:
    f.write(fix_enc(shelf_contents))