Skip to content

Instantly share code, notes, and snippets.

@Zeta611
Last active February 15, 2023 04:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Zeta611/e774e7e0c17824752ee9988708f94dec to your computer and use it in GitHub Desktop.
Save Zeta611/e774e7e0c17824752ee9988708f94dec to your computer and use it in GitHub Desktop.
[Real World Web Scraping] I used this script to find and parse missing ISBNs to help a librarian. Fixed 700+ books, took 3 hours to write. #automation #demo
from bs4 import BeautifulSoup
import re
import requests
import webbrowser
# Library registration numbers of the books whose catalog records are missing
# (or have incomplete) ISBNs; the driver loop at the bottom of this file looks
# each one up on the library OPAC and writes the parsed results to result.txt.
registrationNumbers = [
"HHA000010715",
"HHA000010711",
"HHA000001827",
"HHA000008672",
"HHA000008410",
"HHA000008403",
"HHA000005292",
"HHA000008701",
"HHA000009694",
"HHA000010286",
"HHA000010288",
"HHA000010290",
"HHA000010292",
"HHA000010294",
"HHA000010296",
"HHA000000172",
"HHA000001334",
"HHA000010018",
"HHA000001267",
"HHA000000562",
"HHA000002775",
"HHA000010699",
"HHA000010701",
"HHA000010703",
"HHA000010705",
"HHA000010707",
"HHA000010709",
"HHA000010745",
"HHA000010743",
"HHA000010741",
"HHA000010739",
"HHA000010737",
"HHA000010735",
"HHA000010731",
"HHA000010266",
"HHA000010268",
"HHA000010270",
"HHA000010272",
"HHA000010274",
"HHA000010276",
"HHA000010278",
"HHA000010280",
"HHA000010282",
"HHA000010284",
"HHA000009594",
"HHA000010729",
"HHA000001652",
"HHA000010733",
"HHA000011537",
"HHA000011535",
"HHA000011533",
"HHA000011531",
"HHA000011529",
"HHA000011527",
"HHA000006438",
"HHA000001637",
"HHA000002601",
"HHA000006441",
"HHA000011497",
"HHA000003616",
"HHA000000066",
"HHA000000560",
"HHA000008111",
"HHA000001389",
"HHA000008337",
"HHA000008336",
"HHA000008334",
"HHA000002631",
"HHA000008333",
"HHA000008331",
"HHA000008326",
"HHA000005213",
"HHA000011849",
"HHA000011850",
"HHA000011851",
"HHA000011852",
"HHA000011853",
"HHA000011854",
"HHA000011855",
"HHA000011856",
"HHA000001138",
"HHA000012794",
"HHA000010022",
"HHA000010024",
"HHA000000644",
"HHA000010028",
"HHA000010030",
"HHA000010032",
"HHA000010034",
"HHA000010036",
"HHA000005391",
"HHA000010042",
"HHA000010040",
"HHA000000434",
"HHA000010038",
"HHA000001660",
"HHA000010717",
"HHA000010719",
"HHA000010721",
"HHA000010723",
"HHA000010026",
"HHA000010727",
"HHA000006440",
"HHA000011495",
"HHA000011493",
"HHA000010020",
"HHA000001720",
"HHA000010016",
"HHA000010014",
"HHA000010012",
"HHA000000617",
"HHA000010010",
"HHA000010008",
"HHA000010006",
"HHA000010004",
"HHA000001305",
"HHA000001297",
"HHA000010795",
"HHA000001695",
"HHA000000564",
"HHA000012701",
"HHA000000131",
"HHA000002721",
"HHA000012726",
"HHA000009463",
"HHA000010725",
"HHA000010713",
"HHA000011472",
"HHA000011463",
"HHA000011460",
"HHA000011457",
"HHA000011454",
"HHA000011451",
"HHA000011433",
"HHA000011481",
"HHA000011484",
"HHA000011487",
"HHA000011490",
"HHA000011436",
"HHA000011466",
"HHA000010150",
"HHA000010147",
"HHA000012791",
"HHA000010141",
"HHA000010144",
"HHA000011469",
"HHA000010117",
"HHA000011507",
"HHA000011503",
"HHA000011515",
"HHA000011519",
"HHA000011523",
"HHA000010121",
"HHA000010125",
"HHA000010129",
"HHA000010133",
"HHA000010137",
"HHA000010113",
"HHA000011511",
"HHA000010109",
"HHA000010105",
"HHA000011499",
"HHA000010609",
"HHA000010614",
"HHA000010619",
"HHA000010624",
"HHA000010629",
"HHA000010634",
"HHA000012845",
"HHA000011553",
"HHA000009223",
"HHA000011547",
"HHA000011427",
"HHA000011421",
"HHA000011415",
"HHA000011409",
"HHA000011403",
"HHA000011391",
"HHA000009858",
"HHA000011379",
"HHA000011373",
"HHA000011367",
"HHA000011361",
"HHA000011349",
"HHA000011343",
"HHA000011337",
"HHA000010153",
"HHA000010159",
"HHA000011319",
"HHA000011313",
"HHA000011307",
"HHA000011301",
"HHA000010907",
"HHA000011289",
"HHA000011283",
"HHA000011277",
"HHA000011271",
"HHA000011265",
"HHA000011259",
"HHA000010955",
"HHA000010961",
"HHA000010967",
"HHA000010973",
"HHA000010979",
"HHA000010985",
"HHA000010991",
"HHA000010997",
"HHA000011003",
"HHA000011009",
"HHA000011015",
"HHA000011021",
"HHA000011151",
"HHA000011157",
"HHA000011163",
"HHA000011169",
"HHA000011175",
"HHA000011181",
"HHA000011187",
"HHA000011193",
"HHA000011199",
"HHA000011205",
"HHA000011211",
"HHA000011217",
"HHA000011223",
"HHA000011229",
"HHA000011235",
"HHA000011241",
"HHA000011247",
"HHA000011253",
"HHA000011295",
"HHA000011385",
"HHA000011331",
"HHA000011325",
"HHA000010948",
"HHA000010913",
"HHA000010920",
"HHA000010900",
"HHA000010893",
"HHA000010886",
"HHA000010934",
"HHA000010927",
"HHA000010941",
"HHA000009699",
"HHA000014201",
"HHA000013941",
"HHA000013971",
"HHA000013961",
"HHA000013951",
"HHA000014037",
"HHA000014027",
"HHA000009387",
"HHA000017006",
"HHA000016994",
"HHA000016982",
"HHA000016970",
"HHA000016958",
"HHA000016946",
"HHA000009859",
]
class ScrapeException(Exception):
    """Raised when a search-results page does not have the expected structure."""
class ISBNException(Exception):
    """Raised when an ISBN entry on a detail page contains no usable number."""
# Search
def searchURLForRegistrationNumber(registrationNumber):
    """Return the OPAC search URL that looks up one registration number.

    The query parameters mirror the library's detailed-search form; only
    searchKeyword6 (the REG_NO field) varies per call.
    """
    params = [
        ("searchType", "DETAIL"),
        ("searchCategory", "ALL"),
        ("searchKey1", "TITLE"),
        ("searchKeyword1", ""),
        ("searchOperator1", "AND"),
        ("searchKey2", "AUTHOR"),
        ("searchKeyword2", ""),
        ("searchOperator2", "AND"),
        ("searchKey3", "PUBLISHER"),
        ("searchKeyword3", ""),
        ("searchOperator3", "AND"),
        ("searchKey4", "KEYWORD"),
        ("searchKeyword4", ""),
        ("searchOperator4", "AND"),
        ("searchKey6", "REG_NO"),
        ("searchKeyword6", registrationNumber),
        ("searchOperator6", "AND"),
        ("searchKey5", "ISBN"),
        ("searchKeyword5", ""),
        ("searchOperator5", "AND"),
        ("searchPublishStartYear", ""),
        ("searchPublishEndYear", ""),
        ("searchLibrary", "MC"),
        ("searchRoom", "ALL"),
        ("searchSort", "KEY"),
        ("searchOrder", "DESC"),
        ("searchRecordCount", "10"),
    ]
    query = "&".join(f"{key}={value}" for key, value in params)
    return (
        "https://www.l4d.or.kr/yelc/menu/10441/program/30011"
        f"/plusSearchResultList.do?{query}"
    )
# Detail URL
def detailURLForBook(record, book):
    """Return the detail-page URL for the (recKey, bookKey) pair found by a search."""
    return (
        "https://www.l4d.or.kr/yelc/menu/10441/program/30011/plusSearchResultDetail.do"
        "?searchType=DETAIL"
        "&searchMenuCollectionCategory="
        "&searchCategory=ALL"
        "&searchKey="
        "&searchKey1=TITLE"
        "&searchKey2=AUTHOR"
        "&searchKey3=PUBLISHER"
        "&searchKey4=KEYWORD"
        "&searchKey5=ISBN"
        "&searchKeyword="
        "&searchKeyword1="
        "&searchKeyword2="
        "&searchKeyword3="
        "&searchKeyword4="
        "&searchKeyword5="
        "&searchOperator1=AND"
        "&searchOperator2=AND"
        "&searchOperator3=AND"
        "&searchOperator4=AND"
        "&searchOperator5=AND"
        "&searchPublishStartYear="
        "&searchPublishEndYear="
        "&searchLibrary=MC"
        "&searchLibraryArr=MC"
        "&searchRoom=ALL"
        "&searchKdc="
        "&searchIsbn="
        "&searchSort=KEY"
        "&searchOrder=DESC"
        "&searchRecordCount=10"
        "&currentPageNo=1"
        "&viewStatus=IMAGE"
        "&preSearchKey="
        "&preSearchKeyword="
        "&reSearchYn=N"
        f"&recKey={record}"
        f"&bookKey={book}"
        "&publishFormCode=BO"
        "&searchSeparateShelfCode="
    )
# Find record / book
def getRecordAndBookIDs(searchURL):
    """Fetch a search-results page and return its (recKey, bookKey) pair.

    searchURL -- URL built by searchURLForRegistrationNumber.
    Returns a 2-tuple of strings (record key, book key).
    Raises ScrapeException when the page does not contain exactly one result.
    """
    searchResponse = requests.get(searchURL, timeout=30)  # don't hang forever
    soup = BeautifulSoup(searchResponse.content, "html.parser")
    ul_element = soup.find("ul", {"class": "resultList imageType"})
    if ul_element is None:
        # BUG FIX: a zero-result page has no result list at all; previously
        # this crashed with AttributeError ("Unknown error" in the log)
        # instead of the intended ScrapeException path.
        raise ScrapeException("Result list not found on search page")
    li_elements = ul_element.find_all("li")
    if len(li_elements) != 1:
        raise ScrapeException(
            f"Expected exactly 1 <li> element, found {len(li_elements)}"
        )
    span_element = li_elements[0].find("span", {"class": "chk"})
    input_element = span_element.find("input")
    # The result checkbox packs several fields separated by '^';
    # the first two are the record key and the book key.
    data = input_element["value"]
    return tuple(data.split("^")[:2])
def parseDetail(detailURL):
    """Fetch a book detail page and parse the fields we need.

    Returns (title, detail, isbns, individuals) where
      title       -- <h4> heading text with trailing '/' stripped,
      detail      -- text of the '주기사항' (notes) row, or None when absent,
      isbns       -- list of (digits, extra) tuples parsed from the
                     '표준번호' (standard-number) row; empty when absent,
      individuals -- 4th <td> of every row in the holdings table
                     (per-copy registration numbers).
    Raises ISBNException when an ISBN entry contains no digits.
    """
    searchResponse = requests.get(detailURL, timeout=30)  # don't hang forever
    soup = BeautifulSoup(searchResponse.content, "html.parser")
    h4_element = soup.find("h4")
    title = h4_element.text.rstrip("/")
    try:
        detail_th_element = soup.find("th", {"scope": "row"}, string="주기사항")
        detail_td_element = detail_th_element.find_next_sibling("td")
        detail = detail_td_element.text
    except AttributeError:
        detail = None  # notes row absent on this page
    # Digits of the ISBN, then any immediately following non-space token
    # (e.g. a trailing check digit or qualifier).
    isbn_regex = re.compile(r"ISBN:?\s*(?P<isbn>\d*)(?P<extra>[^:\s]+)?")
    try:
        isbns_th_element = soup.find("th", {"scope": "row"}, string="표준번호")
        isbns_td_element = isbns_th_element.find_next_sibling("td")
        isbns_text = isbns_td_element.text
        isbns = []
        for m in isbn_regex.finditer(isbns_text):
            # BUG FIX: the digits group is r"\d*", which matches the empty
            # string but is never None, so the original `is None` check was
            # dead code and empty ISBNs slipped through. Test for emptiness.
            if not m["isbn"]:
                raise ISBNException(f"ISBN not found in {isbns_text}")
            isbns.append((m["isbn"], m["extra"]))
    except AttributeError:
        isbns = []  # standard-number row absent on this page
    individuals = []
    individuals_table = soup.find("table", {"class": "tbl"})
    individuals_tbody = individuals_table.find("tbody")
    for individual_tr in individuals_tbody.find_all("tr"):
        # Column index 3 of each holdings row carries the individual
        # registration number.
        individual_td = individual_tr.find_all("td")[3]
        individuals.append(individual_td.string)
    return (title, detail, isbns, individuals)
# Main driver: resolve each registration number to its detail page, then write
# one tab-separated row per individual copy to result.txt. Pages that cannot
# be processed automatically are logged to log.txt and opened in the browser
# for manual review.
with open("result.txt", "w") as fresult, open("log.txt", "w") as flog:
    fresult.write("Title\tRegistration Number\tISBN\tExtra\tDetail\tURL\n")
    flog.write("Registration Number\tURL\tTitle\tReason\n")
    total = len(registrationNumbers)
    for i, n in enumerate(registrationNumbers):
        if i % 10 == 0:
            print(f"Progress: [{i}/{total}] ({i / total * 100:.2f}%)")
        url = searchURLForRegistrationNumber(n)
        try:
            record, book = getRecordAndBookIDs(url)
        except ScrapeException:
            webbrowser.open_new_tab(url)
            flog.write(
                f"{n}\t{url}\t???\tSearch returned zero or more than one results\n"
            )
            continue
        except Exception as e:
            webbrowser.open_new_tab(url)
            flog.write(f"{n}\t{url}\t???\tUnknown error {e}\n")
            continue
        detailURL = detailURLForBook(record, book)
        # BUG FIX: 'title' was unbound on the first failure (NameError) and
        # stale from the previous book on later failures; give the handlers
        # below a well-defined placeholder.
        title = "???"
        try:
            title, detail, isbns, individuals = parseDetail(detailURL)
        except ISBNException:
            webbrowser.open_new_tab(detailURL)
            flog.write(f"{n}\t{detailURL}\t{title}\tISBN not found\n")
            continue
        except Exception as e:
            webbrowser.open_new_tab(detailURL)
            flog.write(f"{n}\t{detailURL}\t{title}\tUnknown error {e}\n")
            continue
        if not individuals:
            webbrowser.open_new_tab(detailURL)
            flog.write(
                f"{n}\t{detailURL}\t{title}\tIndividual registration numbers missing\n"
            )
        elif len(isbns) != len(individuals):
            webbrowser.open_new_tab(detailURL)
            flog.write(f"{n}\t{detailURL}\t{title}\tFull ISBNs are absent\n")
        else:
            if len(individuals) == 1:
                webbrowser.open_new_tab(detailURL)
                flog.write(
                    f"{n}\t{detailURL}\t{title}\tThere is only one individual book here. Please check!\n"
                )
            # BUG FIX: the inner loop previously re-used 'i' as its index,
            # clobbering the outer progress counter; zip pairs each copy with
            # its ISBN directly (lengths are equal in this branch).
            for registrationNumber, (isbn, extra) in zip(individuals, isbns):
                fresult.write(
                    f"{title}\t{registrationNumber}\t{isbn}\t{extra if extra is not None else ''}\t{detail if detail is not None else ''}\t{detailURL}\n"
                )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment