Skip to content

Instantly share code, notes, and snippets.

@m1yag1
Last active July 10, 2019 17:07
Show Gist options
  • Save m1yag1/1d6c4608b8b2d6cc3c56cb58c377ab79 to your computer and use it in GitHub Desktop.
Save m1yag1/1d6c4608b8b2d6cc3c56cb58c377ab79 to your computer and use it in GitHub Desktop.
archive_xpath_search
import csv
import os
import urllib
from datetime import datetime
# pip install requests
import requests
# Absolute path of the directory that contains this script.
HERE = os.path.abspath(os.path.dirname(__file__))

# Books to search. Each entry pairs a human-readable title with the book's
# `cnx_id`, the identifier sent as the `id` parameter of the xpath search.
# Every book is listed exactly once so no book is searched (and reported) twice.
BOOKS = [
    {'title': 'Prealgebra', 'cnx_id': 'caa57dab-41c7-455e-bd6f-f443cda5519c'},
    {'title': 'Elementary Algebra', 'cnx_id': '0889907c-f0ef-496a-bcb8-2a5bb121717f'},
    {'title': 'Intermediate Algebra', 'cnx_id': '02776133-d49d-49cb-bfaa-67c7f61b25a1'},
    {'title': 'College Algebra', 'cnx_id': '9b08c294-057f-4201-9f48-5d6ad992740d'},
    {'title': 'Algebra and Trigonometry', 'cnx_id': '13ac107a-f15f-49d2-97e8-60ab2e3b519c'},
    {'title': 'Precalculus', 'cnx_id': 'fd53eae1-fa23-47c7-bb1b-972349835c3c'},
    {'title': 'Calculus Volume 1', 'cnx_id': '8b89d172-2927-466f-8661-01abc7ccdba4'},
    {'title': 'Calculus Volume 2', 'cnx_id': '1d39a348-071f-4537-85b6-c98912458c3c'},
    {'title': 'Calculus Volume 3', 'cnx_id': 'a31cd793-2162-4e9e-acb5-6e6bbd76a5fa'},
    {'title': 'Introductory Statistics', 'cnx_id': '30189442-6998-4686-ac05-ed152b91b9de'},
    {'title': 'Introductory Business Statistics',
     'cnx_id': 'b56bb9e9-5eb8-48ef-9939-88b1b12ce22f'},
    {'title': 'Anatomy and Physiology', 'cnx_id': '14fb4ad7-39a1-4eee-ab6e-3ef2482e3e22'},
    {'title': 'Astronomy', 'cnx_id': '2e737be8-ea65-48c3-aa0a-9f35b4c6a966'},
    {'title': 'Biology', 'cnx_id': '185cbf87-c72e-48f5-b51e-f14f21b5eabd'},
    {'title': 'Biology 2e', 'cnx_id': '8d50a0af-948b-4204-a71d-4826cba765b8'},
    {'title': 'Concepts of Biology', 'cnx_id': 'b3c1e1d2-839c-42b0-a314-e119a8aafbdd'},
    {'title': 'Microbiology', 'cnx_id': 'e42bd376-624b-4c0f-972f-e0c57998e765'},
    {'title': 'Chemistry: Atoms First', 'cnx_id': '4539ae23-1ccc-421e-9b25-843acbb6c4b0'},
    {'title': 'University Physics Volume 1', 'cnx_id': 'd50f6e32-0fda-46ef-a362-9bd36ca7c97d'},
    {'title': 'University Physics Volume 2', 'cnx_id': '7a0f9770-1c44-4acd-9920-1cd9a99f2a1e'},
    {'title': 'University Physics Volume 3', 'cnx_id': 'af275420-6050-4707-995c-57b9cc13c358'},
    {'title': 'Biology for AP® Courses', 'cnx_id': '6c322e32-9fb0-4c4d-a1d7-20c95c5c7af2'},
    {'title': 'The AP Physics Collection', 'cnx_id': '8d04a686-d5e8-4798-a27d-c608e4d0e187'},
    {'title': 'Fizyka dla szkół wyższych. Tom 1',
     'cnx_id': '4eaa8f03-88a8-485a-a777-dd3602f6c13e'},
    {'title': 'Fizyka dla szkół wyższych. Tom 2',
     'cnx_id': '16ab5b96-4598-45f9-993c-b8d78d82b0c6'},
    {'title': 'Fizyka dla szkół wyższych. Tom 3',
     'cnx_id': 'bb62933e-f20a-4ffc-90aa-97b36c296c3e'},
    {'title': 'American Government', 'cnx_id': '5bcc0e59-7345-421d-8507-a1e4608685e8'},
    {'title': 'Principles of Economics 2e', 'cnx_id': 'bc498e1f-efe9-43a0-8dea-d3569ad09a82'},
    {'title': 'Principles of Macroeconomics 2e',
     'cnx_id': '27f59064-990e-48f1-b604-5188b9086c29'},
    {'title': 'Principles of Microeconomics 2e',
     'cnx_id': '5c09762c-b540-47d3-9541-dda1f44f16e5'},
    {'title': 'Psychology', 'cnx_id': '4abf04bf-93a0-45c3-9cbc-2cefd46e68cc'},
    {'title': 'Introduction to Sociology 2e',
     'cnx_id': '02040312-72c8-441e-a685-20e9333f3e1d'},
    {'title': 'Principles of Macroeconomics for AP® Courses 2e',
     'cnx_id': '9117cf8c-a8a3-4875-8361-9cb0f1fc9362'},
    {'title': 'Principles of Microeconomics for AP® Courses 2e',
     'cnx_id': '636cbfd9-4e37-4575-83ab-9dec9029ca4e'},
    {'title': 'U.S. History', 'cnx_id': 'a7ba2fb8-8925-4987-b182-5f4429d48daa'},
    {'title': 'Introduction to Business', 'cnx_id': '4e09771f-a8aa-40ce-9063-aa58cc24e77f'},
    {'title': 'Business Ethics', 'cnx_id': '914ac66e-e1ec-486d-8a9c-97b0f7a99774'},
    {'title': 'Principles of Accounting, Volume 2: Managerial Accounting',
     'cnx_id': '920d1c8a-606c-4888-bfd4-d1ee27ce1795'},
    {'title': 'Principles of Accounting, Volume 1: Financial Accounting',
     'cnx_id': '9ab4ba6d-1e48-486d-a2de-38ae1617ca84'},
]
def to_csv(fieldnames, collection, filename, datestamp=True):
    """Write dict rows to a CSV file using the Excel dialect.

    Args:
        fieldnames: Column names in output order; also written as the header.
        collection: Iterable of dicts, one per CSV row.
        filename: Output path without the ``.csv`` extension.
        datestamp: When true, ``-YYYYMMDD`` (the current local date) is
            appended to ``filename`` before the extension.

    Raises:
        ValueError: If a row holds a key that is not in ``fieldnames``
            (the default ``csv.DictWriter`` behaviour).
    """
    suffix = f"-{datetime.now().strftime('%Y%m%d')}" if datestamp else ""
    filename = f"{filename}{suffix}.csv"
    print(f"Saving csv file to {filename}")
    # newline='' hands line-ending control to the csv module (otherwise
    # Windows emits "\r\r\n"); an explicit utf-8 encoding keeps non-ASCII
    # cell text from failing on platforms whose default encoding is narrower.
    with open(filename, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames, dialect='excel')
        writer.writeheader()
        writer.writerows(collection)
def make_destination_folder(folder):
    """Ensure ``folder`` exists, creating missing parent directories too.

    Calling this for a directory that already exists is a no-op.

    Raises:
        FileExistsError: If ``folder`` exists but is not a directory.
    """
    # exist_ok=True avoids the race between a separate "does it exist?" check
    # and the actual creation.
    os.makedirs(folder, exist_ok=True)
def do_get_request(url, **kwargs):
    """GET ``url`` and return the response body decoded from JSON.

    Every keyword argument is forwarded as a query-string parameter.
    A 4xx/5xx response raises via ``raise_for_status()``.

    NOTE: no timeout is passed to ``requests.get``.
    """
    response = requests.get(url, params=kwargs)
    response.raise_for_status()
    return response.json()
def save_results(output_dir, results):
    """Write ``results`` to ``<output_dir>/search_results-<YYYYMMDD>.csv``.

    Args:
        output_dir: Directory for the CSV file; created when missing.
        results: Sequence of dicts, one per CSV row.

    The header is the union of the keys of all rows, in first-seen order, so
    a row carrying a key that the first row lacks no longer makes the CSV
    writer raise. Keys missing from a row are written as empty cells. When
    ``results`` is empty nothing is created or written.
    """
    if not results:
        return
    make_destination_folder(output_dir)
    result_path = os.path.join(output_dir, "search_results")
    # dict.fromkeys de-duplicates while preserving first-seen key order.
    fieldnames = list(dict.fromkeys(key for row in results for key in row))
    to_csv(fieldnames, results, result_path)
def add_additional_metadata(book_title, archive_host, results):
    """Annotate each search result with its book and a full URL.

    Args:
        book_title: Title stored under ``"book_title"`` in every result.
        archive_host: Host URL stored under ``"archive_host"`` and used as the
            base when resolving each result's ``"uri"``.
        results: Iterable of result dicts; each must contain a ``"uri"`` key.

    Returns:
        The same ``results`` object; its dicts are modified in place.

    Raises:
        KeyError: If a result has no ``"uri"`` key.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import urljoin

    for result in results:
        result["book_title"] = book_title
        result["archive_host"] = archive_host
        result["full_url"] = urljoin(archive_host, result["uri"])
    return results
def do_xpath_search(archive_url, cnx_id, xpath_query, type="html"):
    """Run an xpath search against ``archive_url`` and return the matches.

    The request carries ``id``, ``q`` and ``type`` as query parameters.

    QA and staging return their results in different formats, so both
    response shapes are accepted: a payload that contains a ``"results"`` key
    yields the value under that key, any other payload is returned as-is.

    Returns:
        The matches, or ``None`` when the selected value is empty/falsy.
    """
    payload = do_get_request(url=archive_url, id=cnx_id, q=xpath_query, type=type)
    if "results" in payload:
        # Wrapped shape: hand back the inner value, or None when it is empty.
        return payload["results"] or None
    # Bare shape: the payload itself is the result set.
    return payload or None
if __name__ == "__main__":
    archive_host = "https://archive-qa.cnx.org"
    xpath_search_url = f"{archive_host}/xpath.json"
    output_dir = os.path.join(HERE, "output")

    # Each expression selects elements of one tag that have no child nodes;
    # they are OR-ed together into a single XPath union query.
    xitems = [
        "//h:em[not(node())]",
        "//h:strong[not(node())]",
        "//h:sub[not(node())]",
        "//h:sup[not(node())]",
        "//h:iframe[not(node())]",
        "//h:span[not(node())]",
        "//h:h3[not(node())]",
    ]
    query = "|".join(xitems)

    all_rows = []
    # Slice BOOKS to target a subset, e.g. BOOKS[9:10] searches only
    # Introductory Statistics.
    for book in BOOKS:
        title = book["title"]
        uuid = book["cnx_id"]
        print(f"Searching [{title}] uuid: {uuid}")
        found = do_xpath_search(archive_url=xpath_search_url,
                                cnx_id=uuid,
                                xpath_query=query)
        if not found:
            print(f"No results found for {title}")
            continue
        print(f"{len(found)} results found.")
        all_rows.extend(add_additional_metadata(book_title=title,
                                                archive_host=archive_host,
                                                results=found))

    # Write one combined CSV after all books have been searched.
    if all_rows:
        print("Saving all result data")
        save_results(output_dir, all_rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment