# m1yag1/archive_xpath_search.py

## archive_xpath_search.py
import csv
import os
import urllib
from datetime import datetime

# pip install requests
import requests

# Directory containing this script; the "output" folder for the CSV is
# created beneath it.
HERE = os.path.abspath(os.path.dirname(__file__))

# Books to search: a display title plus the book's ``cnx_id``, which is sent
# as the ``id`` query parameter of the XPath search request.
# Each cnx_id must be listed only once; a repeated entry makes the script
# search the same book twice and duplicates its rows in the CSV output.
BOOKS = [
    {'title': 'Prealgebra', 'cnx_id': 'caa57dab-41c7-455e-bd6f-f443cda5519c'},
    {'title': 'Elementary Algebra', 'cnx_id': '0889907c-f0ef-496a-bcb8-2a5bb121717f'},
    {'title': 'Intermediate Algebra', 'cnx_id': '02776133-d49d-49cb-bfaa-67c7f61b25a1'},
    {'title': 'College Algebra', 'cnx_id': '9b08c294-057f-4201-9f48-5d6ad992740d'},
    {'title': 'Algebra and Trigonometry', 'cnx_id': '13ac107a-f15f-49d2-97e8-60ab2e3b519c'},
    {'title': 'Precalculus', 'cnx_id': 'fd53eae1-fa23-47c7-bb1b-972349835c3c'},
    {'title': 'Calculus Volume 1', 'cnx_id': '8b89d172-2927-466f-8661-01abc7ccdba4'},
    {'title': 'Calculus Volume 2', 'cnx_id': '1d39a348-071f-4537-85b6-c98912458c3c'},
    {'title': 'Calculus Volume 3', 'cnx_id': 'a31cd793-2162-4e9e-acb5-6e6bbd76a5fa'},
    {'title': 'Introductory Statistics', 'cnx_id': '30189442-6998-4686-ac05-ed152b91b9de'},
    {'title': 'Introductory Business Statistics',
     'cnx_id': 'b56bb9e9-5eb8-48ef-9939-88b1b12ce22f'},
    {'title': 'Anatomy and Physiology', 'cnx_id': '14fb4ad7-39a1-4eee-ab6e-3ef2482e3e22'},
    {'title': 'Astronomy', 'cnx_id': '2e737be8-ea65-48c3-aa0a-9f35b4c6a966'},
    {'title': 'Biology', 'cnx_id': '185cbf87-c72e-48f5-b51e-f14f21b5eabd'},
    {'title': 'Biology 2e', 'cnx_id': '8d50a0af-948b-4204-a71d-4826cba765b8'},
    {'title': 'Concepts of Biology', 'cnx_id': 'b3c1e1d2-839c-42b0-a314-e119a8aafbdd'},
    {'title': 'Microbiology', 'cnx_id': 'e42bd376-624b-4c0f-972f-e0c57998e765'},
    {'title': 'Chemistry: Atoms First', 'cnx_id': '4539ae23-1ccc-421e-9b25-843acbb6c4b0'},
    {'title': 'University Physics Volume 1', 'cnx_id': 'd50f6e32-0fda-46ef-a362-9bd36ca7c97d'},
    {'title': 'University Physics Volume 2', 'cnx_id': '7a0f9770-1c44-4acd-9920-1cd9a99f2a1e'},
    {'title': 'University Physics Volume 3', 'cnx_id': 'af275420-6050-4707-995c-57b9cc13c358'},
    {'title': 'Biology for AP® Courses', 'cnx_id': '6c322e32-9fb0-4c4d-a1d7-20c95c5c7af2'},
    {'title': 'The AP Physics Collection', 'cnx_id': '8d04a686-d5e8-4798-a27d-c608e4d0e187'},
    {'title': 'Fizyka dla szkół wyższych. Tom 1',
     'cnx_id': '4eaa8f03-88a8-485a-a777-dd3602f6c13e'},
    {'title': 'Fizyka dla szkół wyższych. Tom 2',
     'cnx_id': '16ab5b96-4598-45f9-993c-b8d78d82b0c6'},
    {'title': 'Fizyka dla szkół wyższych. Tom 3',
     'cnx_id': 'bb62933e-f20a-4ffc-90aa-97b36c296c3e'},
    {'title': 'American Government', 'cnx_id': '5bcc0e59-7345-421d-8507-a1e4608685e8'},
    {'title': 'Principles of Economics 2e', 'cnx_id': 'bc498e1f-efe9-43a0-8dea-d3569ad09a82'},
    {'title': 'Principles of Macroeconomics 2e',
     'cnx_id': '27f59064-990e-48f1-b604-5188b9086c29'},
    {'title': 'Principles of Microeconomics 2e',
     'cnx_id': '5c09762c-b540-47d3-9541-dda1f44f16e5'},
    {'title': 'Psychology', 'cnx_id': '4abf04bf-93a0-45c3-9cbc-2cefd46e68cc'},
    {'title': 'Introduction to Sociology 2e',
     'cnx_id': '02040312-72c8-441e-a685-20e9333f3e1d'},
    {'title': 'Principles of Macroeconomics for AP® Courses 2e',
     'cnx_id': '9117cf8c-a8a3-4875-8361-9cb0f1fc9362'},
    {'title': 'Principles of Microeconomics for AP® Courses 2e',
     'cnx_id': '636cbfd9-4e37-4575-83ab-9dec9029ca4e'},
    {'title': 'U.S. History', 'cnx_id': 'a7ba2fb8-8925-4987-b182-5f4429d48daa'},
    {'title': 'Introduction to Business', 'cnx_id': '4e09771f-a8aa-40ce-9063-aa58cc24e77f'},
    {'title': 'Business Ethics', 'cnx_id': '914ac66e-e1ec-486d-8a9c-97b0f7a99774'},
    {'title': 'Principles of Accounting, Volume 2: Managerial Accounting',
     'cnx_id': '920d1c8a-606c-4888-bfd4-d1ee27ce1795'},
    {'title': 'Principles of Accounting, Volume 1: Financial Accounting',
     'cnx_id': '9ab4ba6d-1e48-486d-a2de-38ae1617ca84'},
]


def to_csv(fieldnames, collection, filename, datestamp=True):
    """Write ``collection`` to ``<filename>.csv`` using the Excel CSV dialect.

    Args:
        fieldnames: Column names in output order; written as the header row.
        collection: Iterable of dicts, one per row, keyed by ``fieldnames``.
        filename: Output path *without* the ``.csv`` extension.
        datestamp: When true (the default), ``-YYYYMMDD`` for the current
            local date is appended to ``filename`` before the extension.

    Raises:
        ValueError: From ``csv.DictWriter`` if a row contains a key that is
            not listed in ``fieldnames``.
    """
    suffix = f"-{datetime.now().strftime('%Y%m%d')}" if datestamp else ""
    filename = f"{filename}{suffix}.csv"

    print(f"Saving csv file to {filename}")

    # newline="" lets the csv module control line endings (without it Windows
    # gets an extra blank line after every row).  UTF-8 is set explicitly
    # because the written values (e.g. book titles) contain non-ASCII text.
    with open(filename, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.DictWriter(outfile, fieldnames, dialect="excel")
        writer.writeheader()
        writer.writerows(collection)


def make_destination_folder(folder):
    """Create ``folder`` (and any missing parents) if it does not exist yet.

    Args:
        folder: Path of the directory to create.

    Raises:
        FileExistsError: If ``folder`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing isdir() first.
    os.makedirs(folder, exist_ok=True)


def do_get_request(url, **kwargs):
    """Issue a GET request and return the decoded JSON body.

    Args:
        url: Address to request.
        **kwargs: Sent as the query-string parameters of the request.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the response carries an error status.
    """
    response = requests.get(url, params=kwargs)
    # Fail loudly on an error status instead of trying to decode its body.
    response.raise_for_status()
    return response.json()


def save_results(output_dir, results):
    """Write all search results to a date-stamped CSV inside ``output_dir``.

    ``output_dir`` is created first if needed; the file is written through
    ``to_csv`` with the base name ``search_results``.

    Args:
        output_dir: Directory that receives the CSV file.
        results: List of result dicts, one per CSV row.
    """
    if not results:
        # Nothing to write; indexing results[0] would raise IndexError.
        print("No results to save")
        return

    make_destination_folder(output_dir)
    result_path = os.path.join(output_dir, "search_results")

    # Use the ordered union of every row's keys rather than only the first
    # row's: csv.DictWriter raises ValueError for a key missing from the
    # header, and leaves a cell empty when a row lacks a column.
    fieldnames = list(dict.fromkeys(key for row in results for key in row))
    to_csv(fieldnames, results, result_path)


def add_additional_metadata(book_title, archive_host, results):
    """Tag each search result with its book and an absolute URL.

    Every dict in ``results`` is updated in place with ``book_title``,
    ``archive_host`` and ``full_url`` (``archive_host`` joined with the
    result's ``uri``).

    Args:
        book_title: Title stored under ``book_title``.
        archive_host: Base URL stored under ``archive_host`` and used to
            resolve each ``uri``.
        results: List of result dicts; each must contain a ``uri`` key.

    Returns:
        The same ``results`` list that was passed in.

    Raises:
        KeyError: If a result has no ``uri`` key.
    """
    # Imported explicitly: a bare ``import urllib`` does not load the
    # ``urllib.parse`` submodule, so ``urllib.parse.urljoin`` only works when
    # some other module happens to have imported it already.
    from urllib.parse import urljoin

    for result in results:
        result.update(
            book_title=book_title,
            archive_host=archive_host,
            full_url=urljoin(archive_host, result["uri"]),
        )

    return results


def do_xpath_search(archive_url, cnx_id, xpath_query, type="html"):
    """Run an XPath query against one book and return the matches.

    Args:
        archive_url: URL of the XPath search endpoint.
        cnx_id: Identifier of the book, sent as the ``id`` parameter.
        xpath_query: XPath expression, sent as the ``q`` parameter.
        type: Value of the ``type`` parameter; defaults to ``"html"``.
            (The name shadows the builtin but is kept because callers may
            pass it by keyword.)

    Returns:
        The search results, or ``None`` when nothing was found.
    """
    def handle_search_results(response):
        """Normalise the two response layouts to a single return value.

        QA and staging return their results in different formats: one wraps
        the matches under a ``"results"`` key, the other returns them
        directly as the response body.
        """
        if not response:
            # Empty body or JSON null: no matches.  (A None response would
            # otherwise raise TypeError in the membership test below.)
            return None
        if "results" in response:
            # Wrapped layout; an empty "results" value also means no matches.
            return response["results"] or None
        # Unwrapped layout: the body itself is the result collection.
        return response

    response = do_get_request(url=archive_url, id=cnx_id, q=xpath_query,
                              type=type)
    return handle_search_results(response)


if __name__ == "__main__":
    archive_host = "https://archive-qa.cnx.org"
    xpath_search_url = f"{archive_host}/xpath.json"
    output_dir = os.path.join(HERE, "output")

    # Each expression matches one element type that has no child nodes at
    # all, i.e. an empty tag.
    empty_element_xpaths = (
        "//h:em[not(node())]",
        "//h:strong[not(node())]",
        "//h:sub[not(node())]",
        "//h:sup[not(node())]",
        "//h:iframe[not(node())]",
        "//h:span[not(node())]",
        "//h:h3[not(node())]",
    )
    # "|" is the XPath union operator, so one request covers every pattern.
    q = "|".join(empty_element_xpaths)

    results_data = []

    # To target specific books, slice BOOKS here; for example BOOKS[9:10]
    # searches only Introductory Statistics.
    for book in BOOKS:
        title = book["title"]
        print(f"Searching [{title}] uuid: {book['cnx_id']}")
        found = do_xpath_search(archive_url=xpath_search_url,
                                cnx_id=book["cnx_id"],
                                xpath_query=q)
        if not found:
            print(f"No results found for {title}")
            continue

        print(f"{len(found)} results found.")
        results_data.extend(
            add_additional_metadata(book_title=title,
                                    archive_host=archive_host,
                                    results=found))

    if results_data:
        print("Saving all result data")
        save_results(output_dir, results_data)
	import csv
	import os
	import urllib
	from datetime import datetime

	# pip install requests
	import requests

	# Directory containing this script; the "output" folder for the CSV is
	# created beneath it.
	HERE = os.path.abspath(os.path.dirname(__file__))

	# Books to search: a display title plus the book's ``cnx_id``, which is sent
	# as the ``id`` query parameter of the XPath search request.
	# Each cnx_id must be listed only once; a repeated entry makes the script
	# search the same book twice and duplicates its rows in the CSV output.
	BOOKS = [
	    {'title': 'Prealgebra', 'cnx_id': 'caa57dab-41c7-455e-bd6f-f443cda5519c'},
	    {'title': 'Elementary Algebra', 'cnx_id': '0889907c-f0ef-496a-bcb8-2a5bb121717f'},
	    {'title': 'Intermediate Algebra', 'cnx_id': '02776133-d49d-49cb-bfaa-67c7f61b25a1'},
	    {'title': 'College Algebra', 'cnx_id': '9b08c294-057f-4201-9f48-5d6ad992740d'},
	    {'title': 'Algebra and Trigonometry', 'cnx_id': '13ac107a-f15f-49d2-97e8-60ab2e3b519c'},
	    {'title': 'Precalculus', 'cnx_id': 'fd53eae1-fa23-47c7-bb1b-972349835c3c'},
	    {'title': 'Calculus Volume 1', 'cnx_id': '8b89d172-2927-466f-8661-01abc7ccdba4'},
	    {'title': 'Calculus Volume 2', 'cnx_id': '1d39a348-071f-4537-85b6-c98912458c3c'},
	    {'title': 'Calculus Volume 3', 'cnx_id': 'a31cd793-2162-4e9e-acb5-6e6bbd76a5fa'},
	    {'title': 'Introductory Statistics', 'cnx_id': '30189442-6998-4686-ac05-ed152b91b9de'},
	    {'title': 'Introductory Business Statistics',
	     'cnx_id': 'b56bb9e9-5eb8-48ef-9939-88b1b12ce22f'},
	    {'title': 'Anatomy and Physiology', 'cnx_id': '14fb4ad7-39a1-4eee-ab6e-3ef2482e3e22'},
	    {'title': 'Astronomy', 'cnx_id': '2e737be8-ea65-48c3-aa0a-9f35b4c6a966'},
	    {'title': 'Biology', 'cnx_id': '185cbf87-c72e-48f5-b51e-f14f21b5eabd'},
	    {'title': 'Biology 2e', 'cnx_id': '8d50a0af-948b-4204-a71d-4826cba765b8'},
	    {'title': 'Concepts of Biology', 'cnx_id': 'b3c1e1d2-839c-42b0-a314-e119a8aafbdd'},
	    {'title': 'Microbiology', 'cnx_id': 'e42bd376-624b-4c0f-972f-e0c57998e765'},
	    {'title': 'Chemistry: Atoms First', 'cnx_id': '4539ae23-1ccc-421e-9b25-843acbb6c4b0'},
	    {'title': 'University Physics Volume 1', 'cnx_id': 'd50f6e32-0fda-46ef-a362-9bd36ca7c97d'},
	    {'title': 'University Physics Volume 2', 'cnx_id': '7a0f9770-1c44-4acd-9920-1cd9a99f2a1e'},
	    {'title': 'University Physics Volume 3', 'cnx_id': 'af275420-6050-4707-995c-57b9cc13c358'},
	    {'title': 'Biology for AP® Courses', 'cnx_id': '6c322e32-9fb0-4c4d-a1d7-20c95c5c7af2'},
	    {'title': 'The AP Physics Collection', 'cnx_id': '8d04a686-d5e8-4798-a27d-c608e4d0e187'},
	    {'title': 'Fizyka dla szkół wyższych. Tom 1',
	     'cnx_id': '4eaa8f03-88a8-485a-a777-dd3602f6c13e'},
	    {'title': 'Fizyka dla szkół wyższych. Tom 2',
	     'cnx_id': '16ab5b96-4598-45f9-993c-b8d78d82b0c6'},
	    {'title': 'Fizyka dla szkół wyższych. Tom 3',
	     'cnx_id': 'bb62933e-f20a-4ffc-90aa-97b36c296c3e'},
	    {'title': 'American Government', 'cnx_id': '5bcc0e59-7345-421d-8507-a1e4608685e8'},
	    {'title': 'Principles of Economics 2e', 'cnx_id': 'bc498e1f-efe9-43a0-8dea-d3569ad09a82'},
	    {'title': 'Principles of Macroeconomics 2e',
	     'cnx_id': '27f59064-990e-48f1-b604-5188b9086c29'},
	    {'title': 'Principles of Microeconomics 2e',
	     'cnx_id': '5c09762c-b540-47d3-9541-dda1f44f16e5'},
	    {'title': 'Psychology', 'cnx_id': '4abf04bf-93a0-45c3-9cbc-2cefd46e68cc'},
	    {'title': 'Introduction to Sociology 2e',
	     'cnx_id': '02040312-72c8-441e-a685-20e9333f3e1d'},
	    {'title': 'Principles of Macroeconomics for AP® Courses 2e',
	     'cnx_id': '9117cf8c-a8a3-4875-8361-9cb0f1fc9362'},
	    {'title': 'Principles of Microeconomics for AP® Courses 2e',
	     'cnx_id': '636cbfd9-4e37-4575-83ab-9dec9029ca4e'},
	    {'title': 'U.S. History', 'cnx_id': 'a7ba2fb8-8925-4987-b182-5f4429d48daa'},
	    {'title': 'Introduction to Business', 'cnx_id': '4e09771f-a8aa-40ce-9063-aa58cc24e77f'},
	    {'title': 'Business Ethics', 'cnx_id': '914ac66e-e1ec-486d-8a9c-97b0f7a99774'},
	    {'title': 'Principles of Accounting, Volume 2: Managerial Accounting',
	     'cnx_id': '920d1c8a-606c-4888-bfd4-d1ee27ce1795'},
	    {'title': 'Principles of Accounting, Volume 1: Financial Accounting',
	     'cnx_id': '9ab4ba6d-1e48-486d-a2de-38ae1617ca84'},
	]


	def to_csv(fieldnames, collection, filename, datestamp=True):
	    """Write ``collection`` to ``<filename>.csv`` using the Excel CSV dialect.

	    Args:
	        fieldnames: Column names in output order; written as the header row.
	        collection: Iterable of dicts, one per row, keyed by ``fieldnames``.
	        filename: Output path *without* the ``.csv`` extension.
	        datestamp: When true (the default), ``-YYYYMMDD`` for the current
	            local date is appended to ``filename`` before the extension.

	    Raises:
	        ValueError: From ``csv.DictWriter`` if a row contains a key that is
	            not listed in ``fieldnames``.
	    """
	    suffix = f"-{datetime.now().strftime('%Y%m%d')}" if datestamp else ""
	    filename = f"{filename}{suffix}.csv"

	    print(f"Saving csv file to {filename}")

	    # newline="" lets the csv module control line endings (without it Windows
	    # gets an extra blank line after every row).  UTF-8 is set explicitly
	    # because the written values (e.g. book titles) contain non-ASCII text.
	    with open(filename, "w", newline="", encoding="utf-8") as outfile:
	        writer = csv.DictWriter(outfile, fieldnames, dialect="excel")
	        writer.writeheader()
	        writer.writerows(collection)


	def make_destination_folder(folder):
	    """Create ``folder`` (and any missing parents) if it does not exist yet.

	    Args:
	        folder: Path of the directory to create.

	    Raises:
	        FileExistsError: If ``folder`` exists but is not a directory.
	    """
	    # exist_ok=True avoids the check-then-create race of testing isdir() first.
	    os.makedirs(folder, exist_ok=True)


	def do_get_request(url, **kwargs):
	    """Issue a GET request and return the decoded JSON body.

	    Args:
	        url: Address to request.
	        **kwargs: Sent as the query-string parameters of the request.

	    Returns:
	        The response body parsed as JSON.

	    Raises:
	        requests.HTTPError: If the response carries an error status.
	    """
	    response = requests.get(url, params=kwargs)
	    # Fail loudly on an error status instead of trying to decode its body.
	    response.raise_for_status()
	    return response.json()


	def save_results(output_dir, results):
	    """Write all search results to a date-stamped CSV inside ``output_dir``.

	    ``output_dir`` is created first if needed; the file is written through
	    ``to_csv`` with the base name ``search_results``.

	    Args:
	        output_dir: Directory that receives the CSV file.
	        results: List of result dicts, one per CSV row.
	    """
	    if not results:
	        # Nothing to write; indexing results[0] would raise IndexError.
	        print("No results to save")
	        return

	    make_destination_folder(output_dir)
	    result_path = os.path.join(output_dir, "search_results")

	    # Use the ordered union of every row's keys rather than only the first
	    # row's: csv.DictWriter raises ValueError for a key missing from the
	    # header, and leaves a cell empty when a row lacks a column.
	    fieldnames = list(dict.fromkeys(key for row in results for key in row))
	    to_csv(fieldnames, results, result_path)


	def add_additional_metadata(book_title, archive_host, results):
	    """Tag each search result with its book and an absolute URL.

	    Every dict in ``results`` is updated in place with ``book_title``,
	    ``archive_host`` and ``full_url`` (``archive_host`` joined with the
	    result's ``uri``).

	    Args:
	        book_title: Title stored under ``book_title``.
	        archive_host: Base URL stored under ``archive_host`` and used to
	            resolve each ``uri``.
	        results: List of result dicts; each must contain a ``uri`` key.

	    Returns:
	        The same ``results`` list that was passed in.

	    Raises:
	        KeyError: If a result has no ``uri`` key.
	    """
	    # Imported explicitly: a bare ``import urllib`` does not load the
	    # ``urllib.parse`` submodule, so ``urllib.parse.urljoin`` only works when
	    # some other module happens to have imported it already.
	    from urllib.parse import urljoin

	    for result in results:
	        result.update(
	            book_title=book_title,
	            archive_host=archive_host,
	            full_url=urljoin(archive_host, result["uri"]),
	        )

	    return results


	def do_xpath_search(archive_url, cnx_id, xpath_query, type="html"):
	    """Run an XPath query against one book and return the matches.

	    Args:
	        archive_url: URL of the XPath search endpoint.
	        cnx_id: Identifier of the book, sent as the ``id`` parameter.
	        xpath_query: XPath expression, sent as the ``q`` parameter.
	        type: Value of the ``type`` parameter; defaults to ``"html"``.
	            (The name shadows the builtin but is kept because callers may
	            pass it by keyword.)

	    Returns:
	        The search results, or ``None`` when nothing was found.
	    """
	    def handle_search_results(response):
	        """Normalise the two response layouts to a single return value.

	        QA and staging return their results in different formats: one wraps
	        the matches under a ``"results"`` key, the other returns them
	        directly as the response body.
	        """
	        if not response:
	            # Empty body or JSON null: no matches.  (A None response would
	            # otherwise raise TypeError in the membership test below.)
	            return None
	        if "results" in response:
	            # Wrapped layout; an empty "results" value also means no matches.
	            return response["results"] or None
	        # Unwrapped layout: the body itself is the result collection.
	        return response

	    response = do_get_request(url=archive_url, id=cnx_id, q=xpath_query,
	                              type=type)
	    return handle_search_results(response)


	if __name__ == "__main__":
	    archive_host = "https://archive-qa.cnx.org"
	    xpath_search_url = f"{archive_host}/xpath.json"
	    output_dir = os.path.join(HERE, "output")

	    # Each expression matches one element type that has no child nodes at
	    # all, i.e. an empty tag.
	    xitems = [
	        "//h:em[not(node())]",
	        "//h:strong[not(node())]",
	        "//h:sub[not(node())]",
	        "//h:sup[not(node())]",
	        "//h:iframe[not(node())]",
	        "//h:span[not(node())]",
	        "//h:h3[not(node())]",
	    ]

	    # "|" is the XPath union operator, so one request covers every pattern.
	    # The separator must be a bare pipe: "\|" would put a literal backslash
	    # into the query and break the union.
	    q = "|".join(xitems)

	    results_data = []

	    # To target specific books, slice BOOKS here; for example BOOKS[9:10]
	    # searches only Introductory Statistics.
	    for book in BOOKS:
	        print(f"Searching [{book['title']}] uuid: {book['cnx_id']}")
	        results = do_xpath_search(archive_url=xpath_search_url,
	                                  cnx_id=book["cnx_id"],
	                                  xpath_query=q)
	        if not results:
	            print(f"No results found for {book['title']}")
	            continue

	        print(f"{len(results)} results found.")
	        results = add_additional_metadata(book_title=book["title"],
	                                          archive_host=archive_host,
	                                          results=results)
	        results_data.extend(results)

	    if results_data:
	        print("Saving all result data")
	        save_results(output_dir, results_data)