Created
February 8, 2024 16:06
-
-
Save christophmeissner/6f5418705a0a5b81fecb2802fd89c825 to your computer and use it in GitHub Desktop.
Extract document lineage (a.k.a. tree) information for a given paper from Leipzig's Ratsinformationssystem "ALLRIS" + OParl (Council Information System)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import logging | |
import re | |
from urllib.parse import urljoin | |
import requests | |
from lxml import etree, html | |
logger = logging.getLogger(__name__) | |
def fetch_document_lineage_data(
    paper_id,
    page_content=None,
    session_id=None,
    base_url="https://ratsinformation.leipzig.de/allris_leipzig_public/",
):
    """
    Fetch the document lineage table as an XML string for the given paper ID.

    Supports reusing a session ID and page content to avoid unnecessary HTTP
    requests:

    * If `page_content` AND `session_id` are given, the initial HTTP request
      is skipped.
    * If the given `session_id` is invalid (AJAX call fails or returns
      unusable data), the page HTML document is re-fetched in a second
      attempt to retrieve the data.

    Args:
        paper_id (int): The ID of the paper for which to fetch data.
        page_content (str, optional): The already loaded page text to avoid
            redundant requests. Defaults to None.
        session_id (str, optional): The session ID from the loaded page text.
            Defaults to None.
        base_url (str, optional): The base URL of the website from which to
            fetch the data. Defaults to the Leipzig information system URL.

    Returns:
        str: The document lineage XML data for the paper.

    Raises:
        Exception: If the AJAX URL cannot be found in the page, or if the
            AJAX response does not have a text/xml content type.
        requests.HTTPError: If an HTTP request fails (after the single retry
            with a fresh session).
    """
    url = urljoin(base=base_url, url=f"./vo020?VOLFDNR={paper_id}&refresh=false")
    with requests.Session() as session:
        if page_content is None or session_id is None:
            # Cold start: load the page to obtain its content and a session.
            logger.debug("Fetching page content and session id from %s", url)
            response = session.get(
                url, headers={"DNT": "1", "Accept-Encoding": "gzip, deflate, br"}
            )
            response.raise_for_status()
            page_content = response.text
        else:
            logger.debug(
                "Using pre-fetched page content and session id %s", session_id
            )
            session.cookies.set("JSESSIONID", session_id)
        # Regex pattern to find the AJAX URL embedded in the page's JavaScript
        pattern = r'"u"\s*:\s*"([^\"]*?)"'
        matches = re.findall(pattern, page_content)
        if not matches:
            raise Exception(
                "Could not find document lineage AJAX URL in HTML document."
            )
        # Assuming the relevant AJAX URL is the last match found
        relative_ajax_url = matches[-1]
        # Resolve the full AJAX URL relative to the base URL
        document_lineage_url = urljoin(base=url, url=relative_ajax_url)
        headers = {
            "Wicket-Ajax": "true",
            "Wicket-Ajax-BaseURL": f"vo020?VOLFDNR={paper_id}&refresh=false",
            "Accept": "application/xml, text/xml, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Referer": url,
            "DNT": "1",
        }
        ajax_response = session.get(document_lineage_url, headers=headers)
        if session_id is not None:
            # A reused session may be stale: the server then answers with an
            # error status or an 'Ajax-Location' redirect header. In that
            # case retry exactly once with a completely fresh session.
            try:
                ajax_response.raise_for_status()  # Ensure the request was successful
                if "Ajax-Location" in ajax_response.headers:
                    raise requests.HTTPError("'Ajax-Location' in headers")
            except requests.HTTPError as http_error:
                logger.debug(
                    "HTTP error occurred: %s. Retrying with fresh session...",
                    http_error,
                    extra=dict(ajax_response.headers),
                    exc_info=True,
                )
                # retry with fresh session
                return fetch_document_lineage_data(
                    paper_id=paper_id,
                    page_content=None,
                    session_id=None,
                    base_url=base_url,
                )
        else:
            ajax_response.raise_for_status()  # Ensure the request was successful
        # Check the response's content type. Guard against a missing
        # Content-Type header (headers.get returns None) so the intended
        # Exception is raised instead of an AttributeError on .startswith.
        content_type = ajax_response.headers.get("Content-Type")
        if not content_type or not content_type.startswith("text/xml"):
            raise Exception(
                f"Invalid response from {document_lineage_url}. Expected text/xml, "
                f"got {content_type}"
            )
        return ajax_response.text
def parse_page_lineage_xml(xml_data):
    """
    Parse XML data representing a document lineage table, extracting
    information about each document listed.

    The Wicket AJAX response carries an HTML table fragment inside the text
    of a ``<component>`` element; that fragment is parsed row by row.

    Args:
        xml_data (str): The XML data to be parsed.

    Returns:
        list: A list of dictionaries, each containing data extracted from
        one row of the document lineage table. Keys are derived from each
        cell's ``class`` attribute (falling back to the column header, or a
        positional name). The first cell of each row additionally yields
        ``ref_url`` (the link href, or None) and — when the href contains a
        ``VOLFDNR`` query parameter — ``ref_id``.

    Raises:
        Exception: If no <component> element is found in the XML.
    """
    tree = etree.fromstring(bytes(xml_data, "utf-8"))
    data_elements = tree.xpath("//component")
    if len(data_elements) == 0:
        raise Exception("Parsing failed. No components found")
    data_element = data_elements[0]
    # The component's text is the CDATA payload: an HTML table fragment.
    cdata_content = data_element.text
    document = html.fromstring(cdata_content)
    headers = [
        header.text_content().strip() for header in document.xpath("//thead/tr/th")
    ]
    rows = []
    for row in document.xpath("//tbody/tr"):
        row_data = {}
        for i, td in enumerate(row.findall("td")):
            # Prefer the cell's class attribute as key; fall back to the
            # column header, or a positional name when the row has more
            # cells than there are headers (avoids an IndexError on
            # malformed rows).
            fallback = headers[i] if i < len(headers) else f"column_{i}"
            key = (td.get("class") or fallback).replace(" ", "+").lower()
            value = td.text_content().strip()
            row_data[key] = value
            if i == 0:
                # The first cell links to the referenced paper.
                a_tag = td.find("*/a")
                href = a_tag.get("href") if a_tag is not None else None
                row_data["ref_url"] = href
                if href:
                    id_match = re.search(r"VOLFDNR=(\d+)", href)
                    row_data["ref_id"] = id_match.group(1) if id_match else None
        rows.append(row_data)
    logger.debug("Parsed data: %s", rows)
    return rows
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    arg_parser = argparse.ArgumentParser(
        description=(
            "Fetches the document lineage table as XML string for a specified "
            "paper based on the paper ID. It supports reusing a session ID and "
            "page content to avoid unnecessary HTTP requests: "
            "If `page_content` AND `session_id` are given, the initial HTTP "
            "request is skipped. "
            "If the given `session_id` is invalid (AJAX call fails or returns "
            "unusable data), it will re-fetch the page HTML document in the "
            "second attempt to retrieve data."
        )
    )
    arg_parser.add_argument(
        "paper_id",
        nargs="?",
        type=int,
        default=2015202,
        help="The paper ID to fetch the AJAX data for. Defaults to 2015202 if not specified.",
    )
    cli_args = arg_parser.parse_args()

    # Cold start: no cached page content or session id available, so the
    # fetcher performs the initial page request itself.
    document_lineage_data = fetch_document_lineage_data(
        paper_id=cli_args.paper_id, page_content=None, session_id=None
    )
    logger.debug(
        "Got XML data, parsing...",
        extra={"xml_data": document_lineage_data},
    )
    parsed_data = parse_page_lineage_xml(document_lineage_data)
    logger.debug(
        "Final parsed data: %s",
        parsed_data,
        extra={"result": parsed_data},
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment