Created
February 8, 2024 16:06
-
-
Save christophmeissner/6f5418705a0a5b81fecb2802fd89c825 to your computer and use it in GitHub Desktop.
Extract document lineage (a.k.a. tree) information for a given paper from Leipzig's Ratsinformationssystem "ALLRIS" + OParl (Council Information System)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import logging | |
import re | |
from urllib.parse import urljoin | |
import requests | |
from lxml import etree, html | |
logger = logging.getLogger(__name__) | |
def fetch_document_lineage_data(
    paper_id,
    page_content=None,
    session_id=None,
    base_url="https://ratsinformation.leipzig.de/allris_leipzig_public/",
):
    """
    Fetch the document lineage table as an XML string for the given paper ID.

    Supports reusing a session ID and page content to avoid unnecessary HTTP
    requests:

    * If `page_content` AND `session_id` are given, the initial HTTP request
      is skipped.
    * If the given `session_id` is invalid (AJAX call fails or returns
      unusable data), the page HTML document is re-fetched in a second
      attempt to retrieve the data.

    Args:
        paper_id (int): The ID of the paper for which to fetch data.
        page_content (str, optional): The already loaded page text to avoid
            redundant requests. Defaults to None.
        session_id (str, optional): The session ID from the loaded page text.
            Defaults to None.
        base_url (str, optional): The base URL of the website from which to
            fetch the data. Defaults to the Leipzig information system URL.

    Returns:
        str: The document lineage XML data for the paper.

    Raises:
        Exception: If the AJAX URL cannot be found in the page, or if the
            AJAX response does not have a text/xml content type.
        requests.HTTPError: If an HTTP request fails (after the single retry
            with a fresh session).
    """
    url = urljoin(base=base_url, url=f"./vo020?VOLFDNR={paper_id}&refresh=false")
    with requests.Session() as session:
        if page_content is None or session_id is None:
            # Cold start: load the page to obtain its content and a session.
            logger.debug("Fetching page content and session id from %s", url)
            response = session.get(
                url, headers={"DNT": "1", "Accept-Encoding": "gzip, deflate, br"}
            )
            response.raise_for_status()
            page_content = response.text
        else:
            logger.debug(
                "Using pre-fetched page content and session id %s", session_id
            )
            session.cookies.set("JSESSIONID", session_id)
        # Regex pattern to find the AJAX URL embedded in the page's JavaScript
        pattern = r'"u"\s*:\s*"([^\"]*?)"'
        matches = re.findall(pattern, page_content)
        if not matches:
            raise Exception(
                "Could not find document lineage AJAX URL in HTML document."
            )
        # Assuming the relevant AJAX URL is the last match found
        relative_ajax_url = matches[-1]
        # Resolve the full AJAX URL relative to the base URL
        document_lineage_url = urljoin(base=url, url=relative_ajax_url)
        headers = {
            "Wicket-Ajax": "true",
            "Wicket-Ajax-BaseURL": f"vo020?VOLFDNR={paper_id}&refresh=false",
            "Accept": "application/xml, text/xml, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Referer": url,
            "DNT": "1",
        }
        ajax_response = session.get(document_lineage_url, headers=headers)
        if session_id is not None:
            # A reused session may be stale: the server then answers with an
            # error status or an 'Ajax-Location' redirect header. In that
            # case retry exactly once with a completely fresh session.
            try:
                ajax_response.raise_for_status()  # Ensure the request was successful
                if "Ajax-Location" in ajax_response.headers:
                    raise requests.HTTPError("'Ajax-Location' in headers")
            except requests.HTTPError as http_error:
                logger.debug(
                    "HTTP error occurred: %s. Retrying with fresh session...",
                    http_error,
                    extra=dict(ajax_response.headers),
                    exc_info=True,
                )
                # retry with fresh session
                return fetch_document_lineage_data(
                    paper_id=paper_id,
                    page_content=None,
                    session_id=None,
                    base_url=base_url,
                )
        else:
            ajax_response.raise_for_status()  # Ensure the request was successful
        # Check the response's content type. Guard against a missing
        # Content-Type header (headers.get returns None) so the intended
        # Exception is raised instead of an AttributeError on .startswith.
        content_type = ajax_response.headers.get("Content-Type")
        if not content_type or not content_type.startswith("text/xml"):
            raise Exception(
                f"Invalid response from {document_lineage_url}. Expected text/xml, "
                f"got {content_type}"
            )
        return ajax_response.text
def parse_page_lineage_xml(xml_data):
    """
    Parse XML data representing a document lineage table, extracting
    information about each document listed.

    The Wicket AJAX response carries an HTML table fragment inside the text
    of a ``<component>`` element; that fragment is parsed row by row.

    Args:
        xml_data (str): The XML data to be parsed.

    Returns:
        list: A list of dictionaries, each containing data extracted from
        one row of the document lineage table. Keys are derived from each
        cell's ``class`` attribute (falling back to the column header, or a
        positional name). The first cell of each row additionally yields
        ``ref_url`` (the link href, or None) and — when the href contains a
        ``VOLFDNR`` query parameter — ``ref_id``.

    Raises:
        Exception: If no <component> element is found in the XML.
    """
    tree = etree.fromstring(bytes(xml_data, "utf-8"))
    data_elements = tree.xpath("//component")
    if len(data_elements) == 0:
        raise Exception("Parsing failed. No components found")
    data_element = data_elements[0]
    # The component's text is the CDATA payload: an HTML table fragment.
    cdata_content = data_element.text
    document = html.fromstring(cdata_content)
    headers = [
        header.text_content().strip() for header in document.xpath("//thead/tr/th")
    ]
    rows = []
    for row in document.xpath("//tbody/tr"):
        row_data = {}
        for i, td in enumerate(row.findall("td")):
            # Prefer the cell's class attribute as key; fall back to the
            # column header, or a positional name when the row has more
            # cells than there are headers (avoids an IndexError on
            # malformed rows).
            fallback = headers[i] if i < len(headers) else f"column_{i}"
            key = (td.get("class") or fallback).replace(" ", "+").lower()
            value = td.text_content().strip()
            row_data[key] = value
            if i == 0:
                # The first cell links to the referenced paper.
                a_tag = td.find("*/a")
                href = a_tag.get("href") if a_tag is not None else None
                row_data["ref_url"] = href
                if href:
                    id_match = re.search(r"VOLFDNR=(\d+)", href)
                    row_data["ref_id"] = id_match.group(1) if id_match else None
        rows.append(row_data)
    logger.debug("Parsed data: %s", rows)
    return rows
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    arg_parser = argparse.ArgumentParser(
        description=(
            "Fetches the document lineage table as XML string for a specified "
            "paper based on the paper ID. It supports reusing a session ID and "
            "page content to avoid unnecessary HTTP requests: "
            "If `page_content` AND `session_id` are given, the initial HTTP "
            "request is skipped. "
            "If the given `session_id` is invalid (AJAX call fails or returns "
            "unusable data), it will re-fetch the page HTML document in the "
            "second attempt to retrieve data."
        )
    )
    arg_parser.add_argument(
        "paper_id",
        nargs="?",
        type=int,
        default=2015202,
        help="The paper ID to fetch the AJAX data for. Defaults to 2015202 if not specified.",
    )
    cli_args = arg_parser.parse_args()

    # Cold start: no cached page content or session id available, so the
    # fetcher performs the initial page request itself.
    document_lineage_data = fetch_document_lineage_data(
        paper_id=cli_args.paper_id, page_content=None, session_id=None
    )
    logger.debug(
        "Got XML data, parsing...",
        extra={"xml_data": document_lineage_data},
    )
    parsed_data = parse_page_lineage_xml(document_lineage_data)
    logger.debug(
        "Final parsed data: %s",
        parsed_data,
        extra={"result": parsed_data},
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment