Extract document lineage (a.k.a. tree) information for a given paper from Leipzig's Ratsinformationssystem "ALLRIS" (council information system) + OParl
import argparse
import logging
import re
from urllib.parse import urljoin

import requests
from lxml import etree, html

logger = logging.getLogger(__name__)

def fetch_document_lineage_data(
    paper_id,
    page_content=None,
    session_id=None,
    base_url="https://ratsinformation.leipzig.de/allris_leipzig_public/",
):
    """
    Fetches the document lineage table as an XML string for the paper with the given
    paper ID. It supports reusing a session ID and page content to avoid unnecessary
    HTTP requests:

    If `page_content` AND `session_id` are given, the initial HTTP request is skipped.
    If the given `session_id` is invalid (the AJAX call fails or returns unusable
    data), the page HTML document is re-fetched in a second attempt to retrieve the
    data.

    Args:
        paper_id (int): The ID of the paper for which to fetch data.
        page_content (str, optional): The already loaded page text to avoid redundant
            requests. Defaults to None.
        session_id (str, optional): The session ID from the loaded page text. Defaults
            to None.
        base_url (str, optional): The base URL of the website from which to fetch the
            data. Defaults to the Leipzig information system URL.

    Returns:
        str: The document lineage XML data for the paper.
    """
    url = urljoin(base=base_url, url=f"./vo020?VOLFDNR={paper_id}&refresh=false")
    with requests.Session() as session:
        if page_content is None or session_id is None:
            logger.debug(f"Fetching page content and session id from {url}")
            response = session.get(
                url, headers={"DNT": "1", "Accept-Encoding": "gzip, deflate, br"}
            )
            response.raise_for_status()
            page_content = response.text
        else:
            logger.debug(f"Using pre-fetched page content and session id {session_id}")
            session.cookies.set("JSESSIONID", session_id)

        # Regex pattern to find the AJAX URL embedded in the page's JavaScript
        pattern = r'"u"\s*:\s*"([^\"]*?)"'
        match = re.findall(pattern, page_content)
        if not match:
            raise Exception(
                "Could not find document lineage AJAX URL in HTML document."
            )

        # Assuming the relevant AJAX URL is the last match found
        relative_ajax_url = match[-1]
        # Resolve the full AJAX URL relative to the base URL
        document_lineage_url = urljoin(base=url, url=relative_ajax_url)

        headers = {
            "Wicket-Ajax": "true",
            "Wicket-Ajax-BaseURL": f"vo020?VOLFDNR={paper_id}&refresh=false",
            "Accept": "application/xml, text/xml, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Referer": url,
            "DNT": "1",
        }
        ajax_response = session.get(document_lineage_url, headers=headers)

        if session_id is not None:
            try:
                ajax_response.raise_for_status()  # Ensure the request was successful
                if "Ajax-Location" in ajax_response.headers:
                    raise requests.HTTPError("'Ajax-Location' in headers")
            except requests.HTTPError as http_error:
                logger.debug(
                    f"HTTP error occurred: {http_error}. Retrying with fresh "
                    f"session...",
                    extra=dict(ajax_response.headers),
                    exc_info=True,
                )
                # Retry with a fresh session
                return fetch_document_lineage_data(
                    paper_id=paper_id,
                    page_content=None,
                    session_id=None,
                    base_url=base_url,
                )
        else:
            ajax_response.raise_for_status()  # Ensure the request was successful

        # Check the response's content type
        content_type = ajax_response.headers.get("Content-Type")
        if not content_type.startswith("text/xml"):
            raise Exception(
                f"Invalid response from {document_lineage_url}. Expected text/xml, "
                f"got {content_type}"
            )
        return ajax_response.text
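
# A minimal usage sketch (not part of the original gist): when the paper page has
# already been loaded elsewhere, its HTML and the "JSESSIONID" cookie can be passed
# in to skip the initial request. The variable names and request details below are
# illustrative assumptions only, so the example is kept as comments.
#
#     with requests.Session() as s:
#         first = s.get(
#             "https://ratsinformation.leipzig.de/allris_leipzig_public/"
#             "vo020?VOLFDNR=2015202&refresh=false"
#         )
#         lineage_xml = fetch_document_lineage_data(
#             paper_id=2015202,
#             page_content=first.text,
#             session_id=s.cookies.get("JSESSIONID"),
#         )
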
def parse_page_lineage_xml(xml_data):
    """
    Parses XML data representing a document lineage table, extracting information about
    each document listed.

    Args:
        xml_data (str): The XML data to be parsed.

    Returns:
        list: A list of dictionaries, each containing data extracted from one row of
            the document lineage table.
    """
    tree = etree.fromstring(bytes(xml_data, "utf-8"))
    data_elements = tree.xpath("//component")
    if len(data_elements) == 0:
        raise Exception("Parsing failed. No components found")
    data_element = data_elements[0]
    cdata_content = data_element.text
    document = html.fromstring(cdata_content)
    headers = [
        header.text_content().strip() for header in document.xpath("//thead/tr/th")
    ]
    rows = []
    for row in document.xpath("//tbody/tr"):
        row_data = {}
        for i, td in enumerate(row.findall("td")):
            key = (td.get("class") or headers[i]).replace(" ", "+").lower()
            value = td.text_content().strip()
            row_data[key] = value
            if i == 0:
                a_tag = td.find("*/a")
                href = a_tag.get("href") if a_tag is not None else None
                row_data["ref_url"] = href
                if href:
                    id_match = re.search(r"VOLFDNR=(\d+)", href)
                    row_data["ref_id"] = id_match.group(1) if id_match else None
        rows.append(row_data)
    logger.debug(f"Parsed data: {rows}")
    return rows
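
# A hedged sketch of how the two functions above could be combined to follow the
# "ref_id" references found in the lineage table and collect lineages of related
# papers as well. The depth limit and the `seen` set are assumptions for
# illustration; the original gist only processes a single paper.
def fetch_related_lineages(paper_id, max_depth=1, seen=None):
    """Collect lineage rows for `paper_id` and, up to `max_depth` hops, for papers
    referenced via "ref_id" in its lineage table."""
    seen = set() if seen is None else seen
    if paper_id in seen:
        return {}
    seen.add(paper_id)
    rows = parse_page_lineage_xml(fetch_document_lineage_data(paper_id=paper_id))
    lineages = {paper_id: rows}
    if max_depth > 0:
        for row in rows:
            ref_id = row.get("ref_id")
            if ref_id and int(ref_id) not in seen:
                lineages.update(
                    fetch_related_lineages(int(ref_id), max_depth - 1, seen)
                )
    return lineages
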
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser(
        description="""\
        Fetches the document lineage table as an XML string for a specified paper
        based on the paper ID. It supports reusing a session ID and page content to
        avoid unnecessary HTTP requests: if `page_content` AND `session_id` are
        given, the initial HTTP request is skipped. If the given `session_id` is
        invalid (the AJAX call fails or returns unusable data), the page HTML
        document is re-fetched in a second attempt to retrieve the data."""
    )
    parser.add_argument(
        "paper_id",
        nargs="?",
        type=int,
        default=2015202,
        help="The paper ID to fetch the AJAX data for. Defaults to 2015202 if not specified.",
    )
    args = parser.parse_args()
    session_id = None
    page_content = None
    document_lineage_data = fetch_document_lineage_data(
        paper_id=args.paper_id, page_content=page_content, session_id=session_id
    )
    logger.debug(
        "Got XML data, parsing...",
        extra={"xml_data": document_lineage_data},
    )
    parsed_data = parse_page_lineage_xml(document_lineage_data)
    logger.debug(
        f"Final parsed data: {parsed_data}",
        extra={"result": parsed_data},
    )
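    # A small, hedged addition (not in the original gist): print the parsed rows as
    # JSON so the result is visible without relying on debug log output.
    import json

    print(json.dumps(parsed_data, indent=2, ensure_ascii=False))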