Skip to content

Instantly share code, notes, and snippets.

@suryadutta
Created December 1, 2022 22:23
Show Gist options
  • Save suryadutta/f15354157d0af742b52cd37521782954 to your computer and use it in GitHub Desktop.
Save suryadutta/f15354157d0af742b52cd37521782954 to your computer and use it in GitHub Desktop.
Scrapes release notes and their links from the Trino release notes pages.
import csv
import io
import sys
from dataclasses import dataclass
from typing import List

import requests
from bs4 import BeautifulSoup
# Ids of the <section> elements on a release page whose list items are
# collected; sections with any other id are ignored by the scraper.
SECTIONS_TO_SCRAPE = [
    "general",
    "security",
    "hive-connector",
    "iceberg-connector",
    "redshift-connector",
]
@dataclass
class ReleaseNote:
    """One release-note entry together with where it was found."""

    release_number: int  # release the note belongs to
    section: str  # name of the section the note was listed under
    text: str  # the note's text
    link: str  # link associated with the note

    def to_csv_line(self) -> str:
        """Return this note as a single CSV record without a trailing newline.

        Fields are emitted in the order release number, section, text, link.
        Line breaks inside a field are replaced by spaces so the record always
        occupies exactly one physical line. Commas and double quotes are kept
        and protected by standard CSV quoting instead of being stripped, so
        note text and URLs survive a round trip through a CSV reader.
        """
        fields = [
            str(self.release_number),
            self.section,
            self.text,
            self.link,
        ]
        # Callers append their own "\n" per record, so embedded line breaks
        # (including "\r\n" and bare "\r") must be flattened first.
        flattened = [
            value.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
            for value in fields
        ]
        buffer = io.StringIO()
        # An empty line terminator leaves the newline decision to the caller.
        csv.writer(buffer, lineterminator="").writerow(flattened)
        return buffer.getvalue()

    @staticmethod
    def get_csv_headers() -> str:
        """Return the header row that labels the columns of `to_csv_line`."""
        return "Release, Section, Text, Link"
def scrape_release(release_number: int) -> List[ReleaseNote]:
    """Download one Trino release page and extract its notes.

    Only sections whose id appears in ``SECTIONS_TO_SCRAPE`` are inspected;
    every ``<li>`` found inside such a section becomes one ``ReleaseNote``.

    Args:
        release_number: Number used to build the release page URL.

    Returns:
        The notes found, in section order and then page order. The list is
        empty when the page does not exist (HTTP 404) or when none of the
        configured sections are present.

    Raises:
        requests.HTTPError: The server answered with an error status other
            than 404.
        requests.RequestException: The request failed or timed out.
    """
    response = requests.get(
        f"https://trino.io/docs/current/release/release-{release_number}.html",
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )
    # A missing release simply contributes no notes; any other error status
    # is surfaced instead of parsing an error page as if it were the release.
    if response.status_code == 404:
        return []
    response.raise_for_status()

    soup = BeautifulSoup(response.text, features="html.parser")
    release_notes: List[ReleaseNote] = []
    for section in SECTIONS_TO_SCRAPE:
        section_elem = soup.find("section", id=section)
        if section_elem is None:
            continue
        for list_elem in section_elem.find_all("li"):
            # Not every list item wraps its text in <p>; fall back to the
            # text of the item itself rather than failing on None.
            paragraph = list_elem.find("p")
            text_source = paragraph if paragraph is not None else list_elem
            # Items without a link (or with an anchor lacking href) get an
            # empty link so the note itself is still recorded.
            anchor = list_elem.find("a")
            href = anchor.get("href") if anchor is not None else None
            release_notes.append(
                ReleaseNote(
                    release_number=release_number,
                    section=section,
                    text=text_source.text.replace("\n", " "),
                    link=href or "",
                )
            )
    return release_notes
def main() -> None:
    """Scrape a range of releases and write their notes to a CSV file.

    Reads the first and last release numbers (inclusive) from the first two
    command-line arguments and writes the header row followed by one line per
    note to ``Trino_release_notes_<start>_to_<end>.csv`` in the current
    directory. Exits with a usage message when the arguments are missing or
    are not integers.
    """
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} START_RELEASE END_RELEASE")
    try:
        start_release = int(sys.argv[1])
        end_release = int(sys.argv[2])
    except ValueError:
        sys.exit("START_RELEASE and END_RELEASE must be integers")

    output_filename = f"Trino_release_notes_{start_release}_to_{end_release}.csv"
    # An explicit encoding keeps non-ASCII note text from failing on
    # platforms whose default encoding is not UTF-8.
    with open(output_filename, "w", encoding="utf-8") as output:
        output.write(ReleaseNote.get_csv_headers() + "\n")
        for release_number in range(start_release, end_release + 1):
            for release_note in scrape_release(release_number):
                output.write(release_note.to_csv_line() + "\n")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment