Created
December 30, 2023 21:13
-
-
Save chitoge/8dcf846edaa14d30b3c29b5e6f2bead3 to your computer and use it in GitHub Desktop.
Downloads Visual Studio 2010 help files to disk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
import requests | |
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin, urlparse | |
# Root of the (retired) MSDN content service API serving the VS 2010 help catalog.
base_remote_url = "http://services.mtps.microsoft.com/ServiceAPI/products"
# NOTE(review): appears unused below — files are written under per-group folders
# next to this script instead; confirm before relying on it.
output_directory = "./downloaded_files"
# Locales to strip from the catalog so only the remaining (en-us) content is kept.
exclude_languages = ["fr-fr", "pt-br", "es-es", "pl-pl", "de-de", "cs-cz", "it-it", "tr-tr", "ru-ru", "ja-jp", "ko-kr", "zh-tw", "zh-cn"]
# Subdirectory (inside each product-group folder) that receives downloaded packages.
package_folder_name = "packages"
def download_file(url, local_path: Path):
    """Stream the resource at *url* to *local_path* on disk.

    Raises:
        requests.HTTPError: on a non-2xx response, instead of silently
            writing the server's error page to disk.
    """
    # Context manager closes the connection even if writing fails mid-stream.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(local_path, 'wb') as f:
            # 64 KiB chunks: far fewer iterations/syscalls than 1 KiB for
            # the multi-megabyte .cab packages this script fetches.
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)
    print(f"Downloaded: {url} -> {local_path}")
class Package:
    """One downloadable help package referenced by a link inside a book page."""

    def __init__(self, name, actual_url, link_node):
        # Remember the remote location and the <a> tag that points at it,
        # so the link can later be rewritten to the local copy.
        self.name = name
        self.actual_url = actual_url
        self.link_node = link_node

    def write_offline(self, root_path: Path):
        """Download this package under *root_path* and repoint its link locally."""
        target = root_path / package_folder_name / self.name
        # Make sure the packages/ folder exists before writing into it.
        target.parent.mkdir(parents=True, exist_ok=True)
        if not target.exists():
            # Fetch the package body to disk.
            download_file(self.actual_url, target)
        else:
            # Already downloaded on a previous run — don't fetch again.
            print(f" Skipping {self.name}")
        # Point the original link at the on-disk copy (forward slashes for HTML).
        self.link_node["href"] = target.relative_to(root_path).as_posix()
class Book:
    """One help book: fetches its page, collects its packages, and can
    serialize itself to a local bookN.html with links rewritten."""

    def __init__(self, url, parent_node):
        # Get the current book from the URL.
        # The service returns UTF-16-encoded HTML, hence the explicit decode.
        r = requests.get(url).content.decode("utf-16")
        # Save the soup for serialization
        self.soup = BeautifulSoup(r, 'html.parser')
        # parent_node is the <div class="book"> in the *product* page whose
        # book-link gets rewritten when this book is written offline.
        self.parent_node = parent_node
        self.packages = []
        # For each package, create a Package object
        for package in self.soup.find_all("div", class_="package"):
            link_node = package.find("a", class_="current-link")
            package_name = link_node.text
            package_url = link_node["href"]
            print(f" Package: {package_name} ({package_url})")
            self.packages.append(Package(package_name, package_url, link_node))

    def write_offline(self, root_path: Path, product_link: str):
        """Download all packages, rewrite this book's navigation links, and
        write the book page to root_path/book<N>.html.

        NOTE(review): numbering relies on a module-level ``book_counter``
        global, which the driver loop resets to 0 per product group — the
        calls are order-dependent; confirm before refactoring.
        """
        # Write the packages (downloads each and repoints its link locally).
        for package in self.packages:
            package.write_offline(root_path)
        # Write the product link (back-link to the parent productN.html page),
        # and point the group link at the msha manifest written later.
        self.soup.find_all("a", class_="product-link")[0]["href"] = product_link
        self.soup.find_all("a", class_="product-group-link")[0]["href"] = "HelpContentSetup.msha"
        # Write the book to a file; the counter gives each book a unique name.
        global book_counter
        book_counter += 1
        book_path = root_path / f"book{book_counter}.html"
        # utf-8-sig adds a BOM, matching what Help Library Manager expects.
        with open(book_path, "w", encoding="utf-8-sig") as f:
            f.write(str(self.soup))
        print(f" Wrote book to {book_path}")
        # Modify the link in the parent product page to point at the local file.
        self.parent_node.find("a", class_="book-link")["href"] = book_path.relative_to(root_path).as_posix()
class Product:
    """One product: fetches its page, prunes excluded locales, collects its
    books, and can serialize itself to a local productN.html."""

    def __init__(self, url, parent_node):
        # Get the current product from the URL.
        # The service returns UTF-16-encoded HTML, hence the explicit decode.
        r = requests.get(url).content.decode("utf-16")
        # Save the soup for serialization
        self.soup = BeautifulSoup(r, 'html.parser')
        # Trim all other languages. ``string=`` replaces the deprecated
        # ``text=`` alias (same matching behavior, BeautifulSoup >= 4.4).
        for tag in self.soup.find_all("span", class_="locale", string=lambda x: x in exclude_languages):
            tag.parent.decompose()
        # parent_node is the <div class="product"> in the group page whose
        # product-link gets rewritten when this product is written offline.
        self.parent_node = parent_node
        self.books = []
        # For each book, create a Book object
        for book in self.soup.find_all("div", class_="book"):
            book_url = urljoin(base_remote_url, book.find("a", class_="book-link")["href"])
            print(f" Book: {book_url}")
            self.books.append(Book(book_url, book))

    def write_offline(self, root_path: Path, counter: int):
        """Download all books, then write this product page to
        root_path/product<counter>.html and relink its parent node."""
        # Write the books (each gets a back-link to this product page).
        for book in self.books:
            book.write_offline(root_path, f"product{counter}.html")
        # Write the product to a file
        product_path = root_path / f"product{counter}.html"
        with open(product_path, "w", encoding="utf-8-sig") as f:
            f.write(str(self.soup))
        print(f"Wrote product to {product_path}")
        # Modify the link in the group page to point at the local file.
        self.parent_node.find("a", class_="product-link")["href"] = product_path.relative_to(root_path).as_posix()
class ProductGroup:
    """One product group: fetches the group page, lifts its product list into
    a <body> element, and can serialize the group as HelpContentSetup.msha."""

    def __init__(self, name, url):
        self.name = name
        # Get the current product tree from the URL (service returns UTF-16).
        r = requests.get(url).content.decode("utf-16")
        soup = BeautifulSoup(r, 'html.parser')
        # Lift the product-list div to body, according to the msha file.
        # extract() detaches the div from its original tree; renaming the tag
        # to "body" lets it be swapped into the msha skeleton later.
        product_list = soup.find_all("div", class_="product-list")
        assert len(product_list) == 1
        self.prod_list = product_list[0].extract()
        self.prod_list.name = "body"
        # For each product, create a Product object
        self.products = []
        for product in self.prod_list.find_all("div", class_="product"):
            product_name = product.find("span", class_="name").text
            product_url = urljoin(base_remote_url, product.find("a", class_="product-link")["href"])
            print(f" Product: {product_name} ({product_url})")
            self.products.append(Product(product_url, product))

    def write_offline(self, root_path: Path):
        """Write every product (1-based numbering) plus the group manifest
        HelpContentSetup.msha under *root_path*."""
        # Write the products
        for index, product in enumerate(self.products):
            product.write_offline(root_path, index+1)
        # New content: minimal XHTML skeleton whose <body> is replaced by the
        # (already link-rewritten) product list extracted in __init__.
        soup = BeautifulSoup('<html xmlns="http://www.w3.org/1999/xhtml"><head /><body class="product-list"></body></html>', 'html.parser')
        soup.body.replace_with(self.prod_list)
        # Write the product group to a file (BOM via utf-8-sig).
        product_group_path = root_path / "HelpContentSetup.msha"
        with open(product_group_path, "w", encoding="utf-8-sig") as f:
            f.write(str(soup))
# Get the current product tree from the URL.
# There are a bunch of product groups, gonna create a separate folder for each one.
current_dir = Path(__file__).parent
# Root catalog page (UTF-16 encoded, like the rest of the service responses).
r = requests.get(base_remote_url).content.decode("utf-16")
soup = BeautifulSoup(r, 'html.parser')
for group in soup.find_all("div", class_="product-group"):
    group_name = group.find("span", class_="name").text
    group_url = urljoin(base_remote_url, group.find("a")["href"])
    print(f"Product group: {group_name} ({group_url})")
    # Constructing the group eagerly fetches every product and book page.
    product_group = ProductGroup(group_name, group_url)
    # Create a folder for the product group, named after it.
    # NOTE(review): group_name comes straight from the page and is used as a
    # directory name unsanitized — confirm names never contain path separators.
    group_path = current_dir / group_name
    group_path.mkdir(parents=True, exist_ok=True)
    # Write the product group. book_counter is the module-level global that
    # Book.write_offline increments; resetting it here restarts book numbering
    # (book1.html, book2.html, ...) inside each group's folder.
    book_counter = 0
    product_group.write_offline(group_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment