@chitoge
Created December 30, 2023 21:13
Downloads Visual Studio 2010 help files to disk
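The script walks the MTPS ServiceAPI catalog (product group → product → book → package), skips the locales listed in exclude_languages, downloads every remaining package, and rewrites the links so the saved pages resolve on disk. It needs Python 3 with the requests and beautifulsoup4 packages installed. For each product group it creates a folder next to the script with roughly this layout (the names come straight from the code below):

    <product group name>/
        HelpContentSetup.msha            the offline product-group listing
        product1.html, product2.html, ...
        book1.html, book2.html, ...
        packages/                        the downloaded help packages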
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
base_remote_url = "http://services.mtps.microsoft.com/ServiceAPI/products"
output_directory = "./downloaded_files"
exclude_languages = ["fr-fr", "pt-br", "es-es", "pl-pl", "de-de", "cs-cz", "it-it", "tr-tr", "ru-ru", "ja-jp", "ko-kr", "zh-tw", "zh-cn"]
package_folder_name = "packages"

def download_file(url, local_path: Path):
    response = requests.get(url, stream=True)
    with open(local_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)
    print(f"Downloaded: {url} -> {local_path}")

class Package:
    def __init__(self, name, actual_url, link_node):
        self.name = name
        self.actual_url = actual_url
        self.link_node = link_node

    def write_offline(self, root_path: Path):
        package_path = root_path / package_folder_name / self.name
        # Ensure the directory exists
        package_path.parent.mkdir(parents=True, exist_ok=True)
        # If file already exists, skip
        if package_path.exists():
            print(f" Skipping {self.name}")
        else:
            # Write the file
            download_file(self.actual_url, package_path)
        # Modify the link
        self.link_node["href"] = package_path.relative_to(root_path).as_posix()

class Book:
    def __init__(self, url, parent_node):
        # Get the current book from the URL
        r = requests.get(url).content.decode("utf-16")
        # Save the soup for serialization
        self.soup = BeautifulSoup(r, 'html.parser')
        self.parent_node = parent_node
        self.packages = []
        # For each package, create a Package object
        for package in self.soup.find_all("div", class_="package"):
            link_node = package.find("a", class_="current-link")
            package_name = link_node.text
            package_url = link_node["href"]
            print(f" Package: {package_name} ({package_url})")
            self.packages.append(Package(package_name, package_url, link_node))

    def write_offline(self, root_path: Path, product_link: str):
        # Write the packages
        for package in self.packages:
            package.write_offline(root_path)
        # Write the product link
        self.soup.find_all("a", class_="product-link")[0]["href"] = product_link
        self.soup.find_all("a", class_="product-group-link")[0]["href"] = "HelpContentSetup.msha"
        # Write the book to a file; book_counter is a module-level counter,
        # reset for each product group in the main loop at the bottom
        global book_counter
        book_counter += 1
        book_path = root_path / f"book{book_counter}.html"
        with open(book_path, "w", encoding="utf-8-sig") as f:
            f.write(str(self.soup))
        print(f" Wrote book to {book_path}")
        # Modify the link
        self.parent_node.find("a", class_="book-link")["href"] = book_path.relative_to(root_path).as_posix()

class Product:
    def __init__(self, url, parent_node):
        # Get the current product from the URL
        r = requests.get(url).content.decode("utf-16")
        # Save the soup for serialization
        self.soup = BeautifulSoup(r, 'html.parser')
        # Trim all other languages
        for tag in self.soup.find_all("span", class_="locale", text=lambda x: x in exclude_languages):
            tag.parent.decompose()
        self.parent_node = parent_node
        self.books = []
        # For each book, create a Book object
        for book in self.soup.find_all("div", class_="book"):
            book_url = urljoin(base_remote_url, book.find("a", class_="book-link")["href"])
            print(f" Book: {book_url}")
            self.books.append(Book(book_url, book))

    def write_offline(self, root_path: Path, counter: int):
        # Write the books
        for book in self.books:
            book.write_offline(root_path, f"product{counter}.html")
        # Write the product to a file
        product_path = root_path / f"product{counter}.html"
        with open(product_path, "w", encoding="utf-8-sig") as f:
            f.write(str(self.soup))
        print(f"Wrote product to {product_path}")
        # Modify the link
        self.parent_node.find("a", class_="product-link")["href"] = product_path.relative_to(root_path).as_posix()

class ProductGroup:
    def __init__(self, name, url):
        self.name = name
        # Get the current product tree from the URL
        r = requests.get(url).content.decode("utf-16")
        soup = BeautifulSoup(r, 'html.parser')
        # Lift the product-list div to body, according to the msha file
        product_list = soup.find_all("div", class_="product-list")
        assert len(product_list) == 1
        self.prod_list = product_list[0].extract()
        self.prod_list.name = "body"
        # For each product, create a Product object
        self.products = []
        for product in self.prod_list.find_all("div", class_="product"):
            product_name = product.find("span", class_="name").text
            product_url = urljoin(base_remote_url, product.find("a", class_="product-link")["href"])
            print(f" Product: {product_name} ({product_url})")
            self.products.append(Product(product_url, product))

    def write_offline(self, root_path: Path):
        # Write the products
        for index, product in enumerate(self.products):
            product.write_offline(root_path, index + 1)
        # New content
        soup = BeautifulSoup('<html xmlns="http://www.w3.org/1999/xhtml"><head /><body class="product-list"></body></html>', 'html.parser')
        soup.body.replace_with(self.prod_list)
        # Write the product group to a file
        product_group_path = root_path / "HelpContentSetup.msha"
        with open(product_group_path, "w", encoding="utf-8-sig") as f:
            f.write(str(soup))

# Get the top-level product catalog from the service.
# There are several product groups; create a separate folder for each one.
current_dir = Path(__file__).parent
r = requests.get(base_remote_url).content.decode("utf-16")
soup = BeautifulSoup(r, 'html.parser')
for group in soup.find_all("div", class_="product-group"):
    group_name = group.find("span", class_="name").text
    group_url = urljoin(base_remote_url, group.find("a")["href"])
    print(f"Product group: {group_name} ({group_url})")
    product_group = ProductGroup(group_name, group_url)
    # Create a folder for the product group
    group_path = current_dir / group_name
    group_path.mkdir(parents=True, exist_ok=True)
    # Write the product group; book file numbering restarts for each group
    book_counter = 0
    product_group.write_offline(group_path)
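
The generated HelpContentSetup.msha matches the file name the Visual Studio 2010 Help Library Manager looks for, so a mirrored group folder should be installable offline through its "Install content from disk" option pointed at that file. If you only want part of the catalog, a filter in the main loop is enough; the sketch below is only an assumption-laden example where wanted_groups holds names copied from the script's "Product group:" output ("Visual Studio 2010" is a placeholder, not a verified group name):

    wanted_groups = {"Visual Studio 2010"}  # hypothetical; copy the exact names the script prints
    for group in soup.find_all("div", class_="product-group"):
        group_name = group.find("span", class_="name").text
        if group_name not in wanted_groups:
            continue
        # ...then proceed exactly as in the loop above...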