@chitoge
Created December 30, 2023 21:13
Downloads Visual Studio 2010 help files to disk
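The script walks the MTPS ServiceAPI catalog (product group → product → book → package), skips the locales listed in exclude_languages, downloads every remaining package, and rewrites the links so the saved pages resolve on disk. It needs Python 3 with the requests and beautifulsoup4 packages installed. For each product group it creates a folder next to the script with roughly this layout (the names come straight from the code below):

    <product group name>/
        HelpContentSetup.msha            the offline product-group listing
        product1.html, product2.html, ...
        book1.html, book2.html, ...
        packages/                        the downloaded help packages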
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
base_remote_url = "http://services.mtps.microsoft.com/ServiceAPI/products"
output_directory = "./downloaded_files"
exclude_languages = ["fr-fr", "pt-br", "es-es", "pl-pl", "de-de", "cs-cz", "it-it", "tr-tr", "ru-ru", "ja-jp", "ko-kr", "zh-tw", "zh-cn"]
package_folder_name = "packages"

def download_file(url, local_path: Path):
    response = requests.get(url, stream=True)
    with open(local_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)
    print(f"Downloaded: {url} -> {local_path}")

class Package:
    def __init__(self, name, actual_url, link_node):
        self.name = name
        self.actual_url = actual_url
        self.link_node = link_node

    def write_offline(self, root_path: Path):
        package_path = root_path / package_folder_name / self.name
        # Ensure the directory exists
        package_path.parent.mkdir(parents=True, exist_ok=True)
        # If file already exists, skip
        if package_path.exists():
            print(f" Skipping {self.name}")
        else:
            # Write the file
            download_file(self.actual_url, package_path)
        # Modify the link
        self.link_node["href"] = package_path.relative_to(root_path).as_posix()

class Book:
    def __init__(self, url, parent_node):
        # Get the current book from the URL
        r = requests.get(url).content.decode("utf-16")
        # Save the soup for serialization
        self.soup = BeautifulSoup(r, 'html.parser')
        self.parent_node = parent_node
        self.packages = []
        # For each package, create a Package object
        for package in self.soup.find_all("div", class_="package"):
            link_node = package.find("a", class_="current-link")
            package_name = link_node.text
            package_url = link_node["href"]
            print(f" Package: {package_name} ({package_url})")
            self.packages.append(Package(package_name, package_url, link_node))

    def write_offline(self, root_path: Path, product_link: str):
        # Write the packages
        for package in self.packages:
            package.write_offline(root_path)
        # Write the product link
        self.soup.find_all("a", class_="product-link")[0]["href"] = product_link
        self.soup.find_all("a", class_="product-group-link")[0]["href"] = "HelpContentSetup.msha"
        # Write the book to a file; book_counter is a module-level counter,
        # reset for each product group in the main loop at the bottom
        global book_counter
        book_counter += 1
        book_path = root_path / f"book{book_counter}.html"
        with open(book_path, "w", encoding="utf-8-sig") as f:
            f.write(str(self.soup))
        print(f" Wrote book to {book_path}")
        # Modify the link
        self.parent_node.find("a", class_="book-link")["href"] = book_path.relative_to(root_path).as_posix()

class Product:
    def __init__(self, url, parent_node):
        # Get the current product from the URL
        r = requests.get(url).content.decode("utf-16")
        # Save the soup for serialization
        self.soup = BeautifulSoup(r, 'html.parser')
        # Trim all other languages
        for tag in self.soup.find_all("span", class_="locale", text=lambda x: x in exclude_languages):
            tag.parent.decompose()
        self.parent_node = parent_node
        self.books = []
        # For each book, create a Book object
        for book in self.soup.find_all("div", class_="book"):
            book_url = urljoin(base_remote_url, book.find("a", class_="book-link")["href"])
            print(f" Book: {book_url}")
            self.books.append(Book(book_url, book))

    def write_offline(self, root_path: Path, counter: int):
        # Write the books
        for book in self.books:
            book.write_offline(root_path, f"product{counter}.html")
        # Write the product to a file
        product_path = root_path / f"product{counter}.html"
        with open(product_path, "w", encoding="utf-8-sig") as f:
            f.write(str(self.soup))
        print(f"Wrote product to {product_path}")
        # Modify the link
        self.parent_node.find("a", class_="product-link")["href"] = product_path.relative_to(root_path).as_posix()

class ProductGroup:
    def __init__(self, name, url):
        self.name = name
        # Get the current product tree from the URL
        r = requests.get(url).content.decode("utf-16")
        soup = BeautifulSoup(r, 'html.parser')
        # Lift the product-list div to body, according to the msha file
        product_list = soup.find_all("div", class_="product-list")
        assert len(product_list) == 1
        self.prod_list = product_list[0].extract()
        self.prod_list.name = "body"
        # For each product, create a Product object
        self.products = []
        for product in self.prod_list.find_all("div", class_="product"):
            product_name = product.find("span", class_="name").text
            product_url = urljoin(base_remote_url, product.find("a", class_="product-link")["href"])
            print(f" Product: {product_name} ({product_url})")
            self.products.append(Product(product_url, product))

    def write_offline(self, root_path: Path):
        # Write the products
        for index, product in enumerate(self.products):
            product.write_offline(root_path, index + 1)
        # New content
        soup = BeautifulSoup('<html xmlns="http://www.w3.org/1999/xhtml"><head /><body class="product-list"></body></html>', 'html.parser')
        soup.body.replace_with(self.prod_list)
        # Write the product group to a file
        product_group_path = root_path / "HelpContentSetup.msha"
        with open(product_group_path, "w", encoding="utf-8-sig") as f:
            f.write(str(soup))

# Get the top-level product catalog from the service.
# There are several product groups; create a separate folder for each one.
current_dir = Path(__file__).parent
r = requests.get(base_remote_url).content.decode("utf-16")
soup = BeautifulSoup(r, 'html.parser')
for group in soup.find_all("div", class_="product-group"):
    group_name = group.find("span", class_="name").text
    group_url = urljoin(base_remote_url, group.find("a")["href"])
    print(f"Product group: {group_name} ({group_url})")
    product_group = ProductGroup(group_name, group_url)
    # Create a folder for the product group
    group_path = current_dir / group_name
    group_path.mkdir(parents=True, exist_ok=True)
    # Write the product group; book file numbering restarts for each group
    book_counter = 0
    product_group.write_offline(group_path)
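
The generated HelpContentSetup.msha matches the file name the Visual Studio 2010 Help Library Manager looks for, so a mirrored group folder should be installable offline through its "Install content from disk" option pointed at that file. If you only want part of the catalog, a filter in the main loop is enough; the sketch below is only an assumption-laden example where wanted_groups holds names copied from the script's "Product group:" output ("Visual Studio 2010" is a placeholder, not a verified group name):

    wanted_groups = {"Visual Studio 2010"}  # hypothetical; copy the exact names the script prints
    for group in soup.find_all("div", class_="product-group"):
        group_name = group.find("span", class_="name").text
        if group_name not in wanted_groups:
            continue
        # ...then proceed exactly as in the loop above...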