# Rust1667/download_all_fmhy_rentrys.py

## download_all_fmhy_rentrys.py
import os
import base64
import re
import requests
import rentry


def extract_string_from_url(url):
    """Return the page slug of a rentry.co / rentry.org URL.

    The slug is the single path segment after the host, e.g. ``"abc"`` for
    ``"https://rentry.co/abc"``.  Surrounding whitespace, one trailing slash
    and a trailing ``?query`` / ``#fragment`` are tolerated so that links
    scraped from free-form text still resolve to their page.

    Args:
        url: The URL text to inspect.

    Returns:
        The slug as a string, or ``None`` when ``url`` is not a
        single-segment https rentry link (other host, missing slug, or a
        deeper path such as ``/a/b``).
    """
    # The slug stops at '/', '?', '#' or whitespace.  Whatever follows it may
    # only be an optional trailing slash and/or a query/fragment, never a
    # second path segment.
    pattern = r'https://rentry\.(?:co|org)/([^/?#\s]+)/?(?:[?#]\S*)?$'

    match = re.search(pattern, url.strip())
    return match.group(1) if match else None


def extract_rentry_links(text):
    """Collect every parenthesised rentry.co / rentry.org link in ``text``.

    Only links written as ``(https://rentry.<co|org>/<slug>)`` are picked up,
    where the slug consists of letters, digits, ``-`` and ``_``.

    Args:
        text: The text to scan.

    Returns:
        A list of the URLs without their surrounding parentheses, in order
        of appearance; repeated links are kept.
    """
    pattern_rentry = r'\(https:\/\/rentry\.(?:co|org)\/[a-zA-Z0-9\-_]+\)'

    urls = []
    for hit in re.finditer(pattern_rentry, text):
        wrapped = hit.group(0)
        # Drop the leading "(" and the trailing ")".
        urls.append(wrapped[1:-1])
    return urls


def download_raw_content(url):
    """Fetch ``url`` over HTTP and return the response body as text.

    Progress messages are printed before and after the request.

    Args:
        url: Address of the resource to download.

    Returns:
        The decoded body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the
            10-second timeout is hit.
    """
    print("\ndownloading from " + url + "...")
    response = requests.get(url, timeout=10)
    # Fail loudly on an error status; otherwise the error page would be
    # handed back as if it were the requested document.
    response.raise_for_status()
    print("downloaded.\n")
    return response.text


def get_main_wiki_rentry_links():
    """Download the raw ``single-page`` file of FMHYedit and list its rentry links.

    Returns:
        The list produced by ``extract_rentry_links`` for the downloaded text.
    """
    wiki_text = download_raw_content(
        "https://raw.githubusercontent.com/nbats/FMHYedit/main/single-page"
    )
    return extract_rentry_links(wiki_text)


def get_base64_rentry_links():
    """Return the rentry links found in the base64 page of FMHYedit.

    The raw ``base64.md`` file is downloaded and every backtick-delimited
    span in it is base64-decoded.  A line break is then forced in front of
    every ``http`` so that each link begins its own line, and the lines
    mentioning rentry.co or rentry.org are collected.

    Returns:
        A list of the matching lines with surrounding whitespace removed.
    """
    url = "https://raw.githubusercontent.com/fmhy/FMHYedit/main/base64.md"
    content = download_raw_content(url)

    # Collect the decoded pieces in a list and join once instead of growing a
    # string with "+=" for every span.
    decoded_parts = []
    for encoded_string in re.findall(r"`([^`]*)`", content):
        try:
            decoded_parts.append(
                base64.b64decode(encoded_string).decode("utf-8")
            )
        except ValueError:
            # binascii.Error (malformed base64) and UnicodeDecodeError (bytes
            # that are not UTF-8) are both ValueError subclasses.  Such spans
            # are not encoded links, so keep them verbatim instead of
            # aborting the whole scan.
            decoded_parts.append(f"`{encoded_string}`")

    decoded_content = "".join(decoded_parts).replace("http", "\nhttp")

    # Strip each hit so a trailing "\r" or space does not end up in the link.
    return [
        line.strip()
        for line in decoded_content.split("\n")
        if re.search(r"(rentry\.co|rentry\.org)", line)
    ]


def remove_duplicates(strings):
    """Drop case-insensitive duplicates from ``strings``.

    Two entries count as the same when their lower-cased forms are equal;
    the first occurrence is kept with its original spelling.  The number of
    dropped entries is printed.

    Args:
        strings: List of strings to filter.

    Returns:
        A new list with the surviving entries in their original order.
    """
    seen_keys = set()
    kept = []

    for item in strings:
        key = item.lower()
        if key in seen_keys:
            continue
        seen_keys.add(key)
        kept.append(item)

    dropped = len(strings) - len(kept)
    print("\nremoved " + str(dropped) + " duplicates.")
    return kept


def get_rentry_content_and_title(url):
    """Fetch a rentry page and pair its raw content with its slug.

    Args:
        url: Link to the rentry page.

    Returns:
        A dict with ``'title'`` (the value ``extract_string_from_url``
        returns for ``url``) and ``'content'`` (the ``'content'`` entry of
        the ``rentry.raw`` result for that value).
    """
    slug = extract_string_from_url(url)
    raw_page = rentry.raw(slug)
    return {'title': slug, 'content': raw_page['content']}


def save_string_to_text_file(string, file_path):
    """Write ``string`` to ``file_path`` as UTF-8 text and report the path.

    An existing file at ``file_path`` is overwritten.

    Args:
        string: Text to store.
        file_path: Destination path of the file.
    """
    with open(file_path, mode="w", encoding='utf-8') as out_file:
        out_file.write(string)

    print(f"Result saved to {file_path}")


def main():
    """Collect the rentry links referenced by FMHY and optionally download them.

    Links are gathered from the main wiki and from the base64 page, merged,
    de-duplicated and listed.  After the user confirms, each page is saved
    as ``<slug>.md`` in the current working directory.
    """
    # Links from the main wiki
    main_wiki_rentry_links = get_main_wiki_rentry_links()
    print(str(len(main_wiki_rentry_links)) + " links")
    for link in main_wiki_rentry_links:
        print(link)

    # Links from base64
    base64_rentry_links = get_base64_rentry_links()
    print(str(len(base64_rentry_links)) + " links")
    for link in base64_rentry_links:
        print(link)

    # Join lists, remove duplicates, count total
    links = remove_duplicates(main_wiki_rentry_links + base64_rentry_links)
    print("\n\n" + str(len(links)) + " links total\n\n")

    # Confirm download in current directory
    print("Current directory: " + os.getcwd() + "\n")
    answer = input(f"Do you want to download {len(links)} markdown files in the current directory? (y/n): ")
    # Accept "Y", "yes" and stray whitespace as well as a bare "y".
    confirm = answer.strip().lower() in ("y", "yes")

    if not confirm:
        print("Aborted.")
        return

    # Download files
    failed = []
    for link in links:
        # This is the script's top-level boundary: one unreachable or
        # malformed link is reported and skipped so it cannot abort the
        # remaining downloads.
        try:
            r = get_rentry_content_and_title(link)
            save_string_to_text_file(r['content'], str(r['title']) + ".md")
        except Exception as err:
            failed.append(link)
            print(f"Failed to download {link}: {err}")

    if failed:
        print(f"\n{len(failed)} of {len(links)} links could not be downloaded.")


# Run the interactive download only when this file is executed as a script.
if __name__ == "__main__":
    main()
	import os
	import base64
	import re
	import requests
	import rentry


	def extract_string_from_url(url):
	    """Return the page slug of a rentry.co / rentry.org URL.

	    The slug is the single path segment after the host, e.g. ``"abc"`` for
	    ``"https://rentry.co/abc"``.  Surrounding whitespace, one trailing
	    slash and a trailing ``?query`` / ``#fragment`` are tolerated.

	    Args:
	        url: The URL text to inspect.

	    Returns:
	        The slug as a string, or ``None`` when ``url`` is not a
	        single-segment https rentry link.
	    """
	    # The alternation must use a bare "|"; an escaped "\|" would only
	    # match the literal text "co|org".  The slug stops at '/', '?', '#'
	    # or whitespace.
	    pattern = r'https://rentry\.(?:co|org)/([^/?#\s]+)/?(?:[?#]\S*)?$'

	    match = re.search(pattern, url.strip())
	    return match.group(1) if match else None


	def extract_rentry_links(text):
	    """Collect every parenthesised rentry.co / rentry.org link in ``text``.

	    Only links written as ``(https://rentry.<co|org>/<slug>)`` are picked
	    up, where the slug consists of letters, digits, ``-`` and ``_``.

	    Args:
	        text: The text to scan.

	    Returns:
	        A list of the URLs without their surrounding parentheses, in
	        order of appearance; repeated links are kept.
	    """
	    # The alternation must use a bare "|"; an escaped "\|" would only
	    # match the literal text "co|org" and find no real link.
	    pattern_rentry = r'\(https:\/\/rentry\.(?:co|org)\/[a-zA-Z0-9\-_]+\)'

	    # Slice off the leading "(" and the trailing ")" of each match.
	    return [match[1:-1] for match in re.findall(pattern_rentry, text)]


	def download_raw_content(url):
	    """Fetch ``url`` over HTTP and return the response body as text.

	    Progress messages are printed before and after the request.

	    Args:
	        url: Address of the resource to download.

	    Returns:
	        The decoded body of the response.

	    Raises:
	        requests.HTTPError: If the server answers with a 4xx/5xx status.
	        requests.RequestException: On connection problems or when the
	            10-second timeout is hit.
	    """
	    print("\ndownloading from " + url + "...")
	    response = requests.get(url, timeout=10)
	    # Fail loudly on an error status; otherwise the error page would be
	    # handed back as if it were the requested document.
	    response.raise_for_status()
	    print("downloaded.\n")
	    return response.text


	def get_main_wiki_rentry_links():
	    """Download the raw ``single-page`` file of FMHYedit and list its rentry links.

	    Returns:
	        The list produced by ``extract_rentry_links`` for the downloaded
	        text.
	    """
	    url = "https://raw.githubusercontent.com/nbats/FMHYedit/main/single-page"
	    text = download_raw_content(url)
	    return extract_rentry_links(text)


	def get_base64_rentry_links():
	    """Return the rentry links found in the base64 page of FMHYedit.

	    The raw ``base64.md`` file is downloaded and every backtick-delimited
	    span in it is base64-decoded.  A line break is then forced in front
	    of every ``http`` so that each link begins its own line, and the
	    lines mentioning rentry.co or rentry.org are collected.

	    Returns:
	        A list of the matching lines with surrounding whitespace removed.
	    """
	    url = "https://raw.githubusercontent.com/fmhy/FMHYedit/main/base64.md"
	    content = download_raw_content(url)

	    # Collect the decoded pieces in a list and join once instead of
	    # growing a string with "+=" for every span.
	    decoded_parts = []
	    for encoded_string in re.findall(r"`([^`]*)`", content):
	        try:
	            decoded_parts.append(
	                base64.b64decode(encoded_string).decode("utf-8")
	            )
	        except ValueError:
	            # binascii.Error (malformed base64) and UnicodeDecodeError
	            # (bytes that are not UTF-8) are both ValueError subclasses.
	            # Such spans are not encoded links, so keep them verbatim.
	            decoded_parts.append(f"`{encoded_string}`")

	    decoded_content = "".join(decoded_parts).replace("http", "\nhttp")

	    # The alternation must use a bare "|"; an escaped "\|" would never
	    # match a real link.  Strip each hit so a trailing "\r" or space does
	    # not end up in the link.
	    return [
	        line.strip()
	        for line in decoded_content.split("\n")
	        if re.search(r"(rentry\.co|rentry\.org)", line)
	    ]


	def remove_duplicates(strings):
	    """Drop case-insensitive duplicates from ``strings``.

	    Two entries count as the same when their lower-cased forms are equal;
	    the first occurrence is kept with its original spelling.  The number
	    of dropped entries is printed.

	    Args:
	        strings: List of strings to filter.

	    Returns:
	        A new list with the surviving entries in their original order.
	    """
	    seen = set()
	    result = []

	    for s in strings:
	        s_lower = s.lower()
	        if s_lower not in seen:
	            seen.add(s_lower)
	            result.append(s)

	    print("\nremoved " + str(len(strings) - len(result)) + " duplicates.")
	    return result


	def get_rentry_content_and_title(url):
	    """Fetch a rentry page and pair its raw content with its slug.

	    Args:
	        url: Link to the rentry page.

	    Returns:
	        A dict with ``'title'`` (the value ``extract_string_from_url``
	        returns for ``url``) and ``'content'`` (the ``'content'`` entry
	        of the ``rentry.raw`` result for that value).
	    """
	    urlstring = extract_string_from_url(url)
	    return {
	        'title': urlstring,
	        'content': rentry.raw(urlstring)['content'],
	    }


	def save_string_to_text_file(string, file_path):
	    """Write ``string`` to ``file_path`` as UTF-8 text and report the path.

	    An existing file at ``file_path`` is overwritten.

	    Args:
	        string: Text to store.
	        file_path: Destination path of the file.
	    """
	    with open(file_path, "w", encoding='utf-8') as f:
	        f.write(string)
	    print(f"Result saved to {file_path}")


	def main():
	    """Collect the rentry links referenced by FMHY and optionally download them.

	    Links are gathered from the main wiki and from the base64 page,
	    merged, de-duplicated and listed.  After the user confirms, each page
	    is saved as ``<slug>.md`` in the current working directory.
	    """
	    # Links from the main wiki
	    main_wiki_rentry_links = get_main_wiki_rentry_links()
	    print(str(len(main_wiki_rentry_links)) + " links")
	    for link in main_wiki_rentry_links:
	        print(link)

	    # Links from base64
	    base64_rentry_links = get_base64_rentry_links()
	    print(str(len(base64_rentry_links)) + " links")
	    for link in base64_rentry_links:
	        print(link)

	    # Join lists, remove duplicates, count total
	    links = remove_duplicates(main_wiki_rentry_links + base64_rentry_links)
	    print("\n\n" + str(len(links)) + " links total\n\n")

	    # Confirm download in current directory
	    print("Current directory: " + os.getcwd() + "\n")
	    answer = input(f"Do you want to download {len(links)} markdown files in the current directory? (y/n): ")
	    # Accept "Y", "yes" and stray whitespace as well as a bare "y".
	    confirm = answer.strip().lower() in ("y", "yes")

	    if not confirm:
	        print("Aborted.")
	        return

	    # Download files
	    failed = []
	    for link in links:
	        # This is the script's top-level boundary: one unreachable or
	        # malformed link is reported and skipped so it cannot abort the
	        # remaining downloads.
	        try:
	            r = get_rentry_content_and_title(link)
	            save_string_to_text_file(r['content'], str(r['title']) + ".md")
	        except Exception as err:
	            failed.append(link)
	            print(f"Failed to download {link}: {err}")

	    if failed:
	        print(f"\n{len(failed)} of {len(links)} links could not be downloaded.")


	# Run the interactive download only when this file is executed as a script.
	if __name__ == "__main__":
	    main()