Neon
import requests
from bs4 import BeautifulSoup
import os
import re
from telebot import TeleBot
from PIL import Image
from io import BytesIO
import datetime
from xmlrpc import client as xmlrpc_client
import unidecode
from tqdm import tqdm
from colorama import Fore
# Cross-platform colored terminal text (only Fore.GREEN and Fore.RESET are
# used below; this palette list itself is never referenced)
color_bars = [Fore.BLACK,
              Fore.RED,
              Fore.GREEN,
              Fore.YELLOW,
              Fore.BLUE,
              Fore.MAGENTA,
              Fore.CYAN,
              Fore.WHITE]
USERNAME = "luighi"
PASSWORD = "XXXXX"
URL = "https://fiquesabendo.org/"
TOKEN = "5885311028:AAHUwE6aVS73fZm03DksQI52WZ2ctTS5Xkw"
CHAT_ID = -880447273
bot = TeleBot(TOKEN)
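# A minimal sketch of loading these secrets from the environment instead of
# hardcoding them (the variable names TELEGRAM_TOKEN, WP_USERNAME and
# WP_PASSWORD are illustrative assumptions, not part of this script):
#
#     TOKEN = os.environ["TELEGRAM_TOKEN"]
#     USERNAME = os.environ["WP_USERNAME"]
#     PASSWORD = os.environ["WP_PASSWORD"]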
def remove_special_characters(string: str) -> str:
    """Remove special characters from a string using a regular expression"""
    return re.sub(r"[^\w\s]|\?|\!", "", string)
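# Example (illustrative): remove_special_characters("Olá, mundo!") returns
# "Olá mundo"; punctuation is stripped while letters, digits and spaces are kept.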
def page_url() -> str:
    """Prompt the user to enter a page URL (helper; not called anywhere below)"""
    url = input("Insira a URL da página: ")
    return url
@bot.message_handler(commands=['start', 'help'])
def send_welcome(message):
    """Send a welcome message to the user"""
    bot.reply_to(message, "Olá! Eu sou um bot que pode baixar o título e o conteúdo de uma página da web e salvar em um arquivo de texto. Para usar, envie a URL da página para mim.")
@bot.message_handler(func=lambda message: True)
def download_page(message):
    """Download the page title and content and save them to a text file"""
    # Get the URL from the message sent by the user
    url = message.text
    # Make a GET request to the site
    response = requests.get(url)
    # If the request was not successful, report the status code
    if response.status_code != 200:
        print("Error:", response.status_code)
    # Extract the HTML content from the response
    html = response.text
    # Create a BeautifulSoup object from the HTML
    soup = BeautifulSoup(html, 'html.parser')
    # Find the first <h1> element in the HTML
    title_element = soup.find('h1')
    # Get the text content of the <h1> element
    title = title_element.text  # type: ignore
    # Remove special characters from the title
    title_mod = remove_special_characters(title)

    # Create a directory named after the page title and change into it
    def create_dir():
        current_dir = os.path.dirname(__file__)
        folder_path = os.path.join(current_dir, title_mod)
        # Try to create the directory
        try:
            os.mkdir(folder_path)
        # If the directory already exists, just reuse it
        except FileExistsError:
            pass
        except PermissionError:
            print("You do not have permission to create the directory.")
            return
        # Change into the directory
        os.chdir(folder_path)

    create_dir()
    # Create an empty list to store topics
    topics = []
    # Loop through all <h3> elements and extract the text from each one
    topics_element = soup.find_all('h3')
    for topic_element in topics_element:
        topic = topic_element.text
        topics.append(topic)
    # Collect the first <img> element of each article container
    imagens = []
    imagens_element = soup.find_all("div", class_="main")
    for imagem_element in imagens_element:
        imagens.append(imagem_element.find('img'))
    # Open the file in write mode
    text_file = open("page_content.txt", "w", encoding="utf-8")
    # Write the title to the file
    text_file.write(title + "\n")
    # Loop through the topics and write each one to the file
    for topic in topics:
        text_file.write(topic + "\n")
    # Close the file
    text_file.close()
    # Get all <p> elements from the HTML
    paragraphs = soup.find_all('p')
    # Create a list to store the paragraph texts
    paragraph_texts = []
    # Loop through the paragraphs and extract the text from each one
    for paragraph in paragraphs:
        paragraph_text = paragraph.text
        # Append the text to the list
        paragraph_texts.append(paragraph_text)
    # Join the texts in the list into a single string
    page_content = '\n'.join(paragraph_texts)
    # Remove special characters from the page content
    page_content_mod = remove_special_characters(page_content)
    # Open the file in append mode
    text_file = open("page_content.txt", "a", encoding="utf-8")
    # Write the page content to the file
    text_file.write(page_content_mod)
    # Close the file
    text_file.close()
    # Send a message to the user indicating success
    bot.send_message(
        CHAT_ID, "Acabei de extrair com sucesso o conteúdo da página e o salvei em um arquivo de texto.")
    # Search for the article content container by class
    article_content = soup.find_all("div", class_="main")
    # If the container exists, get the image data from the article content
    if article_content:
        if not os.path.exists("imagens"):
            os.mkdir("imagens")
        # Change into the images folder
        os.chdir("imagens")
        # Create a list to store the image URLs
        image_urls = []
        # Find all <img> elements in the article content
        images = article_content[0].find_all("img")
        # Loop through the images and extract the URL from each one
        for image in images:
            # Get the 'src' attribute of the image
            image_url = image['src']
            # Check whether the image URL is missing its scheme
            if not image_url.startswith('http'):
                # Add the missing scheme to the image URL
                image_url = 'http:' + image_url
            # Skip files that are not valid images (JPEG or WEBP)
            if not (image_url.endswith('.jpg') or image_url.endswith('.jpeg') or image_url.endswith('.webp')):
                continue
            # Add the URL to the list
            image_urls.append(image_url)
        # Set the image file name prefix
        image_name_prefix = "___" + unidecode.unidecode(title_mod)
        # Set the image file name extension
        image_name_extension = ".webp"
        # Set the initial value of the image counter
        image_counter = 1
        # Loop through the image URLs with a tqdm progress bar
        for image_url in tqdm(image_urls, desc="Downloading images", bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
            # Strip accented characters from the URL, then download the image
            remove_accentuation = unidecode.unidecode(image_url)
            response = requests.get(remove_accentuation)
            # Check if the request was successful
            if response.status_code == 200:
                # Read the image content
                image_content = response.content
                # Wrap the image content in a BytesIO object
                image_data = BytesIO(image_content)
                # Open the image using PIL
                image = Image.open(image_data)
                # Generate the image file name
                image_name = f"{image_counter}{image_name_prefix}{image_name_extension}".lower()
                # Save the image to a file
                image.save(image_name)
                # Increment the image counter
                image_counter += 1
            # If the request was not successful, print an error message
            else:
                print("Erro ao baixar imagem: ", response.status_code)
        # Send a message to the user indicating success
        bot.send_message(CHAT_ID, "As imagens foram baixadas com sucesso.")
    # Create an empty list to store the image file names
    images = []
    # List all files in the current directory
    for file in os.listdir():
        # Check if the file is an image
        if file.endswith(".jpg") or file.endswith(".jpeg") or file.endswith(".webp"):
            images.append(file)
    # Create an XML-RPC client
    client = xmlrpc_client.ServerProxy("https://fiquesabendo.org/xmlrpc.php")
    # Create a new post
    post = client.metaWeblog.newPost(1, USERNAME, PASSWORD, {
        'title': title,
        'description': topics,
        'post_type': 'post',
        'post_status': 'draft',
        'post_category': ['entretenimento'],
    })
    # Check the return value of the metaWeblog.newPost method
    if isinstance(post, dict):
        # A dictionary return value carries the ID of the created post
        post_id = post['post_id']
    else:
        # Otherwise the return value itself is the ID of the created post
        post_id = post
    # List all .webp files in the current directory
    webp_files = [f for f in os.listdir() if f.endswith('.webp')]
    # Iterate over the webp files
    for image in tqdm(webp_files, desc="Uploading images", bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
        # Open the image file and read its bytes
        with open(image, 'rb') as img:
            img_data = img.read()
        data = {
            'name': image,
            'type': 'image/webp',
            'bits': xmlrpc_client.Binary(img_data),
            'overwrite': True
        }
        try:
            # Upload the image via the metaWeblog API
            response = client.metaWeblog.newMediaObject(post_id, USERNAME, PASSWORD, data)
            print("Uploaded: %s" % str(response))
        except xmlrpc_client.Fault as e:
            print("Error uploading image: %s" % e)
    def extract_number(string):
        # Use a regular expression to match the number at the beginning of the string
        match = re.match(r'(\d+)___', string)
        # If a match was found, return the number as an integer
        if match:
            return int(match.group(1))
        # Otherwise, return 0
        return 0
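    # Example: extract_number("3___some-title.webp") returns 3, while a name
    # without a leading "<digits>___" prefix returns 0.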
    # Sort the image file names by their numeric prefix
    sorted_images = sorted(images, key=extract_number)
    # Topic strings carry no numeric prefix, so extract_number returns 0 for
    # all of them and this (stable) sort keeps their original document order
    sorted_topics = sorted(topics, key=extract_number)
    output = []
    today = datetime.datetime.now()
    month = today.month
    year = today.year
    for topic, image in tqdm(zip(sorted_topics, sorted_images), desc="Making post", bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
        output.append(
            f"<h4> {topic} </h4>\n <img src='https://fiquesabendo.org/wp-content/uploads/{year}/{month}/{image.replace(' ', '-')}' alt='{topic}' width='100%' height='auto'/>\n\n")
    # Join the elements in the output list into a single string
    formatted_output = "".join(output)
    # Update the post with the formatted string
    client.metaWeblog.editPost(post_id, USERNAME, PASSWORD, {
        'title': title,
        'description': formatted_output,
        'post_type': 'post',
        'post_status': 'draft',
        'post_category': ['entretenimento'],
    })
    bot.send_message(CHAT_ID, f"A Postagem '{title}' foi criada com sucesso.")
    print("Postagem realizada com sucesso!")
# Start the bot
bot.polling()
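# Usage: run the script, send /start to the bot on Telegram for instructions,
# then send it the URL of a page; progress and results are reported in CHAT_ID.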