Neon
import requests
from bs4 import BeautifulSoup
import os
import re
from telebot import TeleBot
from PIL import Image
from io import BytesIO
import datetime
from xmlrpc import client as xmlrpc_client
import unidecode
from tqdm import tqdm
from colorama import Fore
# Cross-platform colored terminal text (only Fore.GREEN and Fore.RESET are
# used below; this palette list itself is never referenced)
color_bars = [Fore.BLACK,
              Fore.RED,
              Fore.GREEN,
              Fore.YELLOW,
              Fore.BLUE,
              Fore.MAGENTA,
              Fore.CYAN,
              Fore.WHITE]
USERNAME = "luighi"
PASSWORD = "XXXXX"
URL = "https://fiquesabendo.org/"
TOKEN = "5885311028:AAHUwE6aVS73fZm03DksQI52WZ2ctTS5Xkw"
CHAT_ID = -880447273
bot = TeleBot(TOKEN)
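# A minimal sketch of loading these secrets from the environment instead of
# hardcoding them (the variable names TELEGRAM_TOKEN, WP_USERNAME and
# WP_PASSWORD are illustrative assumptions, not part of this script):
#
#     TOKEN = os.environ["TELEGRAM_TOKEN"]
#     USERNAME = os.environ["WP_USERNAME"]
#     PASSWORD = os.environ["WP_PASSWORD"]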
def remove_special_characters(string: str) -> str:
    """Remove special characters from a string using a regular expression"""
    return re.sub(r"[^\w\s]|\?|\!", "", string)
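# Example (illustrative): remove_special_characters("Olá, mundo!") returns
# "Olá mundo"; punctuation is stripped while letters, digits and spaces are kept.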
def page_url() -> str:
    """Prompt the user to enter a page URL (helper; not called anywhere below)"""
    url = input("Insira a URL da página: ")
    return url
@bot.message_handler(commands=['start', 'help'])
def send_welcome(message):
    """Send a welcome message to the user"""
    bot.reply_to(message, "Olá! Eu sou um bot que pode baixar o título e o conteúdo de uma página da web e salvar em um arquivo de texto. Para usar, envie a URL da página para mim.")
@bot.message_handler(func=lambda message: True)
def download_page(message):
    """Download the page title and content and save them to a text file"""
    # Get the URL from the message sent by the user
    url = message.text
    # Make a GET request to the site
    response = requests.get(url)
    # If the request was not successful, report the status code
    if response.status_code != 200:
        print("Error:", response.status_code)
    # Extract the HTML content from the response
    html = response.text
    # Create a BeautifulSoup object from the HTML
    soup = BeautifulSoup(html, 'html.parser')
    # Find the first <h1> element in the HTML
    title_element = soup.find('h1')
    # Get the text content of the <h1> element
    title = title_element.text  # type: ignore
    # Remove special characters from the title
    title_mod = remove_special_characters(title)

    # Create a directory named after the page title and change into it
    def create_dir():
        current_dir = os.path.dirname(__file__)
        folder_path = os.path.join(current_dir, title_mod)
        # Try to create the directory
        try:
            os.mkdir(folder_path)
        # If the directory already exists, just reuse it
        except FileExistsError:
            pass
        except PermissionError:
            print("You do not have permission to create the directory.")
            return
        # Change into the directory
        os.chdir(folder_path)

    create_dir()
    # Create an empty list to store topics
    topics = []
    # Loop through all <h3> elements and extract the text from each one
    topics_element = soup.find_all('h3')
    for topic_element in topics_element:
        topic = topic_element.text
        topics.append(topic)
    # Collect the first <img> element of each article container
    imagens = []
    imagens_element = soup.find_all("div", class_="main")
    for imagem_element in imagens_element:
        imagens.append(imagem_element.find('img'))
    # Open the file in write mode
    text_file = open("page_content.txt", "w", encoding="utf-8")
    # Write the title to the file
    text_file.write(title + "\n")
    # Loop through the topics and write each one to the file
    for topic in topics:
        text_file.write(topic + "\n")
    # Close the file
    text_file.close()
    # Get all <p> elements from the HTML
    paragraphs = soup.find_all('p')
    # Create a list to store the paragraph texts
    paragraph_texts = []
    # Loop through the paragraphs and extract the text from each one
    for paragraph in paragraphs:
        paragraph_text = paragraph.text
        # Append the text to the list
        paragraph_texts.append(paragraph_text)
    # Join the texts in the list into a single string
    page_content = '\n'.join(paragraph_texts)
    # Remove special characters from the page content
    page_content_mod = remove_special_characters(page_content)
    # Open the file in append mode
    text_file = open("page_content.txt", "a", encoding="utf-8")
    # Write the page content to the file
    text_file.write(page_content_mod)
    # Close the file
    text_file.close()
    # Send a message to the user indicating success
    bot.send_message(
        CHAT_ID, "Acabei de extrair com sucesso o conteúdo da página e o salvei em um arquivo de texto.")
    # Search for the article content container by class
    article_content = soup.find_all("div", class_="main")
    # If the container exists, get the image data from the article content
    if article_content:
        if not os.path.exists("imagens"):
            os.mkdir("imagens")
        # Change into the images folder
        os.chdir("imagens")
        # Create a list to store the image URLs
        image_urls = []
        # Find all <img> elements in the article content
        images = article_content[0].find_all("img")
        # Loop through the images and extract the URL from each one
        for image in images:
            # Get the 'src' attribute of the image
            image_url = image['src']
            # Check whether the image URL is missing its scheme
            if not image_url.startswith('http'):
                # Add the missing scheme to the image URL
                image_url = 'http:' + image_url
            # Skip files that are not valid images (JPEG or WEBP)
            if not (image_url.endswith('.jpg') or image_url.endswith('.jpeg') or image_url.endswith('.webp')):
                continue
            # Add the URL to the list
            image_urls.append(image_url)
        # Set the image file name prefix
        image_name_prefix = "___" + unidecode.unidecode(title_mod)
        # Set the image file name extension
        image_name_extension = ".webp"
        # Set the initial value of the image counter
        image_counter = 1
        # Loop through the image URLs with a tqdm progress bar
        for image_url in tqdm(image_urls, desc="Downloading images", bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
            # Strip accented characters from the URL, then download the image
            remove_accentuation = unidecode.unidecode(image_url)
            response = requests.get(remove_accentuation)
            # Check if the request was successful
            if response.status_code == 200:
                # Read the image content
                image_content = response.content
                # Wrap the image content in a BytesIO object
                image_data = BytesIO(image_content)
                # Open the image using PIL
                image = Image.open(image_data)
                # Generate the image file name
                image_name = f"{image_counter}{image_name_prefix}{image_name_extension}".lower()
                # Save the image to a file
                image.save(image_name)
                # Increment the image counter
                image_counter += 1
            # If the request was not successful, print an error message
            else:
                print("Erro ao baixar imagem: ", response.status_code)
        # Send a message to the user indicating success
        bot.send_message(CHAT_ID, "As imagens foram baixadas com sucesso.")
    # Create an empty list to store the image file names
    images = []
    # List all files in the current directory
    for file in os.listdir():
        # Check if the file is an image
        if file.endswith(".jpg") or file.endswith(".jpeg") or file.endswith(".webp"):
            images.append(file)
    # Create an XML-RPC client
    client = xmlrpc_client.ServerProxy("https://fiquesabendo.org/xmlrpc.php")
    # Create a new post
    post = client.metaWeblog.newPost(1, USERNAME, PASSWORD, {
        'title': title,
        'description': topics,
        'post_type': 'post',
        'post_status': 'draft',
        'post_category': ['entretenimento'],
    })
    # Check the return value of the metaWeblog.newPost method
    if isinstance(post, dict):
        # A dictionary return value carries the ID of the created post
        post_id = post['post_id']
    else:
        # Otherwise the return value itself is the ID of the created post
        post_id = post
    # List all .webp files in the current directory
    webp_files = [f for f in os.listdir() if f.endswith('.webp')]
    # Iterate over the webp files
    for image in tqdm(webp_files, desc="Uploading images", bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
        # Open the image file and read its bytes
        with open(image, 'rb') as img:
            img_data = img.read()
        data = {
            'name': image,
            'type': 'image/webp',
            'bits': xmlrpc_client.Binary(img_data),
            'overwrite': True
        }
        try:
            # Upload the image via the metaWeblog API
            response = client.metaWeblog.newMediaObject(post_id, USERNAME, PASSWORD, data)
            print("Uploaded: %s" % str(response))
        except xmlrpc_client.Fault as e:
            print("Error uploading image: %s" % e)
    def extract_number(string):
        # Use a regular expression to match the number at the beginning of the string
        match = re.match(r'(\d+)___', string)
        # If a match was found, return the number as an integer
        if match:
            return int(match.group(1))
        # Otherwise, return 0
        return 0
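    # Example: extract_number("3___some-title.webp") returns 3, while a name
    # without a leading "<digits>___" prefix returns 0.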
    # Sort the image file names by their numeric prefix
    sorted_images = sorted(images, key=extract_number)
    # Topic strings carry no numeric prefix, so extract_number returns 0 for
    # all of them and this (stable) sort keeps their original document order
    sorted_topics = sorted(topics, key=extract_number)
    output = []
    today = datetime.datetime.now()
    month = today.month
    year = today.year
    for topic, image in tqdm(zip(sorted_topics, sorted_images), desc="Making post", bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
        output.append(
            f"<h4> {topic} </h4>\n <img src='https://fiquesabendo.org/wp-content/uploads/{year}/{month}/{image.replace(' ', '-')}' alt='{topic}' width='100%' height='auto'/>\n\n")
    # Join the elements in the output list into a single string
    formatted_output = "".join(output)
    # Update the post with the formatted string
    client.metaWeblog.editPost(post_id, USERNAME, PASSWORD, {
        'title': title,
        'description': formatted_output,
        'post_type': 'post',
        'post_status': 'draft',
        'post_category': ['entretenimento'],
    })
    bot.send_message(CHAT_ID, f"A Postagem '{title}' foi criada com sucesso.")
    print("Postagem realizada com sucesso!")
# Start the bot
bot.polling()
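# Usage: run the script, send /start to the bot on Telegram for instructions,
# then send it the URL of a page; progress and results are reported in CHAT_ID.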