Skip to content

Instantly share code, notes, and snippets.

@ESWZY
Last active September 28, 2021 21:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ESWZY/c08b719301cbf04d26188f66185fe598 to your computer and use it in GitHub Desktop.
Save ESWZY/c08b719301cbf04d26188f66185fe598 to your computer and use it in GitHub Desktop.
# -*- coding: UTF-8 -*-
import hashlib
import json
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from telegram_news.template import (
InfoExtractor,
NewsPostman,
InfoExtractorJSON,
NewsPostmanJSON
)
from telegram_news.utils import xml_to_json
# Three required settings, all read from environment variables.
def _require_env(name):
    """Return the value of environment variable *name*.

    The settings below are mandatory, so a missing or empty value is
    reported right away with the variable's name instead of surfacing
    later as a less obvious failure.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            "Required environment variable %s is not set" % name
        )
    return value


# Your bot token gotten from @BotFather
bot_token = _require_env("TOKEN")
# Add your bot into a channel as an administrator
channel = _require_env("CHANNEL")
# Your database to store old messages.
DATABASE_URL = _require_env("DATABASE_URL")
# Create a database session bound to a connection opened from the engine
engine = create_engine(DATABASE_URL)
db = Session(bind=engine.connect())
# ---- Source 1: Wikinews -------------------------------------------------
# The news source
url = "https://en.wikinews.org/wiki/Main_Page"
tag = "Wiki News"
table_name = "wikinews"

# Info extractor to process data format
ie = InfoExtractor()
# Select elements by CSS-based selectors
ie.set_list_selector("#MainPage_latest_news_text > ul > li")
ie.set_title_selector("#firstHeading")
ie.set_paragraph_selector("#mw-content-text > div > p:not(p:nth-child(1))")
ie.set_time_selector("#mw-content-text > div > p:nth-child(1) > strong")
ie.set_source_selector("span.sourceTemplate")
# Set a max length for a post; the maximum is 4096
ie.max_post_length = 2000

# News postman to manage sending affairs
np = NewsPostman(
    listURLs=[url],
    sendList=[channel],
    tag=tag,
    db=db,
)
np.set_bot_token(bot_token)
np.set_extractor(ie)
np.set_table_name(table_name)
#############################################################
# ---- Source 2: cnBeta ---------------------------------------------------
url_2 = "https://www.cnbeta.com/"
tag_2 = "cnBeta"
table_name_2 = "cnbetanews"

ie_2 = InfoExtractor()
ie_2.set_list_selector('.items-area > div > dl > dt > a')
ie_2.set_title_selector('header > h1')
# Select many targets at the same time (comma-separated selector list)
ie_2.set_paragraph_selector(
    'div.cnbeta-article-body > div.article-summary > p, '  # Summary only
    'div.cnbeta-article-body > div.article-content > p'  # Content only
)
ie_2.set_time_selector('header > div > span:nth-child(1)')
ie_2.set_source_selector('header > div > span.source')
# Select image to display, then the max length is down to 1024
ie_2.set_image_selector(
    'div.cnbeta-article-body > div.article-summary > p img, '  # From summary only
    'div.cnbeta-article-body > div.article-content > p img'  # From content only
)
ie_2.max_post_length = 1000

np_2 = NewsPostman(listURLs=[url_2], sendList=[channel], tag=tag_2, db=db)
# Configure the token explicitly, exactly as the first postman does, so this
# postman does not depend on whatever default the library falls back to.
np_2.set_bot_token(bot_token)
np_2.set_extractor(ie_2)
np_2.set_table_name(table_name_2)
#############################################################
# ---- Source 3: SCMP feed, handled by the JSON extractor -----------------
ie_3 = InfoExtractorJSON()

url_3 = "https://www.scmp.com/rss/91/feed"
tag_3 = "SCMP"
table_name_3 = "scmpnews"
# Pre-process the XML string, convert to JSON string
def list_pre_process(text):
    """Convert the fetched XML text into a JSON string.

    The output of ``xml_to_json`` is parsed and then re-serialized with the
    standard ``json`` module, so the returned value is always a string in
    ``json.dumps`` form.

    Args:
        text: XML document text (presumably the raw feed response handed
            over by the extractor -- verify against telegram_news).

    Returns:
        str: JSON text produced from the XML input.
    """
    parsed = json.loads(xml_to_json(text))
    return json.dumps(parsed)
# Register the XML -> JSON conversion defined above
ie_3.set_list_pre_process_policy(list_pre_process)

# Route by key list: each router is a path of keys into the JSON data
ie_3.set_list_router(["rss", "channel", "item"])
ie_3.set_link_router(["link"])
ie_3.set_title_router(["title"])
ie_3.set_paragraphs_router(["description"])
ie_3.set_time_router(["pubDate"])
ie_3.set_source_router(["author"])
ie_3.set_image_router(["media:thumbnail", "@url"])
# Customize ID for news item
def id_policy(link):
    """Return the ID of a news item: the MD5 hex digest of its link.

    MD5 serves only as a compact fingerprint here, not for security. The
    algorithm and the UTF-8 encoding must stay as they are: changing either
    would produce different IDs for links that were already processed.

    Args:
        link: The item's link as a ``str``.

    Returns:
        str: 32-character lowercase hexadecimal digest.
    """
    digest = hashlib.md5(link.encode("utf-8"))
    return digest.hexdigest()
# Use the link-hash function defined above to build item IDs
ie_3.set_id_policy(id_policy)

np_3 = NewsPostmanJSON(listURLs=[url_3], sendList=[channel], db=db, tag=tag_3)
# Configure the token explicitly, exactly as the first postman does, so this
# postman does not depend on whatever default the library falls back to.
np_3.set_bot_token(bot_token)
np_3.set_extractor(ie_3)
np_3.set_table_name(table_name_3)
if __name__ == '__main__':
    # Start each configured source, in the order they were defined.
    # NOTE(review): whether poll() blocks or returns immediately is decided
    # by telegram_news -- confirm before relying on all three being started.
    np.poll()
    np_2.poll()
    np_3.poll()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment