# ESWZY/telegram_news_example.py

## telegram_news_example.py
# -*- coding: UTF-8 -*-
import hashlib
import json
import os

from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from telegram_news.template import (
    InfoExtractor,
    NewsPostman,
    InfoExtractorJSON,
    NewsPostmanJSON
)
from telegram_news.utils import xml_to_json

# Three required settings, all read from environment variables:
# The bot token obtained from @BotFather (environment variable TOKEN)
bot_token = os.getenv("TOKEN")

# The target channel (passed to the postmen as sendList);
# add your bot to this channel as an administrator
channel = os.getenv("CHANNEL")

# The database used to store old messages (environment variable DATABASE_URL)
DATABASE_URL = os.getenv("DATABASE_URL")

# Create a database session bound to a connection opened from the engine.
# NOTE: os.getenv returns None for an unset variable, in which case None is
# passed to create_engine here -- make sure DATABASE_URL is set.
engine = create_engine(DATABASE_URL)
db = Session(bind=engine.connect())

# The news source: page URL, tag and table name handed to the postman below
url = "https://en.wikinews.org/wiki/Main_Page"
tag = "Wiki News"
table_name = "wikinews"

# Info extractor used to process the data format
ie = InfoExtractor()

# Select elements with CSS-based selectors
ie.set_list_selector('#MainPage_latest_news_text > ul > li')
ie.set_title_selector('#firstHeading')
ie.set_paragraph_selector('#mw-content-text > div > p:not(p:nth-child(1))')
ie.set_time_selector('#mw-content-text > div > p:nth-child(1) > strong')
ie.set_source_selector('span.sourceTemplate')

# Set a maximum length for a post; the upper limit is 4096
ie.max_post_length = 2000

# News postman that manages the sending of news
np = NewsPostman(listURLs=[url, ], sendList=[channel, ], db=db, tag=tag)
np.set_bot_token(bot_token)
np.set_extractor(ie)
np.set_table_name(table_name)

#############################################################

# Second news source: cnBeta
url_2 = "https://www.cnbeta.com/"
tag_2 = "cnBeta"
table_name_2 = "cnbetanews"

ie_2 = InfoExtractor()
ie_2.set_list_selector('.items-area > div > dl > dt > a')
ie_2.set_title_selector('header > h1')

# Select several targets at once: the two string literals are concatenated
# into a single comma-separated selector
ie_2.set_paragraph_selector('div.cnbeta-article-body > div.article-summary > p, '   # Summary only
                            'div.cnbeta-article-body > div.article-content > p')    # Content only
ie_2.set_time_selector('header > div > span:nth-child(1)')
ie_2.set_source_selector('header > div > span.source')

# Select images to display; with an image the maximum length drops to 1024
ie_2.set_image_selector('div.cnbeta-article-body > div.article-summary > p img, '   # From summary only
                        'div.cnbeta-article-body > div.article-content > p img')    # From content only
ie_2.max_post_length = 1000

# Postman for cnBeta; it reuses the same channel and database session
np_2 = NewsPostman(listURLs=[url_2, ], sendList=[channel], tag=tag_2, db=db)
np_2.set_extractor(ie_2)
np_2.set_table_name(table_name_2)

#############################################################

# Third news source: the SCMP RSS feed, handled by the JSON-based extractor
url_3 = "https://www.scmp.com/rss/91/feed"
tag_3 = "SCMP"
table_name_3 = "scmpnews"

ie_3 = InfoExtractorJSON()

# Pre-process the XML string, convert to JSON string
def list_pre_process(text):
    """Turn the raw XML feed text into a JSON string.

    The text is handed to ``xml_to_json``, the result is parsed with
    ``json.loads`` and the parsed object is serialized again with
    ``json.dumps``.

    Args:
        text: The XML document as received from the feed.

    Returns:
        The JSON string produced by ``json.dumps``.
    """
    converted = xml_to_json(text)
    parsed = json.loads(converted)
    return json.dumps(parsed)

# Register the XML-to-JSON conversion as the list pre-process policy
ie_3.set_list_pre_process_policy(list_pre_process)

# Route to each field by a list of keys
# (presumably a key path into the converted JSON -- confirm against the library docs)
ie_3.set_list_router(['rss', 'channel', 'item'])
ie_3.set_link_router(['link'])
ie_3.set_title_router(['title'])
ie_3.set_paragraphs_router(['description'])
ie_3.set_time_router(['pubDate'])
ie_3.set_source_router(['author'])
ie_3.set_image_router(['media:thumbnail', '@url'])

# Customize ID for news item
def id_policy(link):
    """Build a news item ID from its link.

    Args:
        link: The link string; it is encoded as UTF-8 before hashing.

    Returns:
        The MD5 digest of the encoded link as a 32-character hexadecimal
        string.
    """
    hasher = hashlib.md5()
    hasher.update(link.encode("utf-8"))
    return hasher.hexdigest()

# Register id_policy as the extractor's ID policy
ie_3.set_id_policy(id_policy)

# Postman for SCMP, using the JSON variant of the postman class
np_3 = NewsPostmanJSON(listURLs=[url_3], sendList=[channel], db=db, tag=tag_3)
np_3.set_extractor(ie_3)
np_3.set_table_name(table_name_3)

if __name__ == '__main__':
    # When run as a script, call poll() on each postman in turn
    np.poll()
    np_2.poll()
    np_3.poll()
	# -*- coding: UTF-8 -*-
	import hashlib
	import json
	import os

	from sqlalchemy import create_engine
	from sqlalchemy.orm import Session
	from telegram_news.template import (
	InfoExtractor,
	NewsPostman,
	InfoExtractorJSON,
	NewsPostmanJSON
	)
	from telegram_news.utils import xml_to_json

	# Three required settings, all read from environment variables:
	# The bot token obtained from @BotFather (environment variable TOKEN)
	bot_token = os.getenv("TOKEN")

	# The target channel (passed to the postmen as sendList);
	# add your bot to this channel as an administrator
	channel = os.getenv("CHANNEL")

	# The database used to store old messages (environment variable DATABASE_URL)
	DATABASE_URL = os.getenv("DATABASE_URL")

	# Create a database session bound to a connection opened from the engine.
	# NOTE: os.getenv returns None for an unset variable, in which case None is
	# passed to create_engine here -- make sure DATABASE_URL is set.
	engine = create_engine(DATABASE_URL)
	db = Session(bind=engine.connect())

	# The news source: page URL, tag and table name handed to the postman below
	url = "https://en.wikinews.org/wiki/Main_Page"
	tag = "Wiki News"
	table_name = "wikinews"

	# Info extractor used to process the data format
	ie = InfoExtractor()

	# Select elements with CSS-based selectors
	ie.set_list_selector('#MainPage_latest_news_text > ul > li')
	ie.set_title_selector('#firstHeading')
	ie.set_paragraph_selector('#mw-content-text > div > p:not(p:nth-child(1))')
	ie.set_time_selector('#mw-content-text > div > p:nth-child(1) > strong')
	ie.set_source_selector('span.sourceTemplate')

	# Set a maximum length for a post; the upper limit is 4096
	ie.max_post_length = 2000

	# News postman that manages the sending of news
	np = NewsPostman(listURLs=[url, ], sendList=[channel, ], db=db, tag=tag)
	np.set_bot_token(bot_token)
	np.set_extractor(ie)
	np.set_table_name(table_name)

	#############################################################

	# Second news source: cnBeta
	url_2 = "https://www.cnbeta.com/"
	tag_2 = "cnBeta"
	table_name_2 = "cnbetanews"

	ie_2 = InfoExtractor()
	ie_2.set_list_selector('.items-area > div > dl > dt > a')
	ie_2.set_title_selector('header > h1')

	# Select several targets at once: the two string literals are concatenated
	# into a single comma-separated selector
	ie_2.set_paragraph_selector('div.cnbeta-article-body > div.article-summary > p, ' # Summary only
	'div.cnbeta-article-body > div.article-content > p') # Content only
	ie_2.set_time_selector('header > div > span:nth-child(1)')
	ie_2.set_source_selector('header > div > span.source')

	# Select images to display; with an image the maximum length drops to 1024
	ie_2.set_image_selector('div.cnbeta-article-body > div.article-summary > p img, ' # From summary only
	'div.cnbeta-article-body > div.article-content > p img') # From content only
	ie_2.max_post_length = 1000

	# Postman for cnBeta; it reuses the same channel and database session
	np_2 = NewsPostman(listURLs=[url_2, ], sendList=[channel], tag=tag_2, db=db)
	np_2.set_extractor(ie_2)
	np_2.set_table_name(table_name_2)

	#############################################################

	# Third news source: the SCMP RSS feed, handled by the JSON-based extractor
	url_3 = "https://www.scmp.com/rss/91/feed"
	tag_3 = "SCMP"
	table_name_3 = "scmpnews"

	ie_3 = InfoExtractorJSON()

	# Pre-process the XML string, convert to JSON string
	def list_pre_process(text):
		"""Convert the raw XML feed text into a JSON string.

		The text is passed through ``xml_to_json``, parsed with
		``json.loads`` and serialized again with ``json.dumps``.

		Args:
			text: The XML document as received from the feed.

		Returns:
			The JSON string produced by ``json.dumps``.
		"""
		# The body must be indented under the def; without it the
		# function is a syntax error.
		text = json.loads(xml_to_json(text))
		return json.dumps(text)

	# Register the XML-to-JSON conversion as the list pre-process policy
	ie_3.set_list_pre_process_policy(list_pre_process)

	# Route to each field by a list of keys
	# (presumably a key path into the converted JSON -- confirm against the library docs)
	ie_3.set_list_router(['rss', 'channel', 'item'])
	ie_3.set_link_router(['link'])
	ie_3.set_title_router(['title'])
	ie_3.set_paragraphs_router(['description'])
	ie_3.set_time_router(['pubDate'])
	ie_3.set_source_router(['author'])
	ie_3.set_image_router(['media:thumbnail', '@url'])

	# Customize ID for news item
	def id_policy(link):
		"""Build a news item ID from its link.

		Args:
			link: The link string; it is encoded as UTF-8 before hashing.

		Returns:
			The MD5 digest of the encoded link as a 32-character
			hexadecimal string.
		"""
		# The return statement must be indented under the def; without
		# it the function is a syntax error.
		return hashlib.md5(link.encode("utf-8")).hexdigest()

	# Register id_policy as the extractor's ID policy
	ie_3.set_id_policy(id_policy)

	# Postman for SCMP, using the JSON variant of the postman class
	np_3 = NewsPostmanJSON(listURLs=[url_3], sendList=[channel], db=db, tag=tag_3)
	np_3.set_extractor(ie_3)
	np_3.set_table_name(table_name_3)

	if __name__ == '__main__':
		# When run as a script, call poll() on each postman in turn.
		# These calls must be indented under the guard to be valid.
		np.poll()
		np_2.poll()
		np_3.poll()