Skip to content

Instantly share code, notes, and snippets.

@jh0ker
Created April 9, 2021 00:07
Show Gist options
  • Save jh0ker/6aaf581c484a2008e03bea91df012396 to your computer and use it in GitHub Desktop.
Tokenize Tweets using entities provided by Twitter API
"""
The original reason I wrote this.
Also provides a more real-world example on how to use the result.
Uses python-telegram-bot for sending messages to Telegram.
"""
import html
from typing import List
from telegram import Bot
from tokenizer import Kind, Token, make_api_client, tokenize
TELEGRAM_TOKEN = ""
def render_for_telegram(tokens: List[Token]) -> str:
    """
    Render a tokenized tweet as Telegram-safe HTML.

    Media URLs are dropped, regular URLs show their display_url but link to
    the expanded_url, and user mentions become links to the mentioned user's
    Twitter profile. All other text is HTML-escaped so it cannot break
    Telegram's HTML parse_mode.
    See https://core.telegram.org/bots/api#html-style
    """
    parts: List[str] = []
    for token in tokens:
        if token.kind == Kind.USER_MENTION:
            # Link the mention to the user's Twitter profile.
            # Escape the visible text defensively (a stray '<' or '&' would
            # otherwise produce invalid HTML).
            screen_name = token.data["screen_name"]
            parts.append(
                f'<a href="https://twitter.com/{screen_name}">'
                f"{html.escape(token.text)}</a>"
            )
        elif token.kind == Kind.URL:
            # Instead of ugly t.co shortlinks, show the display_url and link
            # to the expanded_url.
            # Fix: both values come from the API response and may contain
            # '&', '<' or '"' — escape them so the generated markup stays
            # well-formed (html.escape quotes '"' by default).
            display_url = html.escape(token.data["display_url"])
            expanded_url = html.escape(token.data["expanded_url"])
            parts.append(f'<a href="{expanded_url}">{display_url}</a>')
        elif token.kind == Kind.MEDIA:
            # Media URLs are intentionally omitted from the rendered text.
            pass
        else:
            # Escape all remaining text to prevent HTML parsing errors.
            parts.append(html.escape(token.text))
    # join instead of repeated '+=' — linear instead of quadratic growth
    return "".join(parts)
# Demo script: fetch a tweet, tokenize it, render it for Telegram and send it.
# Requires network access plus valid Twitter credentials and TELEGRAM_TOKEN.
if __name__ == "__main__":
    api = make_api_client()

    # Tweet with some entities and special characters (Emoji, <, > and &)
    tweet = api.get_status("1265545575203774465", tweet_mode="extended")
    # Tweet with a url entity
    # tweet = api.get_status("1349440375711215618", tweet_mode="extended")
    # Tweet with media
    # tweet = api.get_status("903108707571171328", tweet_mode="extended")

    tokens = tokenize(tweet)

    print("Custom render of tweet for Telegram:")
    print(render_for_telegram(tokens))

    # Make Telegram bot instance
    bot = Bot(TELEGRAM_TOKEN)

    # Send rendered message to Telegram with HTML parse_mode enabled.
    # NOTE(review): chat_id is hard-coded to the gist author's chat —
    # replace with your own chat id before running.
    bot.send_message(
        chat_id="10049375",
        text=render_for_telegram(tokens),
        parse_mode="HTML",
        disable_web_page_preview=True,
    )
from typing import List
import html
from dataclasses import dataclass
from enum import Enum
import tweepy
API_KEY = ""
API_SECRET = ""
### Utility classes ###
class Kind(Enum):
    """
    Describes the kind of entity/token.
    String value follows Twitter API names.
    """

    # Regular text between entities — not an API entity kind
    TEXT = "text"
    # The remaining values match the keys of the `entities` dict in the
    # Twitter API response, so Kind(entity_kind) maps a response key
    # directly to an enum member.
    HASHTAG = "hashtags"
    SYMBOL = "symbols"
    USER_MENTION = "user_mentions"
    URL = "urls"
    MEDIA = "media"
@dataclass
class Entity:
    """ Extracted entity from Tweet JSON """

    # Raw entity dict from the API response; includes an "indices" pair
    # giving the entity's [start, end) position in the tweet text
    data: dict
    # Which entity list of the response this came from (hashtag, url, ...)
    kind: Kind
@dataclass
class Token:
    """ Part of the tokenized Tweet """

    # The slice of the tweet text this token covers
    text: str
    # Original entity dict from the API response; None for plain-text tokens
    data: dict
    # What this token is (plain text, hashtag, mention, url, media, ...)
    kind: Kind
### Algorithm for tokenizing tweet ###
def entity_sorter(entity: Entity) -> int:
    """Sort key for entities: the index where the entity starts in the tweet text."""
    start_index = entity.data["indices"][0]
    return start_index
def get_entities(tweet: tweepy.Status) -> List[Entity]:
    """
    Collect every entity of the tweet into a single flat list.

    Entities of all kinds are wrapped in Entity objects — each tagged with
    the Kind of the response list it came from — and returned sorted by
    their start index in the tweet text.
    """
    collected: List[Entity] = []
    for kind_name, entity_list in tweet.entities.items():
        # The response keys ("hashtags", "urls", ...) are exactly the Kind
        # enum values, so Kind(kind_name) recovers the matching member.
        kind = Kind(kind_name)
        collected.extend(
            Entity(data=entity_data, kind=kind) for entity_data in entity_list
        )
    collected.sort(key=entity_sorter)
    return collected
def tokenize(tweet: tweepy.Status) -> List[Token]:
    """
    Break the tweet text into Tokens using the entities from the API response.

    Each token carries the text of that part, the original entity data from
    the API response (None for plain text) and the kind (regular text,
    hashtag, mention etc.).
    """
    full_text = tweet.full_text
    tokens: List[Token] = []
    cursor = 0  # index in full_text up to which tokens have been emitted

    for entity in get_entities(tweet):
        start, end = entity.data["indices"]

        # Emit a plain-text token for any gap between the previous entity
        # (or the start of the tweet) and this entity.
        if cursor != start:
            tokens.append(
                Token(
                    html.unescape(full_text[cursor:start]),
                    data=None,
                    kind=Kind.TEXT,
                )
            )

        # Turn the entity itself into a token by slicing out its text.
        tokens.append(
            Token(
                text=full_text[start:end],
                data=entity.data,
                kind=entity.kind,
            )
        )
        cursor = end

    # Emit the trailing text when the tweet doesn't end with an entity
    # (which also covers tweets without any entities at all).
    if cursor < len(full_text):
        tokens.append(
            Token(
                text=html.unescape(full_text[cursor:]),
                data=None,
                kind=Kind.TEXT,
            )
        )

    return tokens
### Demo ###
def render_simple(tokens: List[Token]) -> str:
    """
    Most basic example of rendering text from a list of tokens.
    Should always re-create the original text of the tweet.
    """
    pieces = [token.text for token in tokens]
    return "".join(pieces)
def make_api_client():
    """Create a tweepy API client using app-only authentication."""
    auth_handler = tweepy.AppAuthHandler(API_KEY, API_SECRET)
    return tweepy.API(auth_handler)
# Demo script: fetch a tweet over the network, tokenize it and print both the
# raw tokens and the re-assembled text. Requires valid Twitter credentials.
if __name__ == "__main__":
    api = make_api_client()

    # Tweet with some entities and special characters (Emoji, <, > and &)
    tweet = api.get_status("1265545575203774465", tweet_mode="extended")
    # Tweet with no entities at all
    # tweet = api.get_status("1379120489474498560", tweet_mode="extended")

    # Run tweet tokenizer
    tokens = tokenize(tweet)

    # See what the tokens look like
    for token in tokens:
        print(token, "\n")

    print("Simple render of tweet:")
    print(render_simple(tokens))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment