Skip to content

Instantly share code, notes, and snippets.

@mildsunrise
Last active November 1, 2021 16:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mildsunrise/d176273b9bdfb8e5f056c7758a4feb0d to your computer and use it in GitHub Desktop.
Save mildsunrise/d176273b9bdfb8e5f056c7758a4feb0d to your computer and use it in GitHub Desktop.
🔎 Parse tweet IDs / URLs
#!/usr/bin/env python3
"""
Simple parser for Twitter-generated IDs and tweet URLs.
ID model taken from Snowflake, which is no longer used:
https://github.com/twitter-archive/snowflake/blob/b3f6a3c6ca8e1b6847baa6ff42bf72201e2c2231/src/main/scala/com/twitter/service/snowflake/IdWorker.scala
Twitter IDs are 64-bit, and have the following structure:
- Bits 63 - 22: timestamp (in ms)
- Bits 21 - 17: datacenter ID
- Bits 16 - 12: worker ID
- Bits 11 - 0: sequence number
Usual datacenter values:
- 11: seems to be US / europe?
- 10: seems to be asia?
"""
from typing import NamedTuple, Union
from datetime import datetime, timezone
from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode
import re
# Unix time (in milliseconds) that an all-zero timestamp field stands for.
EPOCH = 1288834974657


class TwitterID(NamedTuple):
    """The individual fields packed into a 64-bit Twitter-generated ID.

    Build one from a raw integer with :meth:`parse`; turn it back into the
    raw integer with :meth:`format`.
    """

    # Unix time in milliseconds (EPOCH already added); the offset from EPOCH
    # has to fit in 42 bits for format() to accept it.
    timestamp: int
    datacenter_id: int  # 5-bit field
    worker_id: int  # 5-bit field
    sequence: int = 0  # 12-bit field

    @property
    def datetime(self):
        """The timestamp as a timezone-aware (UTC) ``datetime``."""
        unix_seconds = self.timestamp / 1000
        return datetime.fromtimestamp(unix_seconds, timezone.utc)

    @classmethod
    def parse(cls, id: int):
        """Split the raw 64-bit integer `id` into its fields.

        Asserts that `id` is an int in the range [0, 2**64).
        """
        assert isinstance(id, int) and 0 <= id < (1 << 64)

        low_5_bits = (1 << 5) - 1
        low_12_bits = (1 << 12) - 1

        # Everything above bit 21 is the millisecond offset from EPOCH.
        unix_ms = (id >> 22) + EPOCH
        datacenter = (id >> 17) & low_5_bits
        worker = (id >> 12) & low_5_bits
        counter = id & low_12_bits

        return cls(
            timestamp=unix_ms,
            datacenter_id=datacenter,
            worker_id=worker,
            sequence=counter,
        )

    def format(self) -> int:
        """Pack the fields back into the raw 64-bit integer.

        Asserts that every field fits in its allotted number of bits.
        """
        since_epoch = self.timestamp - EPOCH
        assert (1 << 42) > since_epoch >= 0
        assert (1 << 5) > self.datacenter_id >= 0
        assert (1 << 5) > self.worker_id >= 0
        assert (1 << 12) > self.sequence >= 0

        # Append the fields from most to least significant, making room for
        # each one by shifting the accumulated value left by the field width.
        packed = since_epoch
        packed = (packed << 5) | self.datacenter_id
        packed = (packed << 5) | self.worker_id
        packed = (packed << 12) | self.sequence
        return packed
def parse_tweet_url(x: str):
    """Break a tweet URL into ``(username, TwitterID, query dict)``.

    The URL must use http or https, point at twitter.com (optionally with a
    ``www.`` or ``mobile.`` prefix) and have a path that starts with
    ``/<username>/status/<digits>``. Anything else raises ``Exception``.
    The query dict comes from ``parse_qs`` with blank values kept.
    """
    allowed_schemes = {"http", "https"}
    allowed_hosts = {"twitter.com", "www.twitter.com", "mobile.twitter.com"}

    parts = urlsplit(x)
    match = re.match("/([^/]+)/status/(\\d+)(?:/|$)", parts.path)
    if (
        not match
        or parts.scheme not in allowed_schemes
        or parts.netloc.lower() not in allowed_hosts
    ):
        raise Exception(f"Not a tweet URL: {repr(x)}")

    username = match.group(1)
    id_digits = match.group(2)
    tid = TwitterID.parse(int(id_digits))
    # Re-formatting must reproduce the digits exactly as they appeared in the URL.
    assert str(tid.format()) == id_digits

    query = parse_qs(parts.query, True)
    return username, tid, query
def format_tweet_url(username: str, id: Union[int, 'TwitterID'], query=None, host='twitter.com', scheme='https'):
    """Build a tweet URL of the form ``scheme://host/<username>/status/<id>``.

    Args:
        username: Account name; must be non-empty and contain no ``/``.
        id: The tweet ID, either as a raw integer or as a ``TwitterID``
            (which is converted with its ``format()`` method).
        query: Optional query string. May be an already-encoded string, or a
            mapping / sequence of pairs that is encoded with ``urlencode``.
            List values (as produced by ``parse_qs``) become repeated keys.
        host: Host name to put in the URL.
        scheme: URL scheme to put in the URL.

    Returns:
        The assembled URL as a string.
    """
    assert re.fullmatch(r'[^/]+', username)

    if query is None:
        query_string = ''
    elif isinstance(query, str):
        query_string = query
    else:
        # doseq=True so that {'k': ['a', 'b']} encodes as k=a&k=b instead of
        # the repr of the list; scalar values are still encoded as usual.
        query_string = urlencode(query, doseq=True)

    # Plain ints are checked first; only other objects are compared
    # against TwitterID.
    if isinstance(id, int):
        tweet_id = id
    elif isinstance(id, TwitterID):
        tweet_id = id.format()
    else:
        tweet_id = id  # anything else is interpolated as-is

    return urlunsplit((scheme, host, f'/{username}/status/{tweet_id}', query_string, ''))
if __name__ == '__main__':
    # Command-line entry point: takes exactly one tweet URL and prints its
    # username, the decoded ID fields and the parsed query parameters.
    import sys

    args = sys.argv[1:]
    if len(args) != 1:
        print('Usage: ./tweet.py [URL]', file=sys.stderr)
        # sys.exit instead of the bare exit() helper, which is only injected
        # by the `site` module and is missing under `python -S`.
        sys.exit(1)

    username, tid, query = parse_tweet_url(args[0])
    # Human-readable labels for the datacenter IDs this script knows about.
    datacenter_name = { 11: 'USA / Europe', 10: 'Asia' }.get(tid.datacenter_id, 'unknown')

    print('Username:', repr(username))
    print('ID:', tid.format())
    print(' - Timestamp:', tid.datetime)
    print(' - Datacenter:', f'{tid.datacenter_id} ({datacenter_name})')
    print(' - Worker ID:', tid.worker_id)
    print(' - Sequence:', tid.sequence)
    print('Query:', query)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment