Last active
November 1, 2021 16:21
-
-
Save mildsunrise/d176273b9bdfb8e5f056c7758a4feb0d to your computer and use it in GitHub Desktop.
π Parse tweet IDs / URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Simple parser for Twitter-generated IDs and tweet URLs. | |
ID model taken from Snowflake, which is no longer used: | |
https://github.com/twitter-archive/snowflake/blob/b3f6a3c6ca8e1b6847baa6ff42bf72201e2c2231/src/main/scala/com/twitter/service/snowflake/IdWorker.scala | |
Twitter IDs are 64-bit, and have the following structure: | |
- Bits 63 - 22: timestamp (in ms) | |
- Bits 21 - 17: datacenter ID | |
- Bits 16 - 12: worker ID
- Bits 11 - 0: sequence number
Usual datacenter values: | |
- 11: seems to be US / europe? | |
- 10: seems to be asia? | |
""" | |
from typing import NamedTuple, Union | |
from datetime import datetime, timezone | |
from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode | |
import re | |
# Unix time, in milliseconds, that an ID timestamp field of 0 stands for
# (2010-11-04T01:42:54.657Z).
EPOCH = 1_288_834_974_657
class TwitterID(NamedTuple):
    """Decoded fields of a 64-bit Twitter-generated ID.

    Use parse() to split an integer ID into fields and format() to pack
    the fields back into an integer.
    """

    timestamp: int      # Unix time in ms; (timestamp - EPOCH) occupies 42 bits
    datacenter_id: int  # 5 bits
    worker_id: int      # 5 bits
    sequence: int = 0   # 12 bits

    @property
    def datetime(self):
        """Return the timestamp as a timezone-aware UTC datetime."""
        seconds = self.timestamp / 1000  # stored value is in milliseconds
        return datetime.fromtimestamp(seconds, timezone.utc)

    @classmethod
    def parse(cls, id: int):
        """Split the integer `id` into a TwitterID.

        Asserts that `id` is an int in the range [0, 2**64).
        """
        assert isinstance(id, int) and 0 <= id <= 0xFFFF_FFFF_FFFF_FFFF

        def low_bits(value, width):
            # Keep only the `width` least significant bits of `value`.
            return value & ((1 << width) - 1)

        return cls(
            timestamp=(id >> 22) + EPOCH,
            datacenter_id=low_bits(id >> 17, 5),
            worker_id=low_bits(id >> 12, 5),
            sequence=low_bits(id, 12),
        )

    def format(self) -> int:
        """Pack the fields into the integer ID.

        Asserts that every field fits in its bit width.
        """
        ticks = self.timestamp - EPOCH
        assert 0 <= ticks < (1 << 42)
        assert 0 <= self.datacenter_id < (1 << 5)
        assert 0 <= self.worker_id < (1 << 5)
        assert 0 <= self.sequence < (1 << 12)

        # Append each field below the previous one, most significant first.
        packed = ticks
        packed = (packed << 5) | self.datacenter_id
        packed = (packed << 5) | self.worker_id
        packed = (packed << 12) | self.sequence
        return packed
def parse_tweet_url(x: str):
    ''' Split a tweet URL into (username, TwitterID, query dictionary).

    The URL must use http or https, point at twitter.com, www.twitter.com or
    mobile.twitter.com, and have a path starting with /<username>/status/<digits>.
    Anything else raises Exception. The query dictionary comes from parse_qs
    with blank values kept.
    '''
    allowed_schemes = {"http", "https"}
    allowed_hosts = {"twitter.com", "www.twitter.com", "mobile.twitter.com"}

    parts = urlsplit(x)
    match = re.match("/([^/]+)/status/(\\d+)(?:/|$)", parts.path)
    is_tweet_url = (
        match is not None
        and parts.scheme in allowed_schemes
        and parts.netloc.lower() in allowed_hosts
    )
    if not is_tweet_url:
        raise Exception(f"Not a tweet URL: {repr(x)}")

    username = match.group(1)
    id_text = match.group(2)
    tid = TwitterID.parse(int(id_text))
    # Sanity check: packing the parsed fields must reproduce the digits from the URL.
    assert str(tid.format()) == id_text
    return username, tid, parse_qs(parts.query, keep_blank_values=True)
def format_tweet_url(username: str, id: Union[int, TwitterID], query=None, host='twitter.com', scheme='https'):
    ''' Build a tweet URL of the form <scheme>://<host>/<username>/status/<id>.

    `id` is either the integer ID or a TwitterID, which is packed with its
    format() method. `query` is either a ready-made query string or a
    dictionary to URL-encode; list values (such as those in a parse_qs
    result) become repeated parameters. A missing or empty query adds no
    query part. Asserts that `username` is non-empty and contains no '/'.
    '''
    assert re.fullmatch(r'[^/]+', username)
    # isinstance() takes the object first and the type second; with the
    # arguments swapped, any truthy query raises TypeError.
    if isinstance(query, dict):
        # doseq=True expands list values into repeated parameters instead of
        # encoding the repr of the list.
        query = urlencode(query, doseq=True)
    if isinstance(id, TwitterID):
        id = id.format()
    return urlunsplit((scheme, host, f'/{username}/status/{id}', query or None, None))
# Command-line entry point: parse the tweet URL given as the only argument
# and print its username, ID fields and query.
if __name__ == '__main__':
    import sys

    args = sys.argv[1:]
    if len(args) != 1:
        print('Usage: ./tweet.py [URL]', file=sys.stderr)
        # sys.exit() instead of the bare exit(): the latter is a helper added
        # by the site module for interactive use and may not be defined.
        sys.exit(1)

    username, tid, query = parse_tweet_url(args[0])
    # Best-guess names for the datacenter IDs seen in practice.
    datacenter_name = {11: 'USA / Europe', 10: 'Asia'}.get(tid.datacenter_id, 'unknown')

    print('Username:', repr(username))
    print('ID:', tid.format())
    print(' - Timestamp:', tid.datetime)
    print(' - Datacenter:', f'{tid.datacenter_id} ({datacenter_name})')
    print(' - Worker ID:', tid.worker_id)
    print(' - Sequence:', tid.sequence)
    print('Query:', query)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment