Last active
June 21, 2019 08:08
-
-
Save edsu/2f48afe80ebc581b9c22ed04e3f84d26 to your computer and use it in GitHub Desktop.
Deconstruct tweet identifiers from the sample stream. See https://docs.google.com/document/d/1xVrPoNutyqTdQ04DXBEZW4ZW4A5RAQW2he7qIpTmG-M/mobilebasic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import twarc | |
# This small script shows how to listen to the Twitter sample stream and
# deconstruct tweet ids into their various components. The tweet_components
# function accepts a tweet id and returns a dict whose keys / values
# represent the various components of that id. Each component has its own
# function detailing how its value is extracted from the tweet id.
def sequence_id(id):
    """Return the sequence component of a tweet id.

    The sequence id occupies the 12 least-significant bits of the id, so it
    is isolated with a 12-bit AND mask.

    Args:
        id: The tweet id as an int.

    Returns:
        An int in the range 0..4095.
    """
    # (1 << 12) - 1 == 0b111111111111, i.e. twelve set bits.
    return id & ((1 << 12) - 1)
def machine_id(id):
    """Return the machine component of a tweet id.

    The machine id is the 10 bits directly above the 12-bit sequence id. It
    is the combination of the server id (lower 5 bits) and the datacenter id
    (upper 5 bits).

    Args:
        id: The tweet id as an int.

    Returns:
        An int in the range 0..1023.
    """
    # Drop the 12 sequence bits, then keep the 10 rightmost bits.
    return (id >> 12) & ((1 << 10) - 1)
def server_id(id):
    """Return the server component of a tweet id.

    The server id is stored in bits 13-17 of the id, counting from the right
    and starting at 1 (the 5 bits directly above the sequence id).

    Args:
        id: The tweet id as an int.

    Returns:
        An int in the range 0..31.
    """
    # Drop the 12 sequence bits, then keep the 5 rightmost bits.
    return (id >> 12) & ((1 << 5) - 1)
def datacenter_id(id):
    """Return the datacenter component of a tweet id.

    The datacenter id is stored in bits 18-22 of the id, counting from the
    right and starting at 1 (the 5 bits directly above the server id).

    Args:
        id: The tweet id as an int.

    Returns:
        An int in the range 0..31.
    """
    # Drop the 12 sequence bits and 5 server bits, then keep 5 bits.
    return (id >> 17) & ((1 << 5) - 1)
# Offset, in milliseconds, that is added to the timestamp bits of a tweet id
# to turn them into an epoch time (the "Snowflake offset").
SNOWFLAKE_EPOCH_MS = 1288834974657


def creation_time(id):
    """Return the creation time encoded in a tweet id, in epoch milliseconds.

    The timestamp is stored in every bit above the low 22 bits (12 sequence
    bits + 10 machine bits). Those upper bits hold a millisecond count that
    is relative to the Snowflake offset, so the offset is added back.

    Args:
        id: The tweet id as an int.

    Returns:
        An int number of milliseconds.
    """
    # Discard the 22 low (sequence + machine) bits to leave the timestamp.
    return (id >> 22) + SNOWFLAKE_EPOCH_MS
def tweet_components(tweet_id):
    """Split a tweet id into its individual components.

    Args:
        tweet_id: The tweet id, as an int or as anything ``int()`` accepts
            (e.g. a decimal string).

    Returns:
        A dict with the keys ``sequence_id``, ``machine_id``, ``server_id``,
        ``datacenter_id`` and ``creation_time_milli``, each holding the value
        returned by the matching extraction function.

    Raises:
        ValueError: If ``tweet_id`` cannot be converted by ``int()``.
    """
    # Convert to int in case a str is accidentally passed in.
    tweet_id = int(tweet_id)
    return {
        'sequence_id': sequence_id(tweet_id),
        'machine_id': machine_id(tweet_id),
        'server_id': server_id(tweet_id),
        'datacenter_id': datacenter_id(tweet_id),
        'creation_time_milli': creation_time(tweet_id),
    }
# Distinct values seen so far; filled in by tally().
datacenters = set()
machines = set()
servers = set()


def tally(s):
    """Record the components of one tweet id in the module-level sets.

    Args:
        s: A dict with at least the keys ``datacenter_id``, ``machine_id``
            and ``server_id``.

    Returns:
        None. ``datacenters``, ``machines`` and ``servers`` are updated in
        place; values already present are not added again.
    """
    datacenter = s['datacenter_id']
    machine = s['machine_id']
    server = s['server_id']

    datacenters.add(datacenter)
    # Machines and servers are stored as dash-joined strings so that each
    # entry is qualified by the ids of the levels above it.
    machines.add('{}-{}'.format(datacenter, machine))
    servers.add('{}-{}-{}'.format(datacenter, machine, server))
# Read tweets from the sample stream, deconstruct each tweet id and print the
# running counts of distinct datacenters / machines / servers followed by the
# components of the current id. Runs until interrupted with Ctrl-C, at which
# point the set of servers seen is printed.
try:
    # NOTE(review): Twarc() is built without arguments, so it presumably
    # picks up credentials from its own configuration -- confirm.
    tweets = twarc.Twarc()
    for tweet in tweets.sample():
        # need to check because deletes come across the sample stream
        if 'id' not in tweet:
            continue
        snowflake = tweet_components(tweet['id'])
        tally(snowflake)
        print('{}/{}/{} {}'.format(
            len(datacenters),
            len(machines),
            len(servers),
            snowflake
        ))
except KeyboardInterrupt:
    print('servers', servers)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment