Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active June 21, 2019 08:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save edsu/2f48afe80ebc581b9c22ed04e3f84d26 to your computer and use it in GitHub Desktop.
Save edsu/2f48afe80ebc581b9c22ed04e3f84d26 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import twarc
# This small script shows how to listen to the Twitter sample stream and
# deconstruct tweet ids into their various components. The tweet_components
# function accepts a tweet id and returns a dict whose key/value pairs
# represent the various components of the id. Each component has its own
# function showing how its value is extracted from the tweet id.
def sequence_id(id):
    """Return the sequence number stored in a tweet id.

    The sequence number occupies the 12 least-significant bits of the id,
    so it is extracted with a 12-bit AND mask.

    The parameter name shadows the builtin ``id``; it is kept so that
    existing keyword callers keep working.
    """
    return id & 0xFFF  # 0xFFF == twelve 1-bits
def machine_id(id):
    """Return the 10-bit machine id stored in a tweet id.

    The machine id is the combination of the datacenter id and the server
    id. It sits directly above the 12 sequence bits, so the id is shifted
    right by 12 and masked down to the 10 rightmost bits.
    """
    return (id >> 12) & 0x3FF  # 0x3FF == ten 1-bits
def server_id(id):
    """Return the 5-bit server id stored in a tweet id.

    The server id occupies bits 13-17 counting from the right (1-based),
    i.e. the low half of the machine id. The id is shifted right by 12 to
    drop the sequence bits and masked down to the 5 rightmost bits.
    """
    return (id >> 12) & 0x1F  # 0x1F == five 1-bits
def datacenter_id(id):
    """Return the 5-bit datacenter id stored in a tweet id.

    The datacenter id occupies bits 18-22 counting from the right
    (1-based), i.e. the high half of the machine id. The id is shifted
    right by 17 to drop the sequence and server bits and masked down to
    the 5 rightmost bits.
    """
    return (id >> 17) & 0x1F  # 0x1F == five 1-bits
def creation_time(id):
    """Return the creation time of a tweet id as Unix epoch milliseconds.

    Everything above the low 22 bits of the id (12 sequence bits plus
    10 machine bits) is a millisecond counter relative to the Snowflake
    epoch. Shifting right by 22 isolates that counter; adding the epoch
    offset turns it into a regular Unix timestamp in milliseconds.
    """
    snowflake_epoch_ms = 1288834974657  # offset applied to every Snowflake id
    return (id >> 22) + snowflake_epoch_ms
def tweet_components(tweet_id):
    """Break a tweet id into its individual components.

    Args:
        tweet_id: The tweet id, as an int or anything ``int()`` accepts
            (for example a decimal string).

    Returns:
        A dict with the keys ``sequence_id``, ``machine_id``,
        ``server_id``, ``datacenter_id`` and ``creation_time_milli``,
        inserted in that order.

    Raises:
        ValueError: If ``tweet_id`` cannot be converted to an int.
    """
    tweet_id = int(tweet_id)  # convert to int if a str is accidentally passed in
    return {
        'sequence_id': sequence_id(tweet_id),
        'machine_id': machine_id(tweet_id),
        'server_id': server_id(tweet_id),
        'datacenter_id': datacenter_id(tweet_id),
        'creation_time_milli': creation_time(tweet_id),
    }
# Distinct values seen so far; filled in by tally().
datacenters = set()
machines = set()
servers = set()


def tally(s):
    """Record the ids of one decoded tweet in the module-level sets.

    Args:
        s: A mapping with ``datacenter_id``, ``machine_id`` and
            ``server_id`` keys, such as the dict returned by
            ``tweet_components``.

    Raises:
        KeyError: If one of the three keys is missing. All keys are read
            before any set is touched, so a malformed mapping never
            leaves the sets partially updated.
    """
    datacenter = s['datacenter_id']
    machine = s['machine_id']
    server = s['server_id']

    datacenters.add(datacenter)
    machines.add('{}-{}'.format(datacenter, machine))
    servers.add('{}-{}-{}'.format(datacenter, machine, server))
# Listen to the sample stream, decode every tweet id and print a running
# count of distinct datacenters/machines/servers followed by the decoded
# components. Ctrl-C stops the stream and prints the servers seen.
try:
    tweets = twarc.Twarc()
    for tweet in tweets.sample():
        # need to check because deletes come across the sample stream
        if 'id' not in tweet:
            continue
        snowflake = tweet_components(tweet['id'])
        tally(snowflake)
        print('{}/{}/{} {}'.format(
            len(datacenters),
            len(machines),
            len(servers),
            snowflake
        ))
except KeyboardInterrupt:
    print('servers', servers)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment