Skip to content

Instantly share code, notes, and snippets.

@SamHames
Created August 9, 2022 01:22
Show Gist options
  • Save SamHames/795fb3fed71818d871c2873329396ee6 to your computer and use it in GitHub Desktop.
Save SamHames/795fb3fed71818d871c2873329396ee6 to your computer and use it in GitHub Desktop.
Suggested Schema Evolution
/*
Design notes:
- This schema is an attempt to provide affordances for analytical work, while still
representing as much as possible as we understand of the underlying data.
- All tables have defined primary keys and appropriate foreign key constraints
that will be enforced by the database
- Selective denormalisation is applied on most of the 'narrow' tables, to
allow for certain operations to be performed without a join, or to allow
indexes to support some anticipated common operations. The columns added
(where appropriate) are: user_id, created_at, to support trend and user
counts respectively.
- A set of indexes is suggested to support analytical queries
- The directly collected table is intended to capture the case when a tweet is
directly retrieved as part of an operation, so as to differentiate direct
vs referential capture. It is modelled as an append only table, that can
be joined with the main tweet table to construct the differentiation and to
simplify the insert logic.
Examples of anticipated queries and purposes are shown along with each query,
and some more extended examples below.
*/
create table if not exists user_at_time (
user_id integer,
retrieved_at datetime,
name text,
profile_image_url text,
created_at text,
protected text,
description text,
location text,
pinned_tweet_id integer references tweet(tweet_id),
verified integer, -- boolean
url text,
username text,
followers_count integer,
following_count integer,
tweet_count integer,
listed_count integer,
primary key (user_id, retrieved_at)
);
create table if not exists tweet (
/*
Contains the 'immutable' characteristics of a tweet, such as the text,
structure of the conversation, the user ID and so on.
Mutable characteristics of a tweet are captured in tweet metrics, to
allow natural capture of change over time without complicating the
main tweet table. Other aspects of tweets such as hashtags and
mentions are treated as insert only tables of attributes to handle
change over time.
Examples:
-- Count the number of tweets
select count(*) from tweet;
-- Select tweets that were 'directly' collected
select count(*)
from tweet
inner join directly_collected_tweet using(tweet_id);
-- Find the number of tweets and number of unique users
-- by month that were directly collected
select
datetime(created_at, 'start of month') as month,
count(distinct user_id) as active_users,
count(*) as tweets
from tweet
inner join directly_collected_tweet using(tweet_id);
-- Find the ten largest threads by number of tweets
select
conversation_id,
count(*) as tweet_count
from tweet
group by conversation_id
order by tweet_count desc limit 10;
*/
tweet_id integer primary key,
reply_settings text,
conversation_id text,
created_at text,
retweeted_tweet_id text references tweet (id),
quoted_tweet_id text references tweet (id),
replied_to_tweet_id text references tweet (id),
in_reply_to_user_id text references user (id),
user_id integer references user_at_time(user_id),
text text,
lang text,
source text,
possibly_sensitive integer, -- boolean
);
create table if not exists directly_collected_tweet(
/*
Records the IDs of tweets that were 'directly' collected as part of a collection.
This allows distinguishing between a tweet that was collected by matching an
API call such as a search or user timeline, from a tweet that was only quoted or
retweeted, and doesn't actually match the direct operator.
Examples:
-- The number of directly collected tweets in this collection
select
count(*)
from directly_collected;
*/
tweet_id integer primary key references tweet(id)
);
create table if not exists tweet_metric (
/*
Metrics for tweets at the time they were collected - this is most useful
for longitudinal collections, or when inserting rehydrated data to
capture engagement dynamics. This may not be useful for some bulk or
historical collections, where the time of collection is more or less
the same across the collection.
Examples:
-- Select most recent like counts for tweets - this relies
-- on documented, but quirky, SQLite behaviour
select
tweet_id,
like_count,
max(retrieved_at) as latest_retrieved
from tweet_metric
group by tweet_id
-- Select metric trends for a specific tweet
select
tweet_id,
retrieved_at,
quote_count,
retweet_count
from tweet
where tweet_id = 111111111
order by retrieved_at
*/
tweet_id integer references tweet(id),
retrieved_at datetime,
created_at datetime,
user_id integer,
like_count integer,
quote_count integer,
reply_count integer,
retweet_count integer,
primary key (tweet_id, retrieved_at),
foreign key (user_id, retrieved_at) references user_at_time
);
create table if not exists tweet_url (
/*
Share tweets associated with URLs. This table include some media
objects as well, because Twitter includes them in both the URLs
and the attachments part of the data payload.
Examples:
-- Find the top ten most common urls shared in all tweets
select
expanded_url,
count(*) as tweet_count
from tweet_url
group by expanded_url
order by tweet_count desc
limit 10;
-- Find the top 10 most retweeted domains, from the directly collected
-- portion of the collection
select
expanded_domain,
count(*) as tweet_count
from tweet_url
inner join directly_collected_tweet using(tweet_id)
inner join tweet using(tweet_id)
where retweeted_tweet_id is not null
group by expanded_domain
order by tweet_count desc
limit 10;
*/
tweet_id integer primary key references tweet(id),
url text not null, -- t.co shortened URL
expanded_url text,
display_url text,
-- Extract the expanded domain alone for easy querying/quantification
expanded_domain text,
created_at datetime,
user_id integer
);
create table if not exists user_profile_url (
user_id integer,
retrieved_at datetime,
-- This is either description or URL.
field text,
url text not null, -- t.co shortened URL
expanded_url text,
expanded_domain text,
display_url text,
primary key (user_id, retrieved_at, field)
foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at)
);
create table if not exists tweet_hashtag (
/*
Hashtags in tweets, *as collected*, but also a normalised version of the
hashtag to represent platform affordances. Twitter considers #SuperBowl
and #SuperbOwl to be the same for the purposes of search. This table
allows for both kinds of queries, one matching the hashtag as written,
the other as interpreted by Twitter.
Examples:
-- Count the number of tweets containing a hashtag and users
-- using a hashtag.
select
hashtag,
count(*) as tweet_count,
count(distinct user_id)
from tweet_hashtag
inner join directly_collected using(tweet_id)
group by hashtag;
-- Show the normalised hashtag use for January 2021
select
hashtag_normalised,
count(*)
from tweet_hashtag
where created_at between '2021-01-01 00:00:00' and '2021-02-01 00:00:00'
group by hashtag_normalised;
*/
tweet_id integer references tweet(id),
user_id integer,
created_at datetime,
hashtag text not null,
hashtag_normalised not null,
primary key (tweet_id, hashtag)
);
create table if not exists user_profile_hashtag (
user_id integer,
retrieved_at datetime,
hashtag text not null,
hashtag_normalised not null,
primary key (user_id, retrieved_at, hashtag)
foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at)
);
create table if not exists tweet_mention (
/*
User - user mentions
Examples:
-- Find the largest (directed) pairs of user mentions
select
user_id,
mentioned_user_id,
count(*) as tweet_count
from tweet_mention
group by user_id, mentioned_user_id
*/
tweet_id integer references tweet(id),
user_id integer,
username text,
created_at datetime,
mentioned_user_id integer references user_at_time(id),
mentioned_username text,
primary key (tweet_id, mentioned_user_id)
);
create table if not exists user_profile_mention (
user_id integer,
retrieved_at datetime,
hashtag text not null,
hashtag_normalised not null,
primary key (user_id, retrieved_at, hashtag)
foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at)
);
create table if not exists media (
media_key text,
url text,
preview_image_url text,
height integer,
width integer,
type text,
duration_ms integer,
view_count integer,
alt_text text,
);
create table if not exists media_metric (
media_key text references media(media_key),
retrieved_at datetime,
view_count integer,
primary key (media_key, retrieved_at)
);
create table if not exists tweet_media (
tweet_id integer references tweet,
media_key text references media,
user_id integer,
created_at datetime,
primary key (tweet_id, media_key)
);
create table if not exists _metadata (
metadata_key text primary key,
metadata_value
);
/*
Suggested supporting indexes, for each table to support some anticipated queries.
These can be built in a batch at the end of the first full run of inserting
data.
*/
-- Main tweet table
create index if not exists tweet_period on tweet(created_at);
create index if not exists tweet_conversation on tweet(conversation_id);
--
create index if not exists url_period on tweet_url(created_at);
create index if not exists url_user on tweet_url(user_id);
create index if not exists url_url on tweet_url(expanded_url);
create index if not exists url_domain on tweet_url(expanded_domain);
--
create index if not exists hashtag_tweet on tweet_hashtag(hashtag);
create index if not exists hashtag_normalised_tweet on tweet_hashtag(hashtag_normalised);
create index if not exists hashtag_period on tweet_hashtag(created_at);
--
create index if not exists user_mentioning on tweet_mention(user_id, mentioned_user_id);
create index if not exists user_mentioned on tweet_mention(mentioned_user_id, user_id);
-- Views or materialised views - this could alternatively be a materialised view
-- built by triggers or a query at the end of an insert process.
create or replace view user_latest as
select max(retrieved_at), *
from user_at_time uat
group by user_id;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment