-
-
Save SamHames/795fb3fed71818d871c2873329396ee6 to your computer and use it in GitHub Desktop.
Suggested Schema Evolution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Design notes: | |
- This schema is an attempt to provide affordances for analytical work, while still | |
representing as much as possible as we understand of the underlying data. | |
- All tables have defined primary keys and appropriate foreign key constraints | |
that will be enforced by the database | |
- Selective denormalisation is applied on most of the 'narrow' tables, to | |
allow for certain operations to be performed without a join, or to allow | |
indexes to support some anticipated common operations. The columns added | |
(where appropriate) are: user_id, created_at, to support trend and user | |
counts respectively. | |
- A set of indexes is suggested to support analytical queries | |
- The directly collected table is intended to capture the case when a tweet is | |
directly retrieved as part of an operation, so as to differentiate direct | |
vs referential capture. It is modelled as an append only table, that can | |
be joined with the main tweet table to construct the differentiation and to | |
simplify the insert logic. | |
Examples of anticipated queries and purposes are shown along with each query, | |
and some more extended examples below. | |
*/ | |
create table if not exists user_at_time ( | |
user_id integer, | |
retrieved_at datetime, | |
name text, | |
profile_image_url text, | |
created_at text, | |
protected text, | |
description text, | |
location text, | |
pinned_tweet_id integer references tweet(tweet_id), | |
verified integer, -- boolean | |
url text, | |
username text, | |
followers_count integer, | |
following_count integer, | |
tweet_count integer, | |
listed_count integer, | |
primary key (user_id, retrieved_at) | |
); | |
create table if not exists tweet ( | |
/* | |
Contains the 'immutable' characteristics of a tweet, such as the text, | |
structure of the conversation, the user ID and so on. | |
Mutable characteristics of a tweet are captured in tweet metrics, to | |
allow natural capture of change over time without complicating the | |
main tweet table. Other aspects of tweets such as hashtags and | |
mentions are treated as insert only tables of attributes to handle | |
change over time. | |
Examples: | |
-- Count the number of tweets | |
select count(*) from tweet; | |
-- Select tweets that were 'directly' collected | |
select count(*) | |
from tweet | |
inner join directly_collected_tweet using(tweet_id); | |
-- Find the number of tweets and number of unique users | |
-- by month that were directly collected | |
select | |
datetime(created_at, 'start of month') as month, | |
count(distinct user_id) as active_users, | |
count(*) as tweets | |
from tweet | |
inner join directly_collected_tweet using(tweet_id); | |
-- Find the ten largest threads by number of tweets | |
select | |
conversation_id, | |
count(*) as tweet_count | |
from tweet | |
group by conversation_id | |
order by tweet_count desc limit 10; | |
*/ | |
tweet_id integer primary key, | |
reply_settings text, | |
conversation_id text, | |
created_at text, | |
retweeted_tweet_id text references tweet (id), | |
quoted_tweet_id text references tweet (id), | |
replied_to_tweet_id text references tweet (id), | |
in_reply_to_user_id text references user (id), | |
user_id integer references user_at_time(user_id), | |
text text, | |
lang text, | |
source text, | |
possibly_sensitive integer, -- boolean | |
); | |
create table if not exists directly_collected_tweet( | |
/* | |
Records the IDs of tweets that were 'directly' collected as part of a collection. | |
This allows distinguishing between a tweet that was collected by matching an | |
API call such as a search or user timeline, from a tweet that was only quoted or | |
retweeted, and doesn't actually match the direct operator. | |
Examples: | |
-- The number of directly collected tweets in this collection | |
select | |
count(*) | |
from directly_collected; | |
*/ | |
tweet_id integer primary key references tweet(id) | |
); | |
create table if not exists tweet_metric ( | |
/* | |
Metrics for tweets at the time they were collected - this is most useful | |
for longitudinal collections, or when inserting rehydrated data to | |
capture engagement dynamics. This may not be useful for some bulk or | |
historical collections, where the time of collection is more or less | |
the same across the collection. | |
Examples: | |
-- Select most recent like counts for tweets - this relies | |
-- on documented, but quirky, SQLite behaviour | |
select | |
tweet_id, | |
like_count, | |
max(retrieved_at) as latest_retrieved | |
from tweet_metric | |
group by tweet_id | |
-- Select metric trends for a specific tweet | |
select | |
tweet_id, | |
retrieved_at, | |
quote_count, | |
retweet_count | |
from tweet | |
where tweet_id = 111111111 | |
order by retrieved_at | |
*/ | |
tweet_id integer references tweet(id), | |
retrieved_at datetime, | |
created_at datetime, | |
user_id integer, | |
like_count integer, | |
quote_count integer, | |
reply_count integer, | |
retweet_count integer, | |
primary key (tweet_id, retrieved_at), | |
foreign key (user_id, retrieved_at) references user_at_time | |
); | |
create table if not exists tweet_url ( | |
/* | |
Share tweets associated with URLs. This table include some media | |
objects as well, because Twitter includes them in both the URLs | |
and the attachments part of the data payload. | |
Examples: | |
-- Find the top ten most common urls shared in all tweets | |
select | |
expanded_url, | |
count(*) as tweet_count | |
from tweet_url | |
group by expanded_url | |
order by tweet_count desc | |
limit 10; | |
-- Find the top 10 most retweeted domains, from the directly collected | |
-- portion of the collection | |
select | |
expanded_domain, | |
count(*) as tweet_count | |
from tweet_url | |
inner join directly_collected_tweet using(tweet_id) | |
inner join tweet using(tweet_id) | |
where retweeted_tweet_id is not null | |
group by expanded_domain | |
order by tweet_count desc | |
limit 10; | |
*/ | |
tweet_id integer primary key references tweet(id), | |
url text not null, -- t.co shortened URL | |
expanded_url text, | |
display_url text, | |
-- Extract the expanded domain alone for easy querying/quantification | |
expanded_domain text, | |
created_at datetime, | |
user_id integer | |
); | |
create table if not exists user_profile_url ( | |
user_id integer, | |
retrieved_at datetime, | |
-- This is either description or URL. | |
field text, | |
url text not null, -- t.co shortened URL | |
expanded_url text, | |
expanded_domain text, | |
display_url text, | |
primary key (user_id, retrieved_at, field) | |
foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at) | |
); | |
create table if not exists tweet_hashtag ( | |
/* | |
Hashtags in tweets, *as collected*, but also a normalised version of the | |
hashtag to represent platform affordances. Twitter considers #SuperBowl | |
and #SuperbOwl to be the same for the purposes of search. This table | |
allows for both kinds of queries, one matching the hashtag as written, | |
the other as interpreted by Twitter. | |
Examples: | |
-- Count the number of tweets containing a hashtag and users | |
-- using a hashtag. | |
select | |
hashtag, | |
count(*) as tweet_count, | |
count(distinct user_id) | |
from tweet_hashtag | |
inner join directly_collected using(tweet_id) | |
group by hashtag; | |
-- Show the normalised hashtag use for January 2021 | |
select | |
hashtag_normalised, | |
count(*) | |
from tweet_hashtag | |
where created_at between '2021-01-01 00:00:00' and '2021-02-01 00:00:00' | |
group by hashtag_normalised; | |
*/ | |
tweet_id integer references tweet(id), | |
user_id integer, | |
created_at datetime, | |
hashtag text not null, | |
hashtag_normalised not null, | |
primary key (tweet_id, hashtag) | |
); | |
create table if not exists user_profile_hashtag ( | |
user_id integer, | |
retrieved_at datetime, | |
hashtag text not null, | |
hashtag_normalised not null, | |
primary key (user_id, retrieved_at, hashtag) | |
foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at) | |
); | |
create table if not exists tweet_mention ( | |
/* | |
User - user mentions | |
Examples: | |
-- Find the largest (directed) pairs of user mentions | |
select | |
user_id, | |
mentioned_user_id, | |
count(*) as tweet_count | |
from tweet_mention | |
group by user_id, mentioned_user_id | |
*/ | |
tweet_id integer references tweet(id), | |
user_id integer, | |
username text, | |
created_at datetime, | |
mentioned_user_id integer references user_at_time(id), | |
mentioned_username text, | |
primary key (tweet_id, mentioned_user_id) | |
); | |
create table if not exists user_profile_mention ( | |
user_id integer, | |
retrieved_at datetime, | |
hashtag text not null, | |
hashtag_normalised not null, | |
primary key (user_id, retrieved_at, hashtag) | |
foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at) | |
); | |
create table if not exists media ( | |
media_key text, | |
url text, | |
preview_image_url text, | |
height integer, | |
width integer, | |
type text, | |
duration_ms integer, | |
view_count integer, | |
alt_text text, | |
); | |
create table if not exists media_metric ( | |
media_key text references media(media_key), | |
retrieved_at datetime, | |
view_count integer, | |
primary key (media_key, retrieved_at) | |
); | |
create table if not exists tweet_media ( | |
tweet_id integer references tweet, | |
media_key text references media, | |
user_id integer, | |
created_at datetime, | |
primary key (tweet_id, media_key) | |
); | |
create table if not exists _metadata ( | |
metadata_key text primary key, | |
metadata_value | |
); | |
/* | |
Suggested supporting indexes, for each table to support some anticipated queries. | |
These can be built in a batch at the end of the first full run of inserting | |
data. | |
*/ | |
-- Main tweet table | |
create index if not exists tweet_period on tweet(created_at); | |
create index if not exists tweet_conversation on tweet(conversation_id); | |
-- | |
create index if not exists url_period on tweet_url(created_at); | |
create index if not exists url_user on tweet_url(user_id); | |
create index if not exists url_url on tweet_url(expanded_url); | |
create index if not exists url_domain on tweet_url(expanded_domain); | |
-- | |
create index if not exists hashtag_tweet on tweet_hashtag(hashtag); | |
create index if not exists hashtag_normalised_tweet on tweet_hashtag(hashtag_normalised); | |
create index if not exists hashtag_period on tweet_hashtag(created_at); | |
-- | |
create index if not exists user_mentioning on tweet_mention(user_id, mentioned_user_id); | |
create index if not exists user_mentioned on tweet_mention(mentioned_user_id, user_id); | |
-- Views or materialised views - this could alternatively be a materialised view | |
-- built by triggers or a query at the end of an insert process. | |
create or replace view user_latest as | |
select max(retrieved_at), * | |
from user_at_time uat | |
group by user_id; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment