SamHames/schema.sql Secret

## schema.sql
/*

Design notes:

- This schema is an attempt to provide affordances for analytical work, while still
  representing as much as possible as we understand of the underlying data.
- All tables have defined primary keys and appropriate foreign key constraints
  that will be enforced by the database
- Selective denormalisation is applied on most of the 'narrow' tables, to
  allow for certain operations to be performed without a join, or to allow
  indexes to support some anticipated common operations. The columns added
  (where appropriate) are: user_id, created_at, to support trend and user
  counts respectively.
- A set of indexes is suggested to support analytical queries
- The directly collected table is intended to capture the case when a tweet is
  directly retrieved as part of an operation, so as to differentiate direct
  vs referential capture. It is modelled as an append only table, that can
  be joined with the main tweet table to construct the differentiation and to
  simplify the insert logic.

Examples of anticipated queries and purposes are shown along with each query,
and some more extended examples below.

*/

create table if not exists user_at_time (
    user_id integer,
    retrieved_at datetime,
    name text,
    profile_image_url text,
    created_at text,
    protected text,
    description text,
    location text,
    pinned_tweet_id integer references tweet(tweet_id),
    verified integer, -- boolean
    url text,
    username text,
    followers_count integer,
    following_count integer,
    tweet_count integer,
    listed_count integer,
    primary key (user_id, retrieved_at)
);

create table if not exists tweet (
    /*
    Contains the 'immutable' characteristics of a tweet, such as the text,
    structure of the conversation, the user ID and so on.

    Mutable characteristics of a tweet are captured in tweet metrics, to
    allow natural capture of change over time without complicating the
    main tweet table. Other aspects of tweets such as hashtags and
    mentions are treated as insert only tables of attributes to handle
    change over time.

    Examples:

    -- Count the number of tweets
    select count(*) from tweet;

    -- Select tweets that were 'directly' collected
    select count(*)
    from tweet
    inner join directly_collected_tweet using(tweet_id);

    -- Find the number of tweets and number of unique users
    -- by month that were directly collected
    select
        datetime(created_at, 'start of month') as month,
        count(distinct user_id) as active_users,
        count(*) as tweets
    from tweet
    inner join directly_collected_tweet using(tweet_id);

    -- Find the ten largest threads by number of tweets
    select
        conversation_id,
        count(*) as tweet_count
    from tweet
    group by conversation_id
    order by tweet_count desc limit 10;

    */
    tweet_id integer primary key,
    reply_settings text,
    conversation_id text,
    created_at text,
    retweeted_tweet_id text references tweet (id),
    quoted_tweet_id text references tweet (id),
    replied_to_tweet_id text references tweet (id),
    in_reply_to_user_id text references user (id),
    user_id integer references user_at_time(user_id),
    text text,
    lang text,
    source text,
    possibly_sensitive integer, -- boolean
);

create table if not exists directly_collected_tweet(
    /*
    Records the IDs of tweets that were 'directly' collected as part of a collection.

    This allows distinguishing between a tweet that was collected by matching an
    API call such as a search or user timeline, from a tweet that was only quoted or
    retweeted, and doesn't actually match the direct operator.

    Examples:

    -- The number of directly collected tweets in this collection
    select
        count(*)
    from directly_collected;

    */
    tweet_id integer primary key references tweet(id)
);

create table if not exists tweet_metric (
    /*
    Metrics for tweets at the time they were collected - this is most useful
    for longitudinal collections, or when inserting rehydrated data to
    capture engagement dynamics. This may not be useful for some bulk or
    historical collections, where the time of collection is more or less
    the same across the collection.

    Examples:

    -- Select most recent like counts for tweets - this relies
    -- on documented, but quirky, SQLite behaviour
    select
        tweet_id,
        like_count,
        max(retrieved_at) as latest_retrieved
    from tweet_metric
    group by tweet_id

    -- Select metric trends for a specific tweet
    select
        tweet_id,
        retrieved_at,
        quote_count,
        retweet_count
    from tweet
    where tweet_id = 111111111
    order by retrieved_at

    */
    tweet_id integer references tweet(id),
    retrieved_at datetime,
    created_at datetime,
    user_id integer,
    like_count integer,
    quote_count integer,
    reply_count integer,
    retweet_count integer,
    primary key (tweet_id, retrieved_at),
    foreign key (user_id, retrieved_at) references user_at_time
);

create table if not exists tweet_url (
    /*
    Share tweets associated with URLs. This table include some media
    objects as well, because Twitter includes them in both the URLs
    and the attachments part of the data payload.

    Examples:

    -- Find the top ten most common urls shared in all tweets
    select
        expanded_url,
        count(*) as tweet_count
    from tweet_url
    group by expanded_url
    order by tweet_count desc
    limit 10;

    -- Find the top 10 most retweeted domains, from the directly collected
    -- portion of the collection
    select
        expanded_domain,
        count(*) as tweet_count
    from tweet_url
    inner join directly_collected_tweet using(tweet_id)
    inner join tweet using(tweet_id)
    where retweeted_tweet_id is not null
    group by expanded_domain
    order by tweet_count desc
    limit 10;

    */
    tweet_id integer primary key references tweet(id),
    url text not null, -- t.co shortened URL
    expanded_url text,
    display_url text,
    -- Extract the expanded domain alone for easy querying/quantification
    expanded_domain text,
    created_at datetime,
    user_id integer
);

create table if not exists user_profile_url (
    user_id integer,
    retrieved_at datetime,
    -- This is either description or URL.
    field text,
    url text not null, -- t.co shortened URL
    expanded_url text,
    expanded_domain text,
    display_url text,
    primary key (user_id, retrieved_at, field)
    foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at)
);

create table if not exists tweet_hashtag (
    /*
    Hashtags in tweets, *as collected*, but also a normalised version of the
    hashtag to represent platform affordances. Twitter considers #SuperBowl
    and #SuperbOwl to be the same for the purposes of search. This table
    allows for both kinds of queries, one matching the hashtag as written,
    the other as interpreted by Twitter.

    Examples:

    -- Count the number of tweets containing a hashtag and users
    -- using a hashtag.
    select
        hashtag,
        count(*) as tweet_count,
        count(distinct user_id)
    from tweet_hashtag
    inner join directly_collected using(tweet_id)
    group by hashtag;

    -- Show the normalised hashtag use for January 2021
    select
        hashtag_normalised,
        count(*)
    from tweet_hashtag
    where created_at between '2021-01-01 00:00:00' and '2021-02-01 00:00:00'
    group by hashtag_normalised;


    */
    tweet_id integer references tweet(id),
    user_id integer,
    created_at datetime,
    hashtag text not null,
    hashtag_normalised not null,
    primary key (tweet_id, hashtag)
);

create table if not exists user_profile_hashtag (
    user_id integer,
    retrieved_at datetime,
    hashtag text not null,
    hashtag_normalised not null,
    primary key (user_id, retrieved_at, hashtag)
    foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at)
);

create table if not exists tweet_mention (
    /*
    User - user mentions

    Examples:

    -- Find the largest (directed) pairs of user mentions
    select
        user_id,
        mentioned_user_id,
        count(*) as tweet_count
    from tweet_mention
    group by user_id, mentioned_user_id

    */
    tweet_id integer references tweet(id),
    user_id integer,
    username text,
    created_at datetime,
    mentioned_user_id integer references user_at_time(id),
    mentioned_username text,
    primary key (tweet_id, mentioned_user_id)
);

create table if not exists user_profile_mention (
    user_id integer,
    retrieved_at datetime,
    hashtag text not null,
    hashtag_normalised not null,
    primary key (user_id, retrieved_at, hashtag)
    foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at)
);


create table if not exists media (
    media_key text,
    url text,
    preview_image_url text,
    height integer,
    width integer,
    type text,
    duration_ms integer,
    view_count integer,
    alt_text text,
);


create table if not exists media_metric (
    media_key text references media(media_key),
    retrieved_at datetime,
    view_count integer,
    primary key (media_key, retrieved_at)
);


create table if not exists tweet_media (
    tweet_id integer references tweet,
    media_key text references media,
    user_id integer,
    created_at datetime,
    primary key (tweet_id, media_key)
);


create table if not exists _metadata (
    metadata_key text primary key,
    metadata_value
);


/*
Suggested supporting indexes, for each table to support some anticipated queries.

These can be built in a batch at the end of the first full run of inserting
data.

*/


-- Main tweet table
create index if not exists tweet_period on tweet(created_at);
create index if not exists tweet_conversation on tweet(conversation_id);

--
create index if not exists url_period on tweet_url(created_at);
create index if not exists url_user on tweet_url(user_id);
create index if not exists url_url on tweet_url(expanded_url);
create index if not exists url_domain on tweet_url(expanded_domain);

--
create index if not exists hashtag_tweet on tweet_hashtag(hashtag);
create index if not exists hashtag_normalised_tweet on tweet_hashtag(hashtag_normalised);
create index if not exists hashtag_period on tweet_hashtag(created_at);

--
create index if not exists user_mentioning on tweet_mention(user_id, mentioned_user_id);
create index if not exists user_mentioned on tweet_mention(mentioned_user_id, user_id);


-- Views or materialised views - this could alternatively be a materialised view
-- built by triggers or a query at the end of an insert process.
create or replace view user_latest as
    select max(retrieved_at), *
    from user_at_time uat
    group by user_id;
	/*

	Design notes:

	- This schema is an attempt to provide affordances for analytical work, while still
	representing as much as possible as we understand of the underlying data.
	- All tables have defined primary keys and appropriate foreign key constraints
	that will be enforced by the database
	- Selective denormalisation is applied on most of the 'narrow' tables, to
	allow for certain operations to be performed without a join, or to allow
	indexes to support some anticipated common operations. The columns added
	(where appropriate) are: user_id, created_at, to support trend and user
	counts respectively.
	- A set of indexes is suggested to support analytical queries
	- The directly collected table is intended to capture the case when a tweet is
	directly retrieved as part of an operation, so as to differentiate direct
	vs referential capture. It is modelled as an append only table, that can
	be joined with the main tweet table to construct the differentiation and to
	simplify the insert logic.

	Examples of anticipated queries and purposes are shown along with each query,
	and some more extended examples below.

	*/

	create table if not exists user_at_time (
	user_id integer,
	retrieved_at datetime,
	name text,
	profile_image_url text,
	created_at text,
	protected text,
	description text,
	location text,
	pinned_tweet_id integer references tweet(tweet_id),
	verified integer, -- boolean
	url text,
	username text,
	followers_count integer,
	following_count integer,
	tweet_count integer,
	listed_count integer,
	primary key (user_id, retrieved_at)
	);

	create table if not exists tweet (
	/*
	Contains the 'immutable' characteristics of a tweet, such as the text,
	structure of the conversation, the user ID and so on.

	Mutable characteristics of a tweet are captured in tweet metrics, to
	allow natural capture of change over time without complicating the
	main tweet table. Other aspects of tweets such as hashtags and
	mentions are treated as insert only tables of attributes to handle
	change over time.

	Examples:

	-- Count the number of tweets
	select count(*) from tweet;

	-- Select tweets that were 'directly' collected
	select count(*)
	from tweet
	inner join directly_collected_tweet using(tweet_id);

	-- Find the number of tweets and number of unique users
	-- by month that were directly collected
	select
	datetime(created_at, 'start of month') as month,
	count(distinct user_id) as active_users,
	count(*) as tweets
	from tweet
	inner join directly_collected_tweet using(tweet_id);

	-- Find the ten largest threads by number of tweets
	select
	conversation_id,
	count(*) as tweet_count
	from tweet
	group by conversation_id
	order by tweet_count desc limit 10;

	*/
	tweet_id integer primary key,
	reply_settings text,
	conversation_id text,
	created_at text,
	retweeted_tweet_id text references tweet (id),
	quoted_tweet_id text references tweet (id),
	replied_to_tweet_id text references tweet (id),
	in_reply_to_user_id text references user (id),
	user_id integer references user_at_time(user_id),
	text text,
	lang text,
	source text,
	possibly_sensitive integer, -- boolean
	);

	create table if not exists directly_collected_tweet(
	/*
	Records the IDs of tweets that were 'directly' collected as part of a collection.

	This allows distinguishing between a tweet that was collected by matching an
	API call such as a search or user timeline, from a tweet that was only quoted or
	retweeted, and doesn't actually match the direct operator.

	Examples:

	-- The number of directly collected tweets in this collection
	select
	count(*)
	from directly_collected;

	*/
	tweet_id integer primary key references tweet(id)
	);

	create table if not exists tweet_metric (
	/*
	Metrics for tweets at the time they were collected - this is most useful
	for longitudinal collections, or when inserting rehydrated data to
	capture engagement dynamics. This may not be useful for some bulk or
	historical collections, where the time of collection is more or less
	the same across the collection.

	Examples:

	-- Select most recent like counts for tweets - this relies
	-- on documented, but quirky, SQLite behaviour
	select
	tweet_id,
	like_count,
	max(retrieved_at) as latest_retrieved
	from tweet_metric
	group by tweet_id

	-- Select metric trends for a specific tweet
	select
	tweet_id,
	retrieved_at,
	quote_count,
	retweet_count
	from tweet
	where tweet_id = 111111111
	order by retrieved_at

	*/
	tweet_id integer references tweet(id),
	retrieved_at datetime,
	created_at datetime,
	user_id integer,
	like_count integer,
	quote_count integer,
	reply_count integer,
	retweet_count integer,
	primary key (tweet_id, retrieved_at),
	foreign key (user_id, retrieved_at) references user_at_time
	);

	create table if not exists tweet_url (
	/*
	Share tweets associated with URLs. This table include some media
	objects as well, because Twitter includes them in both the URLs
	and the attachments part of the data payload.

	Examples:

	-- Find the top ten most common urls shared in all tweets
	select
	expanded_url,
	count(*) as tweet_count
	from tweet_url
	group by expanded_url
	order by tweet_count desc
	limit 10;

	-- Find the top 10 most retweeted domains, from the directly collected
	-- portion of the collection
	select
	expanded_domain,
	count(*) as tweet_count
	from tweet_url
	inner join directly_collected_tweet using(tweet_id)
	inner join tweet using(tweet_id)
	where retweeted_tweet_id is not null
	group by expanded_domain
	order by tweet_count desc
	limit 10;

	*/
	tweet_id integer primary key references tweet(id),
	url text not null, -- t.co shortened URL
	expanded_url text,
	display_url text,
	-- Extract the expanded domain alone for easy querying/quantification
	expanded_domain text,
	created_at datetime,
	user_id integer
	);

	create table if not exists user_profile_url (
	user_id integer,
	retrieved_at datetime,
	-- This is either description or URL.
	field text,
	url text not null, -- t.co shortened URL
	expanded_url text,
	expanded_domain text,
	display_url text,
	primary key (user_id, retrieved_at, field)
	foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at)
	);

	create table if not exists tweet_hashtag (
	/*
	Hashtags in tweets, as collected, but also a normalised version of the
	hashtag to represent platform affordances. Twitter considers #SuperBowl
	and #SuperbOwl to be the same for the purposes of search. This table
	allows for both kinds of queries, one matching the hashtag as written,
	the other as interpreted by Twitter.

	Examples:

	-- Count the number of tweets containing a hashtag and users
	-- using a hashtag.
	select
	hashtag,
	count(*) as tweet_count,
	count(distinct user_id)
	from tweet_hashtag
	inner join directly_collected using(tweet_id)
	group by hashtag;

	-- Show the normalised hashtag use for January 2021
	select
	hashtag_normalised,
	count(*)
	from tweet_hashtag
	where created_at between '2021-01-01 00:00:00' and '2021-02-01 00:00:00'
	group by hashtag_normalised;


	*/
	tweet_id integer references tweet(id),
	user_id integer,
	created_at datetime,
	hashtag text not null,
	hashtag_normalised not null,
	primary key (tweet_id, hashtag)
	);

	create table if not exists user_profile_hashtag (
	user_id integer,
	retrieved_at datetime,
	hashtag text not null,
	hashtag_normalised not null,
	primary key (user_id, retrieved_at, hashtag)
	foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at)
	);

	create table if not exists tweet_mention (
	/*
	User - user mentions

	Examples:

	-- Find the largest (directed) pairs of user mentions
	select
	user_id,
	mentioned_user_id,
	count(*) as tweet_count
	from tweet_mention
	group by user_id, mentioned_user_id

	*/
	tweet_id integer references tweet(id),
	user_id integer,
	username text,
	created_at datetime,
	mentioned_user_id integer references user_at_time(id),
	mentioned_username text,
	primary key (tweet_id, mentioned_user_id)
	);

	create table if not exists user_profile_mention (
	user_id integer,
	retrieved_at datetime,
	hashtag text not null,
	hashtag_normalised not null,
	primary key (user_id, retrieved_at, hashtag)
	foreign key (user_id, retrieved_at) references user_at_time (id, retrieved_at)
	);


	create table if not exists media (
	media_key text,
	url text,
	preview_image_url text,
	height integer,
	width integer,
	type text,
	duration_ms integer,
	view_count integer,
	alt_text text,
	);


	create table if not exists media_metric (
	media_key text references media(media_key),
	retrieved_at datetime,
	view_count integer,
	primary key (media_key, retrieved_at)
	);


	create table if not exists tweet_media (
	tweet_id integer references tweet,
	media_key text references media,
	user_id integer,
	created_at datetime,
	primary key (tweet_id, media_key)
	);


	create table if not exists _metadata (
	metadata_key text primary key,
	metadata_value
	);


	/*
	Suggested supporting indexes, for each table to support some anticipated queries.

	These can be built in a batch at the end of the first full run of inserting
	data.

	*/


	-- Main tweet table
	create index if not exists tweet_period on tweet(created_at);
	create index if not exists tweet_conversation on tweet(conversation_id);

	--
	create index if not exists url_period on tweet_url(created_at);
	create index if not exists url_user on tweet_url(user_id);
	create index if not exists url_url on tweet_url(expanded_url);
	create index if not exists url_domain on tweet_url(expanded_domain);

	--
	create index if not exists hashtag_tweet on tweet_hashtag(hashtag);
	create index if not exists hashtag_normalised_tweet on tweet_hashtag(hashtag_normalised);
	create index if not exists hashtag_period on tweet_hashtag(created_at);

	--
	create index if not exists user_mentioning on tweet_mention(user_id, mentioned_user_id);
	create index if not exists user_mentioned on tweet_mention(mentioned_user_id, user_id);


	-- Views or materialised views - this could alternatively be a materialised view
	-- built by triggers or a query at the end of an insert process.
	create or replace view user_latest as
	select max(retrieved_at), *
	from user_at_time uat
	group by user_id;