kalyantm/doctype-db.sql

## doctype-db.sql
/*
2)  Write the DDL for creating the tables for a MariaDB database with users, tweets and followers.

A user has a username, password, email, first name and last name.
A tweet has text, a publication date and is posted by a user.
A user can be followed by other users.

*/

CREATE TABLE users (
  user_id INT AUTO_INCREMENT PRIMARY KEY,
  username varchar(32) UNIQUE,
  first_name varchar(64) NOT NULL,
  last_name varchar(64),
  email varchar(64) NOT NULL UNIQUE,
  password varchar(256) NOT NULL
);

CREATE TABLE tweets (
  id INT AUTO_INCREMENT PRIMARY KEY,
  text varchar(280) NOT NULL,
  published_date TIMESTAMP,
  user_id int,
  FOREIGN KEY (user_id) REFERENCES users(user_id) ON DELETE SET NULL
);


CREATE TABLE relationship (
  user_id int NOT NULL,
  follows_id int,
  PRIMARY KEY (user_id, follows_id),
  FOREIGN KEY (user_id) REFERENCES users(user_id) ON DELETE CASCADE,
  FOREIGN KEY (follows_id) REFERENCES users(user_id) ON DELETE CASCADE
);


/*

2.1) Write a SQL query that returns the 30 latest tweets by users followed by the user with username "Mark". The result must include username, first name, last name, tweet text and publication date.

// The following query fetches the required data from the 3 tables, by performing a join and orders the latest 30 tweets
// by timestamp

*/

SELECT users.username, users.first_name, users.last_name, tweets.text, tweets.published_date
FROM tweets
JOIN users ON tweets.user_id = users.user_id
WHERE tweets.user_id in (
  SELECT follows_id FROM relationship where relationship.user_id = ( SELECT user_id from users where users.username = 'Mark')
)
ORDER BY published_date DESC
LIMIT 0,30;

/*
2.2) One minute later, Mark is scrolling down the page to load 30 more tweets. What would the SQL query look like to fetch the next 30 tweets?

Let us assume that the client keeps track of the scrolling and gives us an offset, 'n', which will contain which "page" the user is on. (start with page 1).
For example, if the user scrolls past 30 tweets, user is offset by a page, and we have to fetch results
for page 2. A query for the same will look like:

*/

SELECT users.username, users.first_name, users.last_name, tweets.text, tweets.published_date
FROM tweets
JOIN users
ON tweets.user_id = users.user_id
AND tweets.published_date < max_tweet_timestamp
WHERE tweets.user_id in (
  SELECT follows_id FROM relationship where relationship.user_id = ( SELECT user_id from users where users.username = 'Mark')
)
ORDER BY published_date DESC
LIMIT 30;

/*
Here rather than passing offset like we did the last time, we need to pass something a little more stable to maintain the timeline
In this case, the timestamp of the tweet. Basically,we keep the first query as is and when we return the results of the query, we store the timestamp of the
last tweet in the response as max_tweet_timestamp. And the following query, we pass it on so that we fetch the tweets only
before that timestamp.
There is one caveat to this approach: what if two tweets are tweeted at the EXACT same time, upto the second? In that case, we can either:
1. Ignore this edge case as it leaves only one duplicate tweet once in a while when this event occurs.
2. Optimize for this condition: by subracting a small amount of time from the timestamp each time before we query to send in max_tweet_timestamp:
  This way we ensure we have a filter (which might not exactly correspond to any tweet time)
*/
	/*
	2) Write the DDL for creating the tables for a MariaDB database with users, tweets and followers.

	A user has a username, password, email, first name and last name.
	A tweet has text, a publication date and is posted by a user.
	A user can be followed by other users.

	*/

	CREATE TABLE users (
	user_id INT AUTO_INCREMENT PRIMARY KEY,
	username varchar(32) UNIQUE,
	first_name varchar(64) NOT NULL,
	last_name varchar(64),
	email varchar(64) NOT NULL UNIQUE,
	password varchar(256) NOT NULL
	);

	CREATE TABLE tweets (
	id INT AUTO_INCREMENT PRIMARY KEY,
	text varchar(280) NOT NULL,
	published_date TIMESTAMP,
	user_id int,
	FOREIGN KEY (user_id) REFERENCES users(user_id) ON DELETE SET NULL
	);


	CREATE TABLE relationship (
	user_id int NOT NULL,
	follows_id int,
	PRIMARY KEY (user_id, follows_id),
	FOREIGN KEY (user_id) REFERENCES users(user_id) ON DELETE CASCADE,
	FOREIGN KEY (follows_id) REFERENCES users(user_id) ON DELETE CASCADE
	);


	/*

	2.1) Write a SQL query that returns the 30 latest tweets by users followed by the user with username "Mark". The result must include username, first name, last name, tweet text and publication date.

	// The following query fetches the required data from the 3 tables, by performing a join and orders the latest 30 tweets
	// by timestamp

	*/

	SELECT users.username, users.first_name, users.last_name, tweets.text, tweets.published_date
	FROM tweets
	JOIN users ON tweets.user_id = users.user_id
	WHERE tweets.user_id in (
	SELECT follows_id FROM relationship where relationship.user_id = ( SELECT user_id from users where users.username = 'Mark')
	)
	ORDER BY published_date DESC
	LIMIT 0,30;

	/*
	2.2) One minute later, Mark is scrolling down the page to load 30 more tweets. What would the SQL query look like to fetch the next 30 tweets?

	Let us assume that the client keeps track of the scrolling and gives us an offset, 'n', which will contain which "page" the user is on. (start with page 1).
	For example, if the user scrolls past 30 tweets, user is offset by a page, and we have to fetch results
	for page 2. A query for the same will look like:

	*/

	SELECT users.username, users.first_name, users.last_name, tweets.text, tweets.published_date
	FROM tweets
	JOIN users
	ON tweets.user_id = users.user_id
	AND tweets.published_date < max_tweet_timestamp
	WHERE tweets.user_id in (
	SELECT follows_id FROM relationship where relationship.user_id = ( SELECT user_id from users where users.username = 'Mark')
	)
	ORDER BY published_date DESC
	LIMIT 30;

	/*
	Here rather than passing offset like we did the last time, we need to pass something a little more stable to maintain the timeline
	In this case, the timestamp of the tweet. Basically,we keep the first query as is and when we return the results of the query, we store the timestamp of the
	last tweet in the response as max_tweet_timestamp. And the following query, we pass it on so that we fetch the tweets only
	before that timestamp.
	There is one caveat to this approach: what if two tweets are tweeted at the EXACT same time, upto the second? In that case, we can either:
	1. Ignore this edge case as it leaves only one duplicate tweet once in a while when this event occurs.
	2. Optimize for this condition: by subracting a small amount of time from the timestamp each time before we query to send in max_tweet_timestamp:
	This way we ensure we have a filter (which might not exactly correspond to any tweet time)
	*/