Skip to content

Instantly share code, notes, and snippets.

@risavkarna
Created September 29, 2012 23:19
Show Gist options
  • Save risavkarna/3805409 to your computer and use it in GitHub Desktop.
Save risavkarna/3805409 to your computer and use it in GitHub Desktop.
SET SCHEMA CHALLENGE3;
DROP TYPE UNKNOWN_TWEET_T;
CREATE TYPE UNKNOWN_TWEET_T AS TABLE
(
"ID" BIGINT NOT NULL, -- Unique identifier for this Tweet
"TEXT" nvarchar(800) NOT NULL, -- The actual text of the status update
"TRUNCATED" TINYINT NOT NULL default 0, -- (1 = True, 0 = False) Indicates whether the text was truncated, for example, as a result of a retweet exceeding the 140 character Tweet length. Truncated text will end in ellipsis, like this ...
"SOURCE" nvarchar(255), -- Utility used to post the Tweet, as an HTML-formatted string. (e.g. 'web')
"CREATED_AT" DATETIME not null, -- UTC time when this Tweet was created.
"IN_REPLY_TO_STATUS_ID" BIGINT, -- If the represented Tweet is a reply, ID of the original Tweet
"IN_REPLY_TO_USER_ID" BIGINT, -- If the represented Tweet is a reply, ID of the original Tweet's author.
--"GEO_LONGITUDE" FLOAT, -- Longitude, Geographic Location
--"GEO_LATITUDE" FLOAT, -- Latitude, Geographics Location
--"ORIGINAL_TWEET_ID" BIGINT, -- if this retweet contains the ID of the original tweet, which is also stored...
-- tweet stats
--"EMBEDDED" TINYINT, -- (1 = True, 0 = False) Indicates whether the tweet was embedded in a retweet
"RETWEET_COUNT" Integer, -- Number of times this Tweet has been retweeted.
-- user
"USER_ID" BIGINT NOT NULL, -- Unique identifier for this User
"USER_NAME" nvarchar(40), -- The name of the user, as they've defined it. Not necessarily a person's name.
"USER_SCREEN_NAME" nvarchar(20), -- The screen name, handle, or alias that this user identifies themselves with. screen_names are unique but subject to change.
"USER_CREATED_AT" DATETIME not null, -- The UTC datetime that the user account was created on Twitter.
"USER_LANG" nvarchar(8), -- The ISO 639-1 two-letter character code for the user's self-declared user interface language.
"USER_LOCATION" nvarchar(250), -- The user-defined location for this account's profile.
"USER_DESCRIPTION" nvarchar(500), -- The user-defined string describing their account.
"USER_PROFILE_IMAGE" nvarchar(500), -- A HTTP-based URL pointing to the user's avatar image.
"USER_TIME_ZONE" nvarchar(80), -- A string describing the Time Zone this user declares themselves within, e.g. "Pacific Time (US & Canada)"
"USER_UTC_OFFSET" integer, -- The offset from GMT/UTC in seconds.
"USER_URL" nvarchar(200), -- A URL provided by the user in association with their profile.
-- user stats
"USER_STATUSES_COUNT" Integer, -- The number of tweets (including retweets) issued by the user.
"USER_FRIENDS_COUNT" Integer, -- The number of users this account is following (AKA their "followings")
"USER_FOLLOWERS_COUNT" Integer, -- The number of followers this account currently has.
"USER_FAVOURITES_COUNT" Integer, -- The number of tweets this user has favorited in the account's lifetime.
"USER_LISTED_COUNT" Integer -- The number of public lists that this user is a member of.
-- place
--"PLACE_ID" nvarchar(255), -- ID representing this place. Note that this is represented as a string, not an integer.
--"PLACE_NAME" nvarchar(255), -- Short human-readable representation of the place's name.
--"PLACE_FULL_NAME" nvarchar(255), -- Full human-readable representation of the place's name.
--"PLACE_TYPE" nvarchar(20), -- The type of location represented by this place. (e.g. 'city')
--"PLACE_COUNTRY" nvarchar(50), -- Name of the country containing this place.
--"PLACE_COUNTRY_CODE" nvarchar(8), -- Shortened country code representing the country containing this place.
--"PLACE_URL" nvarchar(500) -- e.g. "http://api.twitter.com/1/geo/id/6565298bcadb82a1.json"
);
--hashtag code
drop type "TEXTS_T";
create type "TEXTS_T" AS TABLE(
"TWEET_ID" BIGINT, -- Tweet ID (ForeinKey)
"TEXT" nvarchar(800) -- A word from the tweet
);
drop procedure strip_first_words;
create procedure strip_first_words
(
in texts TEXTS_T,
out words TEXTS_T,
out rest TEXTS_T
) language sqlscript as
begin
words = select TWEET_ID, substr_before(TEXT, ' ') as TEXT from :texts;
rest = select TWEET_ID, substr_after(TEXT, ' ') as TEXT from :texts where substr_after(TEXT, ' ') != '';
end;
drop procedure compute_words;
create procedure compute_words
(
in initial_texts TEXTS_T,
out total_words TEXTS_T
) language sqlscript as
remaining_text_count integer;
begin
texts = select * from :initial_texts;
strip_first_words(:texts, :total_words, :remaining_texts);
texts = select * from :remaining_texts;
select count(*) into remaining_text_count from :texts;
while remaining_text_count > 0 do
strip_first_words(:texts, :words, :remaining_texts);
texts = select * from :remaining_texts;
total_words = (select * from :total_words) union (select * from :words);
select count(*) into remaining_text_count from :texts;
end while;
end;
DROP VIEW nbdsaf;
CREATE VIEW nbdsaf AS (SELECT ID AS TWEET_ID, TEXT FROM CHALLENGE3.TWEETS_1DAY_REAL);
--CREATE COLUMN TABLE bla LIKE TEXTs_T;
drop type "HASH_COOR";
create type "HASH_COOR" AS TABLE(
"TWEET_ID" BIGINT, -- Tweet ID (ForeinKey)
"TEXT" nvarchar(250), -- A word from the tweet
"OCCUR" BIGINT,
"PLACE_NAME" nvarchar(255),
"MAX_LONG" double,
"MIN_LONG" double,
"MAX_LAT" double,
"MIN_LAT" double
);
DROP PROCEDURE CHECK_HASHTAGS;
CREATE PROCEDURE CHECK_HASHTAGS (
IN id BIGINT,
IN source UNKNOWN_TWEET_T,
OUT maxlat FLOAT,
OUT minlat FLOAT,
OUT maxlong FLOAT,
OUT minlong FLOAT,
result HASH_COOR
) AS
--
count INTEGER;
BEGIN
tmp = SELECT ID AS TWEET_ID, TEXT FROM :source WHERE ID = :id;
compute_words(:tmp, intermediate);
result = SELECT TOP 1 TWEET_ID, co.* FROM
(SELECT TWEET_ID, TRIM('#' FROM TEXT) AS TEXT
FROM :intermediate
WHERE TEXT LIKE '#%') tw, CHALLENGE3.HASHTAGS_COORDS co
WHERE tw.TEXT = co.TEXT;
SELECT COUNT(*) INTO count FROM result;
IF count>0 THEN
SELECT MAX_LAT INTO maxlat FROM :result;
SELECT MAX_LONG INTO maxlong FROM :result;
SELECT MIN_LONG INTO minlong FROM :result;
SELECT MIN_LAT INTO minlat FROM :result;
ELSE
maxlat := 90;
minlat := -90;
maxlong := 180;
minlong := -180;
END IF;
END;
--hastag code ends
--GIVEN A TWITTER RETURN ITS LATITUDE AND LONGITUDE BASED ON UTC-TIME ZONE ONLY FOR NOW
DROP TYPE GEO_COORDINATES_T;
CREATE TYPE GEO_COORDINATES_T AS TABLE
(
"TWEET_ID" BIGINT NOT NULL, -- Tweet ID (ForeinKey)
"GEO_LONGITUDE" FLOAT, -- Longitude, Geographic Location
"GEO_LATITUDE" FLOAT -- Latitude, Geographics Location
);
DROP PROCEDURE EXISTING_USER;
CREATE PROCEDURE EXISTING_USER(
IN userID BIGINT,
OUT maxlat DOUBLE,
OUT minlat DOUBLE,
OUT maxlong DOUBLE,
OUT minlong DOUBLE
)
AS
RESULTCOUNT INTEGER;
BEGIN
topUsedLoc = SELECT TOP 1
USER_ID,
MAX_LAT,
MIN_LAT,
MAX_LONG,
MIN_LONG
FROM TOP_USR_LOC
WHERE USER_ID = :userID;
SELECT COUNT(*) INTO RESULTCOUNT FROM :topUsedLoc;
IF RESULTCOUNT > 0 THEN
SELECT MAX_LAT INTO maxlat FROM :topUsedLoc;
SELECT MAX_LONG INTO maxlong FROM :topUsedLoc;
SELECT MIN_LONG INTO minlong FROM :topUsedLoc;
SELECT MIN_LAT INTO minlat FROM :topUsedLoc;
ELSE
maxlat := 90;
minlat := -90;
maxlong := 180;
minlong := -180;
END IF;
END;
DROP PROCEDURE COORDINATE_RESOLVER;
CREATE PROCEDURE COORDINATE_RESOLVER(
IN minlat FLOAT,
IN maxlat FLOAT,
IN minlon FLOAT,
IN maxlon FLOAT,
OUT lat FLOAT,
OUT lon FLOAT
)
AS BEGIN
lat := maxlat-minlat;
lon := maxlon-minlon;
END;
DROP PROCEDURE UTC_OFFSET;
CREATE PROCEDURE UTC_OFFSET(
IN utcoffset INTEGER,
OUT max_long FLOAT,
OUT min_long FLOAT
)
AS
RESULTCOUNT INTEGER;
BEGIN
mean_long = (SELECT TOP 1 MIN_LONG
FROM UTCOFFSET_MINLONG
WHERE UTCOFFSET*3600 = COALESCE(utcoffset,1));
SELECT COUNT(*) INTO RESULTCOUNT FROM :mean_long;
IF RESULTCOUNT > 0 THEN
SELECT MIN_LONG+7.5 INTO max_long FROM :mean_long;
SELECT MIN_LONG-7.5 INTO min_long FROM :mean_long;
ELSE
max_long := 180.0;
min_long := -180.0;
END IF;
END;
DROP PROCEDURE ESTIMATE_COORDINATES;
CREATE PROCEDURE ESTIMATE_COORDINATES(
IN source UNKNOWN_TWEET_T,
OUT result GEO_COORDINATES_T
)
AS
v_max_lat FLOAT;
v_max_long FLOAT;
v_min_lat FLOAT;
v_min_long FLOAT;
v_lat FLOAT;
v_long FLOAT;
CURSOR c_tweets FOR SELECT * FROM :source;
BEGIN
--call something for every element in the cursor
FOR tweet as c_tweets DO
--initialize
v_max_lat := 90;
v_min_lat := -90;
v_max_long := 180;
v_min_long := -180;
--start from the worst guess
IF tweet.USER_UTC_OFFSET IS NOT NULL THEN
UTC_OFFSET(tweet.USER_UTC_OFFSET, v_max_long, v_min_long);
ELSE
CHECK_HASHTAGS(tweet.ID, :source, v_max_lat, v_min_lat, v_max_long, v_min_long, res);
v_max_long := 180;
v_min_long := -180;
END IF;
EXISTING_USER(tweet.USER_ID, v_max_lat, v_min_lat, v_max_long, v_min_long);
COORDINATE_RESOLVER(v_min_lat, v_max_lat, v_min_long, v_max_long, v_lat, v_long);
result = SELECT tweet.ID AS TWEET_ID, v_long AS GEO_LONGITUDE, v_lat AS GEO_LATITUDE FROM dummy;
INSERT INTO TEST1 VALUES (tweet.ID, :v_long, :v_lat);
END FOR;
END;
DROP TABLE TEST69;
CREATE TABLE TEST1 LIKE GEO_COORDINATES_T;
CALL ESTIMATE_COORDINATES(TWEETS_1DAY_REAL, ?);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment