Created
September 29, 2012 23:19
-
-
Save risavkarna/3805409 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SET SCHEMA CHALLENGE3; | |
DROP TYPE UNKNOWN_TWEET_T; | |
CREATE TYPE UNKNOWN_TWEET_T AS TABLE | |
( | |
"ID" BIGINT NOT NULL, -- Unique identifier for this Tweet | |
"TEXT" nvarchar(800) NOT NULL, -- The actual text of the status update | |
"TRUNCATED" TINYINT NOT NULL default 0, -- (1 = True, 0 = False) Indicates whether the text was truncated, for example, as a result of a retweet exceeding the 140 character Tweet length. Truncated text will end in ellipsis, like this ... | |
"SOURCE" nvarchar(255), -- Utility used to post the Tweet, as an HTML-formatted string. (e.g. 'web') | |
"CREATED_AT" DATETIME not null, -- UTC time when this Tweet was created. | |
"IN_REPLY_TO_STATUS_ID" BIGINT, -- If the represented Tweet is a reply, ID of the original Tweet | |
"IN_REPLY_TO_USER_ID" BIGINT, -- If the represented Tweet is a reply, ID of the original Tweet's author. | |
--"GEO_LONGITUDE" FLOAT, -- Longitude, Geographic Location | |
--"GEO_LATITUDE" FLOAT, -- Latitude, Geographics Location | |
--"ORIGINAL_TWEET_ID" BIGINT, -- if this retweet contains the ID of the original tweet, which is also stored... | |
-- tweet stats | |
--"EMBEDDED" TINYINT, -- (1 = True, 0 = False) Indicates whether the tweet was embedded in a retweet | |
"RETWEET_COUNT" Integer, -- Number of times this Tweet has been retweeted. | |
-- user | |
"USER_ID" BIGINT NOT NULL, -- Unique identifier for this User | |
"USER_NAME" nvarchar(40), -- The name of the user, as they've defined it. Not necessarily a person's name. | |
"USER_SCREEN_NAME" nvarchar(20), -- The screen name, handle, or alias that this user identifies themselves with. screen_names are unique but subject to change. | |
"USER_CREATED_AT" DATETIME not null, -- The UTC datetime that the user account was created on Twitter. | |
"USER_LANG" nvarchar(8), -- The ISO 639-1 two-letter character code for the user's self-declared user interface language. | |
"USER_LOCATION" nvarchar(250), -- The user-defined location for this account's profile. | |
"USER_DESCRIPTION" nvarchar(500), -- The user-defined string describing their account. | |
"USER_PROFILE_IMAGE" nvarchar(500), -- A HTTP-based URL pointing to the user's avatar image. | |
"USER_TIME_ZONE" nvarchar(80), -- A string describing the Time Zone this user declares themselves within, e.g. "Pacific Time (US & Canada)" | |
"USER_UTC_OFFSET" integer, -- The offset from GMT/UTC in seconds. | |
"USER_URL" nvarchar(200), -- A URL provided by the user in association with their profile. | |
-- user stats | |
"USER_STATUSES_COUNT" Integer, -- The number of tweets (including retweets) issued by the user. | |
"USER_FRIENDS_COUNT" Integer, -- The number of users this account is following (AKA their "followings") | |
"USER_FOLLOWERS_COUNT" Integer, -- The number of followers this account currently has. | |
"USER_FAVOURITES_COUNT" Integer, -- The number of tweets this user has favorited in the account's lifetime. | |
"USER_LISTED_COUNT" Integer -- The number of public lists that this user is a member of. | |
-- place | |
--"PLACE_ID" nvarchar(255), -- ID representing this place. Note that this is represented as a string, not an integer. | |
--"PLACE_NAME" nvarchar(255), -- Short human-readable representation of the place's name. | |
--"PLACE_FULL_NAME" nvarchar(255), -- Full human-readable representation of the place's name. | |
--"PLACE_TYPE" nvarchar(20), -- The type of location represented by this place. (e.g. 'city') | |
--"PLACE_COUNTRY" nvarchar(50), -- Name of the country containing this place. | |
--"PLACE_COUNTRY_CODE" nvarchar(8), -- Shortened country code representing the country containing this place. | |
--"PLACE_URL" nvarchar(500) -- e.g. "http://api.twitter.com/1/geo/id/6565298bcadb82a1.json" | |
); | |
--hashtag code | |
drop type "TEXTS_T"; | |
create type "TEXTS_T" AS TABLE( | |
"TWEET_ID" BIGINT, -- Tweet ID (ForeinKey) | |
"TEXT" nvarchar(800) -- A word from the tweet | |
); | |
drop procedure strip_first_words; | |
create procedure strip_first_words | |
( | |
in texts TEXTS_T, | |
out words TEXTS_T, | |
out rest TEXTS_T | |
) language sqlscript as | |
begin | |
words = select TWEET_ID, substr_before(TEXT, ' ') as TEXT from :texts; | |
rest = select TWEET_ID, substr_after(TEXT, ' ') as TEXT from :texts where substr_after(TEXT, ' ') != ''; | |
end; | |
drop procedure compute_words; | |
create procedure compute_words | |
( | |
in initial_texts TEXTS_T, | |
out total_words TEXTS_T | |
) language sqlscript as | |
remaining_text_count integer; | |
begin | |
texts = select * from :initial_texts; | |
strip_first_words(:texts, :total_words, :remaining_texts); | |
texts = select * from :remaining_texts; | |
select count(*) into remaining_text_count from :texts; | |
while remaining_text_count > 0 do | |
strip_first_words(:texts, :words, :remaining_texts); | |
texts = select * from :remaining_texts; | |
total_words = (select * from :total_words) union (select * from :words); | |
select count(*) into remaining_text_count from :texts; | |
end while; | |
end; | |
DROP VIEW nbdsaf; | |
CREATE VIEW nbdsaf AS (SELECT ID AS TWEET_ID, TEXT FROM CHALLENGE3.TWEETS_1DAY_REAL); | |
--CREATE COLUMN TABLE bla LIKE TEXTs_T; | |
drop type "HASH_COOR"; | |
create type "HASH_COOR" AS TABLE( | |
"TWEET_ID" BIGINT, -- Tweet ID (ForeinKey) | |
"TEXT" nvarchar(250), -- A word from the tweet | |
"OCCUR" BIGINT, | |
"PLACE_NAME" nvarchar(255), | |
"MAX_LONG" double, | |
"MIN_LONG" double, | |
"MAX_LAT" double, | |
"MIN_LAT" double | |
); | |
DROP PROCEDURE CHECK_HASHTAGS; | |
CREATE PROCEDURE CHECK_HASHTAGS ( | |
IN id BIGINT, | |
IN source UNKNOWN_TWEET_T, | |
OUT maxlat FLOAT, | |
OUT minlat FLOAT, | |
OUT maxlong FLOAT, | |
OUT minlong FLOAT, | |
result HASH_COOR | |
) AS | |
-- | |
count INTEGER; | |
BEGIN | |
tmp = SELECT ID AS TWEET_ID, TEXT FROM :source WHERE ID = :id; | |
compute_words(:tmp, intermediate); | |
result = SELECT TOP 1 TWEET_ID, co.* FROM | |
(SELECT TWEET_ID, TRIM('#' FROM TEXT) AS TEXT | |
FROM :intermediate | |
WHERE TEXT LIKE '#%') tw, CHALLENGE3.HASHTAGS_COORDS co | |
WHERE tw.TEXT = co.TEXT; | |
SELECT COUNT(*) INTO count FROM result; | |
IF count>0 THEN | |
SELECT MAX_LAT INTO maxlat FROM :result; | |
SELECT MAX_LONG INTO maxlong FROM :result; | |
SELECT MIN_LONG INTO minlong FROM :result; | |
SELECT MIN_LAT INTO minlat FROM :result; | |
ELSE | |
maxlat := 90; | |
minlat := -90; | |
maxlong := 180; | |
minlong := -180; | |
END IF; | |
END; | |
--hastag code ends | |
--GIVEN A TWITTER RETURN ITS LATITUDE AND LONGITUDE BASED ON UTC-TIME ZONE ONLY FOR NOW | |
DROP TYPE GEO_COORDINATES_T; | |
CREATE TYPE GEO_COORDINATES_T AS TABLE | |
( | |
"TWEET_ID" BIGINT NOT NULL, -- Tweet ID (ForeinKey) | |
"GEO_LONGITUDE" FLOAT, -- Longitude, Geographic Location | |
"GEO_LATITUDE" FLOAT -- Latitude, Geographics Location | |
); | |
DROP PROCEDURE EXISTING_USER; | |
CREATE PROCEDURE EXISTING_USER( | |
IN userID BIGINT, | |
OUT maxlat DOUBLE, | |
OUT minlat DOUBLE, | |
OUT maxlong DOUBLE, | |
OUT minlong DOUBLE | |
) | |
AS | |
RESULTCOUNT INTEGER; | |
BEGIN | |
topUsedLoc = SELECT TOP 1 | |
USER_ID, | |
MAX_LAT, | |
MIN_LAT, | |
MAX_LONG, | |
MIN_LONG | |
FROM TOP_USR_LOC | |
WHERE USER_ID = :userID; | |
SELECT COUNT(*) INTO RESULTCOUNT FROM :topUsedLoc; | |
IF RESULTCOUNT > 0 THEN | |
SELECT MAX_LAT INTO maxlat FROM :topUsedLoc; | |
SELECT MAX_LONG INTO maxlong FROM :topUsedLoc; | |
SELECT MIN_LONG INTO minlong FROM :topUsedLoc; | |
SELECT MIN_LAT INTO minlat FROM :topUsedLoc; | |
ELSE | |
maxlat := 90; | |
minlat := -90; | |
maxlong := 180; | |
minlong := -180; | |
END IF; | |
END; | |
DROP PROCEDURE COORDINATE_RESOLVER; | |
CREATE PROCEDURE COORDINATE_RESOLVER( | |
IN minlat FLOAT, | |
IN maxlat FLOAT, | |
IN minlon FLOAT, | |
IN maxlon FLOAT, | |
OUT lat FLOAT, | |
OUT lon FLOAT | |
) | |
AS BEGIN | |
lat := maxlat-minlat; | |
lon := maxlon-minlon; | |
END; | |
DROP PROCEDURE UTC_OFFSET; | |
CREATE PROCEDURE UTC_OFFSET( | |
IN utcoffset INTEGER, | |
OUT max_long FLOAT, | |
OUT min_long FLOAT | |
) | |
AS | |
RESULTCOUNT INTEGER; | |
BEGIN | |
mean_long = (SELECT TOP 1 MIN_LONG | |
FROM UTCOFFSET_MINLONG | |
WHERE UTCOFFSET*3600 = COALESCE(utcoffset,1)); | |
SELECT COUNT(*) INTO RESULTCOUNT FROM :mean_long; | |
IF RESULTCOUNT > 0 THEN | |
SELECT MIN_LONG+7.5 INTO max_long FROM :mean_long; | |
SELECT MIN_LONG-7.5 INTO min_long FROM :mean_long; | |
ELSE | |
max_long := 180.0; | |
min_long := -180.0; | |
END IF; | |
END; | |
DROP PROCEDURE ESTIMATE_COORDINATES; | |
CREATE PROCEDURE ESTIMATE_COORDINATES( | |
IN source UNKNOWN_TWEET_T, | |
OUT result GEO_COORDINATES_T | |
) | |
AS | |
v_max_lat FLOAT; | |
v_max_long FLOAT; | |
v_min_lat FLOAT; | |
v_min_long FLOAT; | |
v_lat FLOAT; | |
v_long FLOAT; | |
CURSOR c_tweets FOR SELECT * FROM :source; | |
BEGIN | |
--call something for every element in the cursor | |
FOR tweet as c_tweets DO | |
--initialize | |
v_max_lat := 90; | |
v_min_lat := -90; | |
v_max_long := 180; | |
v_min_long := -180; | |
--start from the worst guess | |
IF tweet.USER_UTC_OFFSET IS NOT NULL THEN | |
UTC_OFFSET(tweet.USER_UTC_OFFSET, v_max_long, v_min_long); | |
ELSE | |
CHECK_HASHTAGS(tweet.ID, :source, v_max_lat, v_min_lat, v_max_long, v_min_long, res); | |
v_max_long := 180; | |
v_min_long := -180; | |
END IF; | |
EXISTING_USER(tweet.USER_ID, v_max_lat, v_min_lat, v_max_long, v_min_long); | |
COORDINATE_RESOLVER(v_min_lat, v_max_lat, v_min_long, v_max_long, v_lat, v_long); | |
result = SELECT tweet.ID AS TWEET_ID, v_long AS GEO_LONGITUDE, v_lat AS GEO_LATITUDE FROM dummy; | |
INSERT INTO TEST1 VALUES (tweet.ID, :v_long, :v_lat); | |
END FOR; | |
END; | |
DROP TABLE TEST69; | |
CREATE TABLE TEST1 LIKE GEO_COORDINATES_T; | |
CALL ESTIMATE_COORDINATES(TWEETS_1DAY_REAL, ?); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment