Skip to content

Instantly share code, notes, and snippets.

@andyed
Created November 20, 2010 19:58
Show Gist options
  • Save andyed/708103 to your computer and use it in GitHub Desktop.
Save andyed/708103 to your computer and use it in GitHub Desktop.
code for processing & working with testpilot tab dataset 1
CREATE TABLE vanilla30
(user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint)
LOAD DATA LOCAL INFILE '~/vanilla_firefox_users_30_percent.csv'
INTO TABLE vanilla30
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
(user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp);
install.packages("RMySQL",,"http://r.research.att.com/";)
CREATE TABLE vanilla30seq
( id MEDIUMINT NOT NULL AUTO_INCREMENT,user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint, PRIMARY KEY (id))
insert into vanilla30seq (user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp)
select user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp from vanilla30 order by user, tab_window, timestamp
All tab opens
--------------
select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_window = t2.tab_window and t1.event_code = 1 and t2.event_code <>-1
select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset
from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.event_code = 2 and (t2.event_code =1 or t2.event_code = 5) group by t1.timestamp - t2.timestamp
select count(*) as N, t1.timestamp - t2.timestamp as delta, t3.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2, vanilla30seq t3 where t1.id = t2.id + 1 and t1.id = t3.id + 2 and t1.user = t2.user and t3.user = t1.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.tab_window = t3.tab_window and (t1.event_code = 2) and (t2.event_code =1 or t2.event_code = 5)
select avg(num_tabs) as avgTabs, user, floor(timestamp / 1000 / 60 / 60 ) as day, max(num_tabs) as maxTabs, max(tab_window) as maxWindows from vanilla30seq where timestamp <> 0 and  floor(timestamp / 1000 / 60 / 60 ) < 348001  group by user,  floor(timestamp / 1000 / 60 / 60 )
select v.*, u.*, c.* from vanilla30 v, vanilla30user u, vanilla30userclose c where c.user = u.user and u.user = v.user
select u.*, c.* from, vanilla30user u, vanilla30userclose c where c.user = u.user
select user, count(*) as N,
max(avg_tabs) as maxTabs,
'window' as type,
'close' as action
from treestyle
where event_code = 10
group by user
union
select user, count(*) as N,
'window' as type,
'open' as action
from treestyle
where event_code = 9
group by user
union
select user, count(*) as N,
'tab' as type,
'open' as action
from treestyle
where event_code = 1
group by user
union
select user, count(*) as N,
'tab' as type,
'close' as action
from treestyle
where event_code = 2
group by user
select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset,
t1.event_code as event2, t2.event_code as event1
from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window
group by t1.timestamp - t2.timestamp, t1.event_code, t2.event_code
create table vanilla30markov ( N int, delta bigint, tabChangeOffset int, event2 int, event1 int)
insert into vanilla30markov (N, delta, tabchangeoffset, event2, event1)
select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset,
t1.event_code as event2, t2.event_code as event1
from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window
group by t1.timestamp - t2.timestamp, t1.event_code, t2.event_code
create table vanilla30markovsimple ( N int, delta bigint, tabChangeOffset int, event2 int, event1 int)
insert into vanilla30markovsimple (N, delta, tabchangeoffset, event2, event1) select count(*) as N, t1.timestamp - t2.timestamp as delta, t1.event_code as event2, t2.event_code as event1 from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window group by t1.event_code, t2.event_code;
CREATE TABLE treestyleseq
( id MEDIUMINT NOT NULL AUTO_INCREMENT,user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint, PRIMARY KEY (id))
insert into treestyleseq (user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp)
select user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp from treestyle order by user, tab_window, timestamp
create table treestylemarkovsimple ( N int, delta bigint, tabChangeOffset int, event2 int, event1 int)
insert into treestylemarkovsimple (N, delta, tabchangeoffset, event2, event1)
select count(*) as N, t1.timestamp - t2.timestamp as delta, 0 as tabchangeoffset, t1.event_code as event2, t2.event_code as event1 from treestyleseq t1, treestyleseq t2
where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window group by t1.event_code, t2.event_code;
create table vanilla30seq3 (
N int, delta1 bigint, delta2 bigint, tabChangeOffset int, event3 int, event2 int, event1 int
)
insert into vanilla30seq3 (N, delta1 , delta2 , event3 , event2 , event1 )
values select count(*) as N, t1.timestamp - t2.timestamp as delta1,
t2.timestamp - t3.timestamp as delta2,
t1.event_code, t2.event_code, t3.event_code
from vanilla30seq t1, vanilla30seq t2, vanilla30seq t3 where t1.id = t2.id + 1 and t1.id = t3.id + 2 and t1.user = t2.user and t3.user = t1.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.tab_window = t3.tab_window group by
t1.event_code, t2.event_code, t3.event_code
(max(timestamp) - min(timestamp)) / 1000 / 60 / 60
> 24
and (max(timestamp) - min(timestamp)) / 1000 / 60 / 60
< 1000
--- Tab distances
select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_window = t2.tab_window and ( t2.tab_position - t1.tab_position) > 0 group by ( t2.tab_position - t1.tab_position)
--User Summary
-- span, # sessions, mean, median max
select count(*) as N, avg(max(timestamp) - min(t2.timestamp)) as avgSessionDuration, user
from vanilla30seq t1
group by session, user
create table vanilla30usersession (user int, session int, maxTabs int, maxWindows int, avgTabs int, duration bigint, sessionHosts int, sessionPvs int)
insert into vanilla30usersession (user, maxtabs, maxwindows, sessions, avgTabs, duration, sessionHosts, sessionPvs)
select user, session, max(num_tabs), max(tab_window) , avg(num_tabs) as avgTabs, max(timestamp) - min(timestamp) as duration, count(distinct tab_site_hash) as hosts from vanilla30seq group by user, session
-- USER SEGMENTS
select avg(num_tabs) as avgTabs, user, floor(timestamp / 1000 / 60 / 60 ) as day, max(num_tabs) as maxTabs, max(tab_window) as maxWindows,
count(distinct session) as sessions, max(timestamp) - min(timestamp) as TimeActive
from vanilla30seq where timestamp <> 0 and  floor(timestamp / 1000 / 60 / 60 ) < 348001  group by user,  floor(timestamp / 1000 / 60 / 60 )
select avg(num_tabs) as avgTabs, user, floor(timestamp / 1000 / 60 / 60 ) as day, max(num_tabs) as maxTabs, max(tab_window) as maxWindows,
count(distinct session) as sessions, max(timestamp) - min(timestamp) as TimeActive
from vanilla30seq where timestamp <> 0 and  floor(timestamp / 1000 / 60 / 60 ) < 348001 and max(timestamp) - min(timestamp) > 0
group by user,  floor(timestamp / 1000 / 60 / 60 )
create table vanilla30userday
select user, avg(avgTabs) as avgTabs, count(distinct session) as sessions, sum(maxWindows) as totalWindows, sum(maxTabs) as totalTabs, sum(timeActive) as totalTime
from vanilla30userday
select user, avg(avgTabs) as avgTabs, count(sessions) as sessions, sum(maxWindows)-count(sessions) as totalWindows, sum(maxTabs) as totalTabs, sum(timeActive) as totalTime,
sum(maxTabs) / count(sessions) as TabsPerSession, max(maxTabs) as peakTabs
from vanilla30userday
group by user
--- MaxTabs > 3*AvgTabs seems to be addict vs binger
peakTabs
n missing unique Mean .05 .10 .25 .50 .75 .90 .95
805 0 62 16.37 4.0 5.0 7.0 12.0 18.0 31.0 39.8
lowest : 2 3 4 5 6, highest: 159 163 210 224 249
avgTabs
n missing unique Mean .05 .10 .25 .50 .75 .90 .95
805 0 803 5.13 1.320 1.538 2.204 3.417 5.637 10.150 14.053
lowest : 0.8994 0.9503 0.9707 1.0449 1.0459, highest: 33.4328 33.6345 41.4546 56.4646 114.8271
- ------
 # consolidated round 2
 places <- read.csv("/Users/andyedmonds/Documents/places.csv")
 places$oldest_stamp = as.POSIXct(strptime(as.character(places$visit_date_oldest),format="%m/%d/%y %H:%M"))
 places$newest_stamp = as.POSIXct(strptime(as.characlter(places$visit_date_newest),format="%m/%d/%y %H:%M"))
 places$time_delta = (places$newest_stamp - places$oldest_stamp) 
 places$time_delta = difftime(places$newest_stamp,places$oldest_stamp, units="days")
  
 places$pages_per_day = places$moz_historyvisits_cnt / as.numeric(places$time_delta)
 places$bookmark_tagged_pct = (places$bookmark_cnt - places$bookmark_nontag_cnt )/ places$bookmark_cnt
 places$percent_visits_new = places$places_visited_unique_cnt / places$moz_historyvisits_cnt
 places$pages_per_day_last90 = places$frecency_fourth_bucket_visit_cnt / 90
 places$pages_per_day_last31 = places$frecency_third_bucket_visit_cnt / 31
 places$pages_per_day_last4 = places$frecency_first_bucket_visit_cnt / 4
 places$pages_per_day_last14 = places$frecency_second_bucket_visit_cnt / 14
 
 places$upages_per_day_last14 = places$frecency_second_bucket_place_unique_cnt / 14
 places$upages_per_day_last4 = places$frecency_first_bucket_place_unique_cnt / 4
 places$upages_per_day_last31 = places$frecency_third_bucket_place_unique_cnt / 31
 places$upages_per_day_last90 = places$frecency_fourth_bucket_place_unique_cnt / 90
 places$user_of_bookmarks  <- ifelse(places$bookmark_cnt > 30,  c("1"), c("0"))
 places$user_of_livemarks  <- ifelse(places$livemark_container_cnt > 1, c("1"), c("0"))
 places$user_of_tags  <- ifelse(places$tag_cnt > 0, c("1"), c("0"))
 
 
 
 # round 1 - exploratory
 
 places <- read.csv("/Users/andyedmonds/Documents/places.csv")
 places <- read.csv("/Users/andyedmonds/Documents/places.csv")
 places$oldest_stamp = as.POSIXct(strptime(as.character(places$visit_date_oldest),format="%m/%d/%y %H:%M"))
 places$newest_stamp = as.POSIXct(strptime(as.character(places$visit_date_newest),format="%m/%d/%y %H:%M"))
 places$time_delta = (places$newest_stamp - places$oldest_stamp) 
 places$time_delta = difftime(places$newest_stamp,places$oldest_stamp, units="days")
 
 places$bookmark_tagged_pct = (places$bookmark_cnt - places$bookmark_nontag_cnt )/ places$bookmark_tagged_pct = (places$bookmark_cnt - places$bookmark_nontag_cnt )/ places$bookmark_cnt
places$pages_per_day = places$moz_historyvisits_cnt / as.numeric(places$time_delta)
 
 places$folder_cnt_crrctd = places$folder_cnt - places$bookmark_cnt
 
places$percent_visits_new = places$places_visited_unique_cnt / places$moz_historyvisits_cnt
taggers <- places[places$tag_cnt > 0,]
livemarkers = places[places$livemark_container_cnt > 0,]
> places$user_of_tags  <- ifelse(places$tag_cnt > 0, c("1"), c("0"))
> places$user_of_tags  <- ifelse(places$tag_cnt > 0, c("1"), c("0"))
> places$user_of_bookmarks  <- ifelse(places$bookmark_cnt > 30, c("1"), c("0"))
> places$user_of_livemarks  <- ifelse(places$livemark_container_cnt > 1, c("1"), c("0"))
# Remove the two users over 40k bookmarks
places_trim = subset(places, bookmark_cnt < 40000)
# Decision Tree clustering
library(rpart)
fit <- rpart(places_trim$bookmark_cnt ~  places_trim$moz_historyvisits_cnt +  places_trim$places_visited_unique_cnt + places_trim$time_delta + places_trim$folder_cnt_crrctd + places_trim$tag_cnt + places_trim$visits_per_place_avg )
post(fit, "/Users/andyedmonds/Documents/fit.ps", title="Bookmark Fit")
--------
--- Summary at http://surfmind.com/muzings/?p=505 ---
-- aset is pretty large, with 7749 poi
MySQL Create
-------------------
CREATE TABLE vanilla30
(user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint)
LOAD DATA LOCAL INFILE '~/vanilla_firefox_users_30_percent.csv'
INTO TABLE vanilla30 
FIELDS TERMINATED BY ',' 
LINES TERMINATED BY '\n' 
(user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp);
CREATE TABLE treestyle
(user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint)
LOAD DATA LOCAL INFILE '~/vtree_style_tab_users_70_percent.csv'
INTO TABLE t re
FIELDS TERMINATED BY ',' 
LINES TERMINATED BY '\n' 
(user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp);
R Datafile
----------------
> tabs <- read.csv(".../Downloads/windows_users_30_percent.csv")
> summary(tabs)
R Database 
------------------
> library(RMySQL)
Loading required package: DBI
> con <- dbConnect(MySQL(), user="root",  dbname="testpilot", host = "localhost")
> rs <- dbSendQuery(con,"select avg(num_tabs) as avgTabs, user,  max(num_tabs) as maxTabs from vanilla30 group by user")
> userTabs <- fetch(rs) 
> summary(userTabs)
    avgTabs              user          maxTabs      
 Min.   :  0.9102   Min.   :  0.0   Min.   :  2.00  
 1st Qu.:  2.7464   1st Qu.:124.8   1st Qu.:  8.00  
 Median :  4.1059   Median :249.5   Median : 13.00  
 Mean   :  6.0415   Mean   :249.5   Mean   : 19.08  
 3rd Qu.:  6.7097   3rd Qu.:374.2   3rd Qu.: 22.00  
 Max.   :178.6778   Max.   :499.0   Max.   :352.00  
> hist(userTabs$avgTabs)
> boxplot(userTabs$avgTabs)
R Sampling Users by Day
----------------------------------
>  con <- dbConnect(MySQL(), user="root",  dbname="testpilot", host = "localhost")
>  rs <- dbSendQuery(con,"select avg(num_tabs) as avgTabs, user, floor(timestamp / 1000 / 60 / 60 / 24) as day, max(num_tabs) as maxTabs from vanilla30 where timestamp <> 0 and  floor(timestamp / 1000 / 60 / 60 / 24) < 14533 group by user,  floor(timestamp / 1000 / 60 / 60 / 24)")
> userTabs <- fetch(rs,100000)
> describe(userTabs)
userTabs 
 4  Variables      7746  Observations
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
avgTabs 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
   7746       0    6767   5.595   1.151   1.429   2.179   3.575   5.993  10.364  15.742 
lowest :   0.0000   0.5000   0.5714   0.6000   0.6250, highest: 179.9956 181.1964 191.1681 200.4904 204.6459 
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
user 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
   7746       0     939   464.4   43.25   92.00  234.00  461.50  696.00  837.00  890.00 
lowest :   0   1   2   3   4, highest: 934 935 936 937 938 
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
day 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
   7746       0      22   14499   14492   14493   14496   14499   14503   14505   14508 
lowest : 14490 14491 14492 14493 14494, highest: 14507 14508 14509 14510 14511 
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
maxTabs 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
   7746       0     110   10.92       2       3       4       8      13      20      29 
lowest :   0   1   2   3   4, highest: 235 237 238 249 352 
R Sampling Users by Hour
------------------------------------
>  rs <- dbSendQuery(con,"select avg(num_tabs) as avgTabs, user, floor(timestamp / 1000 / 60 / 60 ) as day, max(num_tabs) as maxTabs from vanilla30 where timestamp <> 0 and  floor(timestamp / 1000 / 60 / 60 ) < 348001  group by user,  floor(timestamp / 1000 / 60 / 60 )")
> userTabs <- fetch(rs,25000)
> summary(userTabs)
    avgTabs             user            day            maxTabs       
 Min.   :  0.000   Min.   :  0.0   Min.   :347760   Min.   :  0.000  
 1st Qu.:  1.833   1st Qu.:188.0   1st Qu.:347870   1st Qu.:  3.000  
 Median :  3.116   Median :375.0   Median :347915   Median :  5.000  
 Mean   :  5.398   Mean   :368.3   Mean   :347908   Mean   :  7.346  
 3rd Qu.:  5.949   3rd Qu.:546.0   3rd Qu.:347949   3rd Qu.:  9.000  
 Max.   :209.449   Max.   :720.0   Max.   :348000   Max.   :249.000  
> describe(userTabs)
userTabs 
 4  Variables      25000  Observations
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
avgTabs 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
  25000       0   12456   5.398  0.8571  1.0000  1.8333  3.1159  5.9488 10.9396 16.6456 
lowest :   0.0000   0.1429   0.1818   0.2222   0.2500, highest: 180.2347 185.4545 195.6218 199.5410 209.4493 
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
user 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
  25000       0     625   368.3      40      86     188     375     546     651     681 
lowest :   0   1   2   3   4, highest: 716 717 718 719 720 
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
day 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
  25000       0     227  347908  347817  347830  347870  347915  347949  347973  347988 
lowest : 347760 347761 347764 347774 347775, highest: 347996 347997 347998 347999 348000 
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
maxTabs 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
  25000       0     139   7.346       1       1       3       5       9      15      21 
lowest :   0   1   2   3   4, highest: 204 208 210 224 249 
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Including Windows 
----------------------------
>  con <- dbConnect(MySQL(), user="root",  dbname="testpilot", host = "localhost")
>  rs <- dbSendQuery(con,"select avg(num_tabs) as avgTabs, user, max(tab_window) as windows, floor(timestamp / 1000 / 60 / 60 / 24) as day, max(num_tabs) as maxTabs from vanilla30 where timestamp <> 0 and  floor(timestamp / 1000 / 60 / 60 / 24) < 14533 group by user,  floor(timestamp / 1000 / 60 / 60 / 24)")
> userTabs <- fetch(rs,25000)
Crafting a table for self-joins and timing
----------------------------------------------------
insert into vanilla30seq (user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp)
    -> select user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp from vanilla30 order by  user, tab_window, timestamp
    
    
    select t1.*, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position, t2.ui_method as prev_ui_method  from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.tab_window = t2.tab_window l
Tab Switch Distance
-----------------------------
>  con <- dbConnect(MySQL(), user="root",  dbname="testpilot", host = "localhost")
> rs <- dbSendQuery(con,"select t1.*, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset, t2.event_code as prev_event_code  from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.tab_window = t2.tab_window and t1.event_code=5 ")
> userTabs <- fetch(rs,25000)
> summary(userTabs)
       id             user         event_code  tab_position      tab_window       ui_method     tab_site_hash    num_tabs        timestamp             delta           tabChangeOffset     prev_event_code 
 Min.   :   34   Min.   : 0.00   Min.   :5    Min.   :  0.00   Min.   : 1.000   Min.   :1.000   Min.   :0     Min.   :  2.00   Min.   :1.252e+12   Min.   :        8   Min.   :-197.0000   Min.   : 1.000  
 1st Qu.:23938   1st Qu.: 4.00   1st Qu.:5    1st Qu.:  1.00   1st Qu.: 1.000   1st Qu.:1.000   1st Qu.:0     1st Qu.:  3.00   1st Qu.:1.252e+12   1st Qu.:      128   1st Qu.:  -1.0000   1st Qu.: 2.000  
 Median :45543   Median :12.00   Median :5    Median :  2.00   Median : 1.000   Median :2.000   Median :0     Median :  5.00   Median :1.253e+12   Median :      338   Median :   0.0000   Median : 2.000  
 Mean   :46897   Mean   :12.44   Mean   :5    Mean   : 13.01   Mean   : 1.679   Mean   :1.555   Mean   :0     Mean   : 18.39   Mean   :1.253e+12   Mean   :    37721   Mean   :   0.3636   Mean   : 3.367  
 3rd Qu.:72086   3rd Qu.:21.00   3rd Qu.:5    3rd Qu.:  5.00   3rd Qu.: 1.000   3rd Qu.:2.000   3rd Qu.:0     3rd Qu.:  9.00   3rd Qu.:1.253e+12   3rd Qu.:     2553   3rd Qu.:   1.0000   3rd Qu.: 5.000  
 Max.   :95135   Max.   :27.00   Max.   :5    Max.   :217.00   Max.   :19.000   Max.   :2.000   Max.   :0     Max.   :352.00   Max.   :1.254e+12   Max.   :100158485   Max.   : 199.0000   Max.   :10.000  
> boxplot(userTabs$tabChangeOffset)
> describe(userTabs$tabChangeOffset)
userTabs$tabChangeOffset 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
  25000       0      73  0.3636      -1      -1      -1       0       1       2       2 
lowest : -197 -135 -131  -91  -74, highest:  181  182  189  197  199 
> hist(userTabs$tabChangeOffset)
> rs <- dbSendQuery(con,"select t1.*, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset
+ from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.event_code = 2 and t2.event_code =1")
> userTabs <- fetch(rs,25000)
> describe(userTabs$delta)
userTabs$delta 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
    418       0     392    7460   313.8   773.4  1519.2  2097.5  3647.2  8296.6 18505.9 
lowest :     72     75     76     88    103, highest: 122400 137377 279562 406131 410784 
--Frequency of Tab Open/Focus Close Successive .1%
-------------------------------------------------------
 mysql> select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset  from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_window = t2.tab_window and t1.event_code = 1 and t2.event_code <>-1
    -> 
    -> ;
+--------+-------+-----------------+
| N      | delta | tabChangeOffset |
+--------+-------+-----------------+
| 407742 |  3148 |              -1 | 
+--------+-------+-----------------+
1 row in set (29.89 sec)
mysql> select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.event_code = 2 and t2.event_code =1;
+-----+-------+-----------------+
| N   | delta | tabChangeOffset |
+-----+-------+-----------------+
| 418 |  1271 |               0 | 
+-----+-------+-----------------+
1 row in set (3.04 sec)
--- Tab Focus or Tab Open -> Tab Close, 17% go to close
mysql> select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.event_code = 2 and (t2.event_code =1 or t2.event_code = 5) ;
+--------+-------+-----------------+
| N      | delta | tabChangeOffset |
+--------+-------+-----------------+
| 149510 |   739 |               0 | 
+--------+-------+-----------------+
1 row in set (9.75 sec)
mysql> select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.event_code <> -1  and (t2.event_code =1 or t2.event_code = 5) ;
+--------+-------+-----------------+
| N      | delta | tabChangeOffset |
+--------+-------+-----------------+
| 847203 |    26 |               0 | 
+--------+-------+-----------------+
1 row in set (10.15 sec)
-- Investigating Windows & Tab Closing 
----------------------------------------------------
create table vanilla30user (user int, max_tabs int, max_windows int)
insert into vanilla30user (user, max_tabs, max_windows) select user, max(num_tabs), max(tab_window) from vanilla30 group by user
insert into vanilla30userclose (user, closeEvents) select count(*), user from vanilla30 where event_code =2 group by user
-- Specialtialty groups 
----------------------------
-- Does Treestyle promote closing ta
 rs <- dbSendQuery(con,"select user, count(*) as N,
'window' as type,
'close' as action
from treestyle
where event_code = 10
group by user
union 
select user, count(*) as N,
'window' as type,
'open' as action
 from treestyle
where event_code = 9
group by user
union 
select user, count(*) as N,
'tab' as type,
'open' as action
 from treestyle
where event_code = 1
group by user
union 
select user, count(*) as N, 
'tab' as type,
'close' as action
 from treestyle
where event_code = 2
group by user
-- Answer: no
-- Markov Chains
----------------------
CREATE TABLE vanilla30seq
( id MEDIUMINT NOT NULL AUTO_INCREMENT,user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint,   PRIMARY KEY (id))
insert into vanilla30seq (user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp)
select user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp from vanilla30 order by  user, tab_window, timestamp
mysql> create table vanilla30markovsimple ( N int, delta bigint, tabChangeOffset int, event2 int, event1 int)    -> ;Query OK, 0 rows affected (0.19 sec)
mysql> insert into vanilla30markovsimple (N, delta, tabchangeoffset, event2, event1)  select count(*) as N, t1.timestamp - t2.timestamp as delta, 0 as tabchangeoffset, t1.event_code as event2, t2.event_code as event1 from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window  group by t1.event_code, t2.event_code;
create table treestylemarkovsimple ( N int, delta bigint, tabChangeOffset int, event2 int, event1 int)    
insert into treestylemarkovsimple (N, delta, tabchangeoffset, event2, event1)  
select count(*) as N, t1.timestamp - t2.timestamp as delta, 0 as tabchangeoffset, t1.event_code as event2, t2.event_code as event1 from treestyleseq t1, treestyleseq t2 
where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window  group by t1.event_code, t2.event_code;
-- Histogram of tab deltas
 select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_window = t2.tab_window  group by ( t2.tab_position - t1.tab_position)
 
 -- User Segmentation
 -----------------------------
 create table vanilla30userday
select user, avg(avgTabs) as avgTabs, count(distinct session) as sessions, sum(maxWindows) as totalWindows, sum(maxTabs) as totalTabs, sum(timeActive) as totalTime
from vanilla30userday
select user, avg(avgTabs) as avgTabs, count(sessions) as sessions, sum(maxWindows)-count(sessions) as totalWindows, sum(maxTabs) as totalTabs, sum(timeActive) as totalTime, 
sum(maxTabs) / count(sessions) as TabsPerSession, max(maxTabs) as peakTabs
from vanilla30userday
group by user
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment