Created
November 20, 2010 19:58
-
-
Save andyed/708103 to your computer and use it in GitHub Desktop.
code for processing & working with testpilot tab dataset 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TABLE vanilla30 | |
(user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint) | |
LOAD DATA LOCAL INFILE '~/vanilla_firefox_users_30_percent.csv' | |
INTO TABLE vanilla30 | |
FIELDS TERMINATED BY ',' | |
LINES TERMINATED BY '\n' | |
(user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp); | |
install.packages("RMySQL",,"http://r.research.att.com/";) | |
CREATE TABLE vanilla30seq | |
( id MEDIUMINT NOT NULL AUTO_INCREMENT,user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint, PRIMARY KEY (id)) | |
insert into vanilla30seq (user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp) | |
select user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp from vanilla30 order by user, tab_window, timestamp | |
All tab opens | |
-------------- | |
select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_window = t2.tab_window and t1.event_code = 1 and t2.event_code <>-1 | |
select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset | |
from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.event_code = 2 and (t2.event_code =1 or t2.event_code = 5) group by t1.timestamp - t2.timestamp | |
select count(*) as N, t1.timestamp - t2.timestamp as delta, t3.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2, vanilla30seq t3 where t1.id = t2.id + 1 and t1.id = t3.id + 2 and t1.user = t2.user and t3.user = t1.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.tab_window = t3.tab_window and (t1.event_code = 2) and (t2.event_code =1 or t2.event_code = 5) | |
select avg(num_tabs) as avgTabs, user, floor(timestamp / 1000 / 60 / 60 ) as day, max(num_tabs) as maxTabs, max(tab_window) as maxWindows from vanilla30seq where timestamp <> 0 and floor(timestamp / 1000 / 60 / 60 ) < 348001 group by user, floor(timestamp / 1000 / 60 / 60 ) | |
select v.*, u.*, c.* from vanilla30 v, vanilla30user u, vanilla30userclose c where c.user = u.user and u.user = v.user | |
select u.*, c.* from, vanilla30user u, vanilla30userclose c where c.user = u.user | |
select user, count(*) as N, | |
max(avg_tabs) as maxTabs, | |
'window' as type, | |
'close' as action | |
from treestyle | |
where event_code = 10 | |
group by user | |
union | |
select user, count(*) as N, | |
'window' as type, | |
'open' as action | |
from treestyle | |
where event_code = 9 | |
group by user | |
union | |
select user, count(*) as N, | |
'tab' as type, | |
'open' as action | |
from treestyle | |
where event_code = 1 | |
group by user | |
union | |
select user, count(*) as N, | |
'tab' as type, | |
'close' as action | |
from treestyle | |
where event_code = 2 | |
group by user | |
select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset, | |
t1.event_code as event2, t2.event_code as event1 | |
from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window | |
group by t1.timestamp - t2.timestamp, t1.event_code, t2.event_code | |
create table vanilla30markov ( N int, delta bigint, tabChangeOffset int, event2 int, event1 int) | |
insert into vanilla30markov (N, delta, tabchangeoffset, event2, event1) | |
select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset, | |
t1.event_code as event2, t2.event_code as event1 | |
from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window | |
group by t1.timestamp - t2.timestamp, t1.event_code, t2.event_code | |
create table vanilla30markovsimple ( N int, delta bigint, tabChangeOffset int, event2 int, event1 int) | |
insert into vanilla30markovsimple (N, delta, tabchangeoffset, event2, event1) select count(*) as N, t1.timestamp - t2.timestamp as delta, t1.event_code as event2, t2.event_code as event1 from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window group by t1.event_code, t2.event_code; | |
CREATE TABLE treestyleseq | |
( id MEDIUMINT NOT NULL AUTO_INCREMENT,user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint, PRIMARY KEY (id)) | |
insert into treestyleseq (user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp) | |
select user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp from treestyle order by user, tab_window, timestamp | |
create table treestylemarkovsimple ( N int, delta bigint, tabChangeOffset int, event2 int, event1 int) | |
insert into treestylemarkovsimple (N, delta, tabchangeoffset, event2, event1) | |
select count(*) as N, t1.timestamp - t2.timestamp as delta, 0 as tabchangeoffset, t1.event_code as event2, t2.event_code as event1 from treestyleseq t1, treestyleseq t2 | |
where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window group by t1.event_code, t2.event_code; | |
create table vanilla30seq3 ( | |
N int, delta1 bigint, delta2 bigint, tabChangeOffset int, event3 int, event2 int, event1 int | |
) | |
insert into vanilla30seq3 (N, delta1 , delta2 , event3 , event2 , event1 ) | |
values select count(*) as N, t1.timestamp - t2.timestamp as delta1, | |
t2.timestamp - t3.timestamp as delta2, | |
t1.event_code, t2.event_code, t3.event_code | |
from vanilla30seq t1, vanilla30seq t2, vanilla30seq t3 where t1.id = t2.id + 1 and t1.id = t3.id + 2 and t1.user = t2.user and t3.user = t1.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.tab_window = t3.tab_window group by | |
t1.event_code, t2.event_code, t3.event_code | |
(max(timestamp) - min(timestamp)) / 1000 / 60 / 60 | |
> 24 | |
and (max(timestamp) - min(timestamp)) / 1000 / 60 / 60 | |
< 1000 | |
--- Tab distances | |
select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_window = t2.tab_window and ( t2.tab_position - t1.tab_position) > 0 group by ( t2.tab_position - t1.tab_position) | |
--User Summary | |
-- span, # sessions, mean, median max | |
select count(*) as N, avg(max(timestamp) - min(t2.timestamp)) as avgSessionDuration, user | |
from vanilla30seq t1 | |
group by session, user | |
create table vanilla30usersession (user int, session int, maxTabs int, maxWindows int, avgTabs int, duration bigint, sessionHosts int, sessionPvs int) | |
insert into vanilla30usersession (user, maxtabs, maxwindows, sessions, avgTabs, duration, sessionHosts, sessionPvs) | |
select user, session, max(num_tabs), max(tab_window) , avg(num_tabs) as avgTabs, max(timestamp) - min(timestamp) as duration, count(distinct tab_site_hash) as hosts from vanilla30seq group by user, session | |
-- USER SEGMENTS | |
select avg(num_tabs) as avgTabs, user, floor(timestamp / 1000 / 60 / 60 ) as day, max(num_tabs) as maxTabs, max(tab_window) as maxWindows, | |
count(distinct session) as sessions, max(timestamp) - min(timestamp) as TimeActive | |
from vanilla30seq where timestamp <> 0 and floor(timestamp / 1000 / 60 / 60 ) < 348001 group by user, floor(timestamp / 1000 / 60 / 60 ) | |
select avg(num_tabs) as avgTabs, user, floor(timestamp / 1000 / 60 / 60 ) as day, max(num_tabs) as maxTabs, max(tab_window) as maxWindows, | |
count(distinct session) as sessions, max(timestamp) - min(timestamp) as TimeActive | |
from vanilla30seq where timestamp <> 0 and floor(timestamp / 1000 / 60 / 60 ) < 348001 and max(timestamp) - min(timestamp) > 0 | |
group by user, floor(timestamp / 1000 / 60 / 60 ) | |
create table vanilla30userday | |
select user, avg(avgTabs) as avgTabs, count(distinct session) as sessions, sum(maxWindows) as totalWindows, sum(maxTabs) as totalTabs, sum(timeActive) as totalTime | |
from vanilla30userday | |
select user, avg(avgTabs) as avgTabs, count(sessions) as sessions, sum(maxWindows)-count(sessions) as totalWindows, sum(maxTabs) as totalTabs, sum(timeActive) as totalTime, | |
sum(maxTabs) / count(sessions) as TabsPerSession, max(maxTabs) as peakTabs | |
from vanilla30userday | |
group by user | |
--- MaxTabs > 3*AvgTabs seems to be addict vs binger | |
peakTabs | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
805 0 62 16.37 4.0 5.0 7.0 12.0 18.0 31.0 39.8 | |
lowest : 2 3 4 5 6, highest: 159 163 210 224 249 | |
avgTabs | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
805 0 803 5.13 1.320 1.538 2.204 3.417 5.637 10.150 14.053 | |
lowest : 0.8994 0.9503 0.9707 1.0449 1.0459, highest: 33.4328 33.6345 41.4546 56.4646 114.8271 | |
- ------ | |
# consolidated round 2 | |
places <- read.csv("/Users/andyedmonds/Documents/places.csv") | |
places$oldest_stamp = as.POSIXct(strptime(as.character(places$visit_date_oldest),format="%m/%d/%y %H:%M")) | |
places$newest_stamp = as.POSIXct(strptime(as.characlter(places$visit_date_newest),format="%m/%d/%y %H:%M")) | |
places$time_delta = (places$newest_stamp - places$oldest_stamp) | |
places$time_delta = difftime(places$newest_stamp,places$oldest_stamp, units="days") | |
places$pages_per_day = places$moz_historyvisits_cnt / as.numeric(places$time_delta) | |
places$bookmark_tagged_pct = (places$bookmark_cnt - places$bookmark_nontag_cnt )/ places$bookmark_cnt | |
places$percent_visits_new = places$places_visited_unique_cnt / places$moz_historyvisits_cnt | |
places$pages_per_day_last90 = places$frecency_fourth_bucket_visit_cnt / 90 | |
places$pages_per_day_last31 = places$frecency_third_bucket_visit_cnt / 31 | |
places$pages_per_day_last4 = places$frecency_first_bucket_visit_cnt / 4 | |
places$pages_per_day_last14 = places$frecency_second_bucket_visit_cnt / 14 | |
places$upages_per_day_last14 = places$frecency_second_bucket_place_unique_cnt / 14 | |
places$upages_per_day_last4 = places$frecency_first_bucket_place_unique_cnt / 4 | |
places$upages_per_day_last31 = places$frecency_third_bucket_place_unique_cnt / 31 | |
places$upages_per_day_last90 = places$frecency_fourth_bucket_place_unique_cnt / 90 | |
places$user_of_bookmarks <- ifelse(places$bookmark_cnt > 30, c("1"), c("0")) | |
places$user_of_livemarks <- ifelse(places$livemark_container_cnt > 1, c("1"), c("0")) | |
places$user_of_tags <- ifelse(places$tag_cnt > 0, c("1"), c("0")) | |
# round 1 - exploratory | |
places <- read.csv("/Users/andyedmonds/Documents/places.csv") | |
places <- read.csv("/Users/andyedmonds/Documents/places.csv") | |
places$oldest_stamp = as.POSIXct(strptime(as.character(places$visit_date_oldest),format="%m/%d/%y %H:%M")) | |
places$newest_stamp = as.POSIXct(strptime(as.character(places$visit_date_newest),format="%m/%d/%y %H:%M")) | |
places$time_delta = (places$newest_stamp - places$oldest_stamp) | |
places$time_delta = difftime(places$newest_stamp,places$oldest_stamp, units="days") | |
places$bookmark_tagged_pct = (places$bookmark_cnt - places$bookmark_nontag_cnt )/ places$bookmark_tagged_pct = (places$bookmark_cnt - places$bookmark_nontag_cnt )/ places$bookmark_cnt | |
places$pages_per_day = places$moz_historyvisits_cnt / as.numeric(places$time_delta) | |
places$folder_cnt_crrctd = places$folder_cnt - places$bookmark_cnt | |
places$percent_visits_new = places$places_visited_unique_cnt / places$moz_historyvisits_cnt | |
taggers <- places[places$tag_cnt > 0,] | |
livemarkers = places[places$livemark_container_cnt > 0,] | |
> places$user_of_tags <- ifelse(places$tag_cnt > 0, c("1"), c("0")) | |
> places$user_of_tags <- ifelse(places$tag_cnt > 0, c("1"), c("0")) | |
> places$user_of_bookmarks <- ifelse(places$bookmark_cnt > 30, c("1"), c("0")) | |
> places$user_of_livemarks <- ifelse(places$livemark_container_cnt > 1, c("1"), c("0")) | |
# Remove the two users over 40k bookmarks | |
places_trim = subset(places, bookmark_cnt < 40000) | |
# Decision Tree clustering | |
library(rpart) | |
fit <- rpart(places_trim$bookmark_cnt ~ places_trim$moz_historyvisits_cnt + places_trim$places_visited_unique_cnt + places_trim$time_delta + places_trim$folder_cnt_crrctd + places_trim$tag_cnt + places_trim$visits_per_place_avg ) | |
post(fit, "/Users/andyedmonds/Documents/fit.ps", title="Bookmark Fit") | |
-------- | |
--- Summary at http://surfmind.com/muzings/?p=505 --- | |
-- aset is pretty large, with 7749 poi | |
MySQL Create | |
------------------- | |
CREATE TABLE vanilla30 | |
(user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint) | |
LOAD DATA LOCAL INFILE '~/vanilla_firefox_users_30_percent.csv' | |
INTO TABLE vanilla30 | |
FIELDS TERMINATED BY ',' | |
LINES TERMINATED BY '\n' | |
(user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp); | |
CREATE TABLE treestyle | |
(user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint) | |
LOAD DATA LOCAL INFILE '~/vtree_style_tab_users_70_percent.csv' | |
INTO TABLE t re | |
FIELDS TERMINATED BY ',' | |
LINES TERMINATED BY '\n' | |
(user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp); | |
R Datafile | |
---------------- | |
> tabs <- read.csv(".../Downloads/windows_users_30_percent.csv") | |
> summary(tabs) | |
R Database | |
------------------ | |
> library(RMySQL) | |
Loading required package: DBI | |
> con <- dbConnect(MySQL(), user="root", dbname="testpilot", host = "localhost") | |
> rs <- dbSendQuery(con,"select avg(num_tabs) as avgTabs, user, max(num_tabs) as maxTabs from vanilla30 group by user") | |
> userTabs <- fetch(rs) | |
> summary(userTabs) | |
avgTabs user maxTabs | |
Min. : 0.9102 Min. : 0.0 Min. : 2.00 | |
1st Qu.: 2.7464 1st Qu.:124.8 1st Qu.: 8.00 | |
Median : 4.1059 Median :249.5 Median : 13.00 | |
Mean : 6.0415 Mean :249.5 Mean : 19.08 | |
3rd Qu.: 6.7097 3rd Qu.:374.2 3rd Qu.: 22.00 | |
Max. :178.6778 Max. :499.0 Max. :352.00 | |
> hist(userTabs$avgTabs) | |
> boxplot(userTabs$avgTabs) | |
R Sampling Users by Day | |
---------------------------------- | |
> con <- dbConnect(MySQL(), user="root", dbname="testpilot", host = "localhost") | |
> rs <- dbSendQuery(con,"select avg(num_tabs) as avgTabs, user, floor(timestamp / 1000 / 60 / 60 / 24) as day, max(num_tabs) as maxTabs from vanilla30 where timestamp <> 0 and floor(timestamp / 1000 / 60 / 60 / 24) < 14533 group by user, floor(timestamp / 1000 / 60 / 60 / 24)") | |
> userTabs <- fetch(rs,100000) | |
> describe(userTabs) | |
userTabs | |
4 Variables 7746 Observations | |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
avgTabs | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
7746 0 6767 5.595 1.151 1.429 2.179 3.575 5.993 10.364 15.742 | |
lowest : 0.0000 0.5000 0.5714 0.6000 0.6250, highest: 179.9956 181.1964 191.1681 200.4904 204.6459 | |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
user | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
7746 0 939 464.4 43.25 92.00 234.00 461.50 696.00 837.00 890.00 | |
lowest : 0 1 2 3 4, highest: 934 935 936 937 938 | |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
day | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
7746 0 22 14499 14492 14493 14496 14499 14503 14505 14508 | |
lowest : 14490 14491 14492 14493 14494, highest: 14507 14508 14509 14510 14511 | |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
maxTabs | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
7746 0 110 10.92 2 3 4 8 13 20 29 | |
lowest : 0 1 2 3 4, highest: 235 237 238 249 352 | |
R Sampling Users by Hour | |
------------------------------------ | |
> rs <- dbSendQuery(con,"select avg(num_tabs) as avgTabs, user, floor(timestamp / 1000 / 60 / 60 ) as day, max(num_tabs) as maxTabs from vanilla30 where timestamp <> 0 and floor(timestamp / 1000 / 60 / 60 ) < 348001 group by user, floor(timestamp / 1000 / 60 / 60 )") | |
> userTabs <- fetch(rs,25000) | |
> summary(userTabs) | |
avgTabs user day maxTabs | |
Min. : 0.000 Min. : 0.0 Min. :347760 Min. : 0.000 | |
1st Qu.: 1.833 1st Qu.:188.0 1st Qu.:347870 1st Qu.: 3.000 | |
Median : 3.116 Median :375.0 Median :347915 Median : 5.000 | |
Mean : 5.398 Mean :368.3 Mean :347908 Mean : 7.346 | |
3rd Qu.: 5.949 3rd Qu.:546.0 3rd Qu.:347949 3rd Qu.: 9.000 | |
Max. :209.449 Max. :720.0 Max. :348000 Max. :249.000 | |
> describe(userTabs) | |
userTabs | |
4 Variables 25000 Observations | |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
avgTabs | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
25000 0 12456 5.398 0.8571 1.0000 1.8333 3.1159 5.9488 10.9396 16.6456 | |
lowest : 0.0000 0.1429 0.1818 0.2222 0.2500, highest: 180.2347 185.4545 195.6218 199.5410 209.4493 | |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
user | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
25000 0 625 368.3 40 86 188 375 546 651 681 | |
lowest : 0 1 2 3 4, highest: 716 717 718 719 720 | |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
day | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
25000 0 227 347908 347817 347830 347870 347915 347949 347973 347988 | |
lowest : 347760 347761 347764 347774 347775, highest: 347996 347997 347998 347999 348000 | |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
maxTabs | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
25000 0 139 7.346 1 1 3 5 9 15 21 | |
lowest : 0 1 2 3 4, highest: 204 208 210 224 249 | |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
> | |
Including Windows | |
---------------------------- | |
> con <- dbConnect(MySQL(), user="root", dbname="testpilot", host = "localhost") | |
> rs <- dbSendQuery(con,"select avg(num_tabs) as avgTabs, user, max(tab_window) as windows, floor(timestamp / 1000 / 60 / 60 / 24) as day, max(num_tabs) as maxTabs from vanilla30 where timestamp <> 0 and floor(timestamp / 1000 / 60 / 60 / 24) < 14533 group by user, floor(timestamp / 1000 / 60 / 60 / 24)") | |
> userTabs <- fetch(rs,25000) | |
Crafting a table for self-joins and timing | |
---------------------------------------------------- | |
insert into vanilla30seq (user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp) | |
-> select user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp from vanilla30 order by user, tab_window, timestamp | |
select t1.*, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position, t2.ui_method as prev_ui_method from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.tab_window = t2.tab_window l | |
Tab Switch Distance | |
----------------------------- | |
> con <- dbConnect(MySQL(), user="root", dbname="testpilot", host = "localhost") | |
> rs <- dbSendQuery(con,"select t1.*, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset, t2.event_code as prev_event_code from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.tab_window = t2.tab_window and t1.event_code=5 ") | |
> userTabs <- fetch(rs,25000) | |
> summary(userTabs) | |
id user event_code tab_position tab_window ui_method tab_site_hash num_tabs timestamp delta tabChangeOffset prev_event_code | |
Min. : 34 Min. : 0.00 Min. :5 Min. : 0.00 Min. : 1.000 Min. :1.000 Min. :0 Min. : 2.00 Min. :1.252e+12 Min. : 8 Min. :-197.0000 Min. : 1.000 | |
1st Qu.:23938 1st Qu.: 4.00 1st Qu.:5 1st Qu.: 1.00 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.:0 1st Qu.: 3.00 1st Qu.:1.252e+12 1st Qu.: 128 1st Qu.: -1.0000 1st Qu.: 2.000 | |
Median :45543 Median :12.00 Median :5 Median : 2.00 Median : 1.000 Median :2.000 Median :0 Median : 5.00 Median :1.253e+12 Median : 338 Median : 0.0000 Median : 2.000 | |
Mean :46897 Mean :12.44 Mean :5 Mean : 13.01 Mean : 1.679 Mean :1.555 Mean :0 Mean : 18.39 Mean :1.253e+12 Mean : 37721 Mean : 0.3636 Mean : 3.367 | |
3rd Qu.:72086 3rd Qu.:21.00 3rd Qu.:5 3rd Qu.: 5.00 3rd Qu.: 1.000 3rd Qu.:2.000 3rd Qu.:0 3rd Qu.: 9.00 3rd Qu.:1.253e+12 3rd Qu.: 2553 3rd Qu.: 1.0000 3rd Qu.: 5.000 | |
Max. :95135 Max. :27.00 Max. :5 Max. :217.00 Max. :19.000 Max. :2.000 Max. :0 Max. :352.00 Max. :1.254e+12 Max. :100158485 Max. : 199.0000 Max. :10.000 | |
> boxplot(userTabs$tabChangeOffset) | |
> describe(userTabs$tabChangeOffset) | |
userTabs$tabChangeOffset | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
25000 0 73 0.3636 -1 -1 -1 0 1 2 2 | |
lowest : -197 -135 -131 -91 -74, highest: 181 182 189 197 199 | |
> hist(userTabs$tabChangeOffset) | |
> rs <- dbSendQuery(con,"select t1.*, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset | |
+ from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.event_code = 2 and t2.event_code =1") | |
> userTabs <- fetch(rs,25000) | |
> describe(userTabs$delta) | |
userTabs$delta | |
n missing unique Mean .05 .10 .25 .50 .75 .90 .95 | |
418 0 392 7460 313.8 773.4 1519.2 2097.5 3647.2 8296.6 18505.9 | |
lowest : 72 75 76 88 103, highest: 122400 137377 279562 406131 410784 | |
> | |
--Frequency of Tab Open/Focus Close Successive .1% | |
------------------------------------------------------- | |
mysql> select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_window = t2.tab_window and t1.event_code = 1 and t2.event_code <>-1 | |
-> | |
-> ; | |
+--------+-------+-----------------+ | |
| N | delta | tabChangeOffset | | |
+--------+-------+-----------------+ | |
| 407742 | 3148 | -1 | | |
+--------+-------+-----------------+ | |
1 row in set (29.89 sec) | |
mysql> select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.event_code = 2 and t2.event_code =1; | |
+-----+-------+-----------------+ | |
| N | delta | tabChangeOffset | | |
+-----+-------+-----------------+ | |
| 418 | 1271 | 0 | | |
+-----+-------+-----------------+ | |
1 row in set (3.04 sec) | |
--- Tab Focus or Tab Open -> Tab Close, 17% go to close | |
mysql> select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.event_code = 2 and (t2.event_code =1 or t2.event_code = 5) ; | |
+--------+-------+-----------------+ | |
| N | delta | tabChangeOffset | | |
+--------+-------+-----------------+ | |
| 149510 | 739 | 0 | | |
+--------+-------+-----------------+ | |
1 row in set (9.75 sec) | |
mysql> select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window and t1.event_code <> -1 and (t2.event_code =1 or t2.event_code = 5) ; | |
+--------+-------+-----------------+ | |
| N | delta | tabChangeOffset | | |
+--------+-------+-----------------+ | |
| 847203 | 26 | 0 | | |
+--------+-------+-----------------+ | |
1 row in set (10.15 sec) | |
-- Investigating Windows & Tab Closing | |
---------------------------------------------------- | |
create table vanilla30user (user int, max_tabs int, max_windows int) | |
insert into vanilla30user (user, max_tabs, max_windows) select user, max(num_tabs), max(tab_window) from vanilla30 group by user | |
insert into vanilla30userclose (user, closeEvents) select count(*), user from vanilla30 where event_code =2 group by user | |
-- Specialtialty groups | |
---------------------------- | |
-- Does Treestyle promote closing ta | |
rs <- dbSendQuery(con,"select user, count(*) as N, | |
'window' as type, | |
'close' as action | |
from treestyle | |
where event_code = 10 | |
group by user | |
union | |
select user, count(*) as N, | |
'window' as type, | |
'open' as action | |
from treestyle | |
where event_code = 9 | |
group by user | |
union | |
select user, count(*) as N, | |
'tab' as type, | |
'open' as action | |
from treestyle | |
where event_code = 1 | |
group by user | |
union | |
select user, count(*) as N, | |
'tab' as type, | |
'close' as action | |
from treestyle | |
where event_code = 2 | |
group by user | |
-- Answer: no | |
-- Markov Chains | |
---------------------- | |
CREATE TABLE vanilla30seq | |
( id MEDIUMINT NOT NULL AUTO_INCREMENT,user int, event_code int, tab_position int, tab_window int, ui_method int, tab_site_hash int, num_tabs int, timestamp bigint, PRIMARY KEY (id)) | |
insert into vanilla30seq (user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp) | |
select user, event_code, tab_position, tab_window, ui_method, tab_site_hash, num_tabs, timestamp from vanilla30 order by user, tab_window, timestamp | |
mysql> create table vanilla30markovsimple ( N int, delta bigint, tabChangeOffset int, event2 int, event1 int) -> ;Query OK, 0 rows affected (0.19 sec) | |
mysql> insert into vanilla30markovsimple (N, delta, tabchangeoffset, event2, event1) select count(*) as N, t1.timestamp - t2.timestamp as delta, 0 as tabchangeoffset, t1.event_code as event2, t2.event_code as event1 from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window group by t1.event_code, t2.event_code; | |
create table treestylemarkovsimple ( N int, delta bigint, tabChangeOffset int, event2 int, event1 int) | |
insert into treestylemarkovsimple (N, delta, tabchangeoffset, event2, event1) | |
select count(*) as N, t1.timestamp - t2.timestamp as delta, 0 as tabchangeoffset, t1.event_code as event2, t2.event_code as event1 from treestyleseq t1, treestyleseq t2 | |
where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_position = t2.tab_position and t1.tab_window = t2.tab_window group by t1.event_code, t2.event_code; | |
-- Histogram of tab deltas | |
select count(*) as N, t1.timestamp - t2.timestamp as delta, t2.tab_position - t1.tab_position as tabChangeOffset from vanilla30seq t1, vanilla30seq t2 where t1.id = t2.id + 1 and t1.user = t2.user and t1.tab_window = t2.tab_window group by ( t2.tab_position - t1.tab_position) | |
-- User Segmentation | |
----------------------------- | |
create table vanilla30userday | |
select user, avg(avgTabs) as avgTabs, count(distinct session) as sessions, sum(maxWindows) as totalWindows, sum(maxTabs) as totalTabs, sum(timeActive) as totalTime | |
from vanilla30userday | |
select user, avg(avgTabs) as avgTabs, count(sessions) as sessions, sum(maxWindows)-count(sessions) as totalWindows, sum(maxTabs) as totalTabs, sum(timeActive) as totalTime, | |
sum(maxTabs) / count(sessions) as TabsPerSession, max(maxTabs) as peakTabs | |
from vanilla30userday | |
group by user | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment