Skip to content

Instantly share code, notes, and snippets.

View doryokujin's full-sized avatar

doryokujin doryokujin

View GitHub Profile
@doryokujin
doryokujin / basic_information.sql
Created June 28, 2012 01:25
Book-Crossing Dataset
-- 1.1.1 登録ユーザー数, アクティブユーザー数 --
td query -w -d book_crossing_dataset "
SELECT t1.cnt AS all_users, t2.cnt AS active_users, ROUND(t2.cnt/t1.cnt*100) AS active_rate
FROM
(
SELECT COUNT(distinct user_id) as cnt, 1 AS one
FROM users
) t1
JOIN
(
@doryokujin
doryokujin / user_status.sql
Created June 28, 2012 05:39
Book-Crossing Dataset
-- 2.1.1 ユーザーの居住国分布 --
td query -w -d book_crossing_dataset -f csv -o user_dist_of_country.csv "
SELECT country, cnt, ROUND(cnt/total_cnt*10000)/100 AS rate
FROM
(
SELECT country, COUNT(*) as cnt, 1 AS one
FROM users
WHERE country != '' AND country != 'n/a'
GROUP BY country
ORDER BY cnt DESC
@doryokujin
doryokujin / book_status.sql
Created June 28, 2012 08:07
Book-Crossing Dataset
-- 3.1.1 出版年別分布 --
td query -w -d book_crossing_dataset "
SELECT year, cnt
FROM
(
SELECT 1975 AS year, COUNT(year_of_publication) AS cnt
FROM books
WHERE year_of_publication < 1970
UNION ALL
@doryokujin
doryokujin / review_user_status.sql
Created June 28, 2012 09:28
Book-Crossing Dataset
-- 4.1.1 ユーザーの評価回数平均 --
td query -w -d book_crossing_dataset "
SELECT ROUND(AVG(cnt)) AS avg
FROM
(
SELECT user_id, COUNT(book_rating) AS cnt
FROM ratings
GROUP BY user_id
) t1
"
@doryokujin
doryokujin / analytics.sql
Created July 2, 2012 05:12
Book-Crossing Dataset
-- 5.1.1 毎回9以上の評価しかしないユーザー,毎回5以下の評価しかしないユーザー,いつも同じ評価しかしないユーザー --
-- (0<rating, 5<cnt) --
td query -w -d book_crossing_dataset "
SELECT rating_type, COUNT(*) As cnt
FROM
(
SELECT user_id, MIN(book_rating) AS stat, COUNT(book_rating) AS cnt, '∀Over 9' AS rating_type
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
@doryokujin
doryokujin / analytics2.sql
Created July 2, 2012 13:52
Book-Crossing Dataset
-- 5.4.1 一冊当たりの評価数が多いブックTOP20 --
td query -w -d book_crossing_dataset "
SELECT t1.isbn AS isbn, title, year_of_publication AS year, rating_cnt, ROUND(avg*100)/100 AS avg, ROUND(stddev*100)/100 AS stddev
FROM
(
SELECT isbn, COUNT(book_rating) AS rating_cnt, AVG(book_rating) AS avg, STDDEV_SAMP(book_rating) AS stddev
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
) t1
@doryokujin
doryokujin / simpson.sql
Created July 2, 2012 17:18
Book-Crossing Dataset
-- 6.1 共起分析:Simpson係数の高いブックペア上位20 --
td query -w -d book_crossing_dataset "
SELECT r2.title AS title1,
r2.year_of_publication AS year1,
r2.cnt AS cnt1,
r3.title AS title2,
r3.year_of_publication AS year2,
r3.cnt AS cnt2,
r1.cnt AS intersection,
r1.cnt/IF(r2.cnt<r3.cnt,r2.cnt,r3.cnt)*100 AS simpson
@doryokujin
doryokujin / tutorial_funnel.sql
Created July 12, 2012 16:43
ファンネル分析
-- 1.1 チュートリアルファンネル(All) --
td query -w -f csv -d your_app -o tutorial_all.csv "
SELECT
t2.step AS step,
cnt,
prev_cnt,
ROUND(cnt/enter_cnt*100) AS rate_from_enter,
IF(prev_cnt IS NULL, 100,ROUND(cnt/prev_cnt*100)) AS rate_from_prev,
IF(prev_cnt IS NULL, 0,ROUND((prev_cnt-cnt)/prev_cnt*100)) AS normed_rate_from_enter
FROM
@doryokujin
doryokujin / 0.basics.sql
Created July 22, 2012 09:04
退会に関する分析
-- 0.1 ログインインターバルの分布 --
-- 当日入会->辞めた人もインターバルは1となるので1回以上プレイしている人に限定 --
td query -w -d your_app -f csv -o dist_of_login_interval.csv "
SELECT ROUND((datediff(latest_login, registered_day)+1)/login_times) AS login_interval, COUNT(*) AS cnt
FROM
(
SELECT v['uid'] AS uid, from_unixtime(MAX(time),'yyyy-MM-dd' ) AS latest_login
FROM login
GROUP BY v['uid']
) t1
SELECT
id, t1.name, model, body_name, color, engine, grade, inspection, recycle,
year,
country_name,
price AS used_price,
large_area_name, shop_pref_name, shop_lat, shop_lng, shop_name,t1.shop_pref_code,
maintenance, maintenance_fee,
odd, odd_numerics, odd_unit,