Skip to content

Instantly share code, notes, and snippets.

@doryokujin
Created June 28, 2012 09:28
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save doryokujin/3010184 to your computer and use it in GitHub Desktop.
Save doryokujin/3010184 to your computer and use it in GitHub Desktop.
Book-Crossing Dataset
-- 4.1.1 ユーザーの評価回数平均 --
td query -w -d book_crossing_dataset "
SELECT ROUND(AVG(cnt)) AS avg
FROM
(
SELECT user_id, COUNT(book_rating) AS cnt
FROM ratings
GROUP BY user_id
) t1
"
-- 4.1.2 ユーザーの評価回数分布 --
td query -w -d book_crossing_dataset -f csv -o review_user_dist_of_review_count.csv "
SELECT cnt, user_cnt, ROUND(user_cnt/total_user_cnt*1000)/10 AS rate
FROM
(
SELECT cnt, COUNT(*) AS user_cnt, 1 AS one
FROM
(
SELECT user_id, COUNT(book_rating) AS cnt
FROM ratings
GROUP BY user_id
) t1
GROUP BY cnt
ORDER BY cnt
LIMIT 50
) o1
JOIN
(
SELECT COUNT(distinct user_id) AS total_user_cnt, 1 AS one
FROM ratings
) o2
ON
(o1.one=o2.one)
"
-- 4.2 ユーザーの評価平均分布 (0<book_rating, 5<=count(book_rating)) --
td query -w -d book_crossing_dataset -f csv -o review_user_dist_of_review_avg.csv "
SELECT avg, COUNT(*)
FROM
(
SELECT user_id, ROUND(AVG(book_rating)/0.5)*0.5 AS avg, COUNT(book_rating) AS cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 <= COUNT(book_rating)
) t1
GROUP BY avg
ORDER BY avg
"
-- 4.3 ユーザーの年代別×評価平均分布 (0<book_rating, 5<=count(book_rating)) --
td query -w -d book_crossing_dataset -f csv -o review_user_dist_of_review_avg_by_generation.csv "
SELECT o1.generation, avg, ROUND(cnt/total_cnt*1000)/10 AS rate
FROM
(
SELECT generation, avg, COUNT(*) AS cnt
FROM
(
SELECT user_id, ROUND(AVG(book_rating)/0.5)*0.5 AS avg, COUNT(book_rating) AS cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 <= COUNT(book_rating)
) t1
JOIN
(
SELECT user_id, CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation
FROM users
WHERE 10 < IF(age='NULL',0,CAST(age AS INT))
AND IF(age='NULL',0,CAST(age AS INT)) <= 65
) t2
ON
(t1.user_id=t2.user_id)
GROUP BY generation,avg
) o1
JOIN
(
SELECT generation, COUNT(*) AS total_cnt
FROM
(
SELECT user_id, COUNT(book_rating)
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 <= COUNT(book_rating)
) t1
JOIN
(
SELECT user_id, CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation
FROM users
WHERE 10 < IF(age='NULL',0,CAST(age AS INT))
AND IF(age='NULL',0,CAST(age AS INT)) <= 65
) t2
ON
(t1.user_id=t2.user_id)
GROUP BY generation
) o2
ON
(o1.generation=o2.generation)
ORDER BY generation, avg
"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment