Book-Crossing Dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- 5.1.1 毎回9以上の評価しかしないユーザー,毎回5以下の評価しかしないユーザー,いつも同じ評価しかしないユーザー -- | |
-- (0<rating, 5<cnt) -- | |
td query -w -d book_crossing_dataset " | |
SELECT rating_type, COUNT(*) As cnt | |
FROM | |
( | |
SELECT user_id, MIN(book_rating) AS stat, COUNT(book_rating) AS cnt, '∀Over 9' AS rating_type | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND 9 <= MIN(book_rating) | |
UNION ALL | |
SELECT user_id, MAX(book_rating) AS stat, COUNT(book_rating) AS cnt, '∀Under 5' AS rating_type | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND MAX(book_rating) <= 5 | |
UNION ALL | |
SELECT user_id, CAST(VARIANCE(book_rating) AS INT) AS stat, COUNT(book_rating) AS cnt, '∀Const' AS rating_type | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND VARIANCE(book_rating) = 0 | |
) t1 | |
GROUP BY rating_type | |
ORDER BY rating_type | |
" | |
-- 5.1.2 毎回8以上の評価しかしないユーザー,毎回6以下の評価しかしないユーザー -- | |
-- (0<rating, 5<cnt) -- | |
td query -w -d book_crossing_dataset " | |
SELECT rating_type, COUNT(*) As cnt | |
FROM | |
( | |
SELECT user_id, MIN(book_rating) AS stat, COUNT(book_rating) AS cnt, '∀Over 8' AS rating_type | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND 8 <= MIN(book_rating) | |
UNION ALL | |
SELECT user_id, MAX(book_rating) AS stat, COUNT(book_rating) AS cnt, '∀Under 6' AS rating_type | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND MAX(book_rating) <= 6 | |
) t1 | |
GROUP BY rating_type | |
ORDER BY rating_type | |
" | |
-- 5.2.1 いつも同じ評価しかしないユーザーの評価分布 (0<rating, 5<cnt) -- | |
td query -w -d book_crossing_dataset " | |
SELECT book_rating, COUNT(*) AS cnt | |
FROM | |
( | |
SELECT user_id, CAST(VARIANCE(book_rating) AS INT) AS var, COUNT(book_rating) AS cnt, AVG(book_rating) AS book_rating | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND VARIANCE(book_rating) = 0 | |
) t1 | |
GROUP BY book_rating | |
ORDER BY book_rating | |
" | |
-- 5.2.2 いつも同じ評価しかしないユーザーの年代分布 0<rating, 5<cnt -- | |
td query -w -d book_crossing_dataset " | |
SELECT o1.generation, cnt, total_cnt, ROUND(cnt/total_cnt*100*100)/100 AS rate | |
FROM | |
( | |
SELECT generation, COUNT(*) AS cnt | |
FROM | |
( | |
SELECT user_id, CAST(VARIANCE(book_rating) AS INT) AS var, COUNT(book_rating) AS cnt, AVG(book_rating) AS book_rating | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND VARIANCE(book_rating) = 0 | |
) t1 | |
JOIN | |
( | |
SELECT user_id, CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation | |
FROM users | |
WHERE 10 < IF(age='NULL',0,CAST(age AS INT)) | |
AND IF(age='NULL',0,CAST(age AS INT)) <= 65 | |
) t2 | |
ON | |
(t1.user_id=t2.user_id) | |
GROUP BY generation | |
) o1 | |
JOIN | |
( | |
SELECT CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation, COUNT(*) as total_cnt | |
FROM users | |
WHERE 10 <IF(age='NULL',0,CAST(age AS INT)) <= 65 | |
AND IF(age='NULL',0,CAST(age AS INT)) <= 65 | |
GROUP BY CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 | |
) o2 | |
ON | |
(o1.generation=o2.generation) | |
ORDER BY generation | |
" | |
-- 5.3.1 毎回8以上の評価しかしないユーザーの影響を強く受けた本TOP20 0<rating, 5<cnt -- | |
td query -w -d book_crossing_dataset -f csv -o ranking_of_books_influenced_by_overestimated_users.csv " | |
SELECT total.isbn, | |
title, | |
total.cnt AS total_cnt, | |
outlier.cnt AS outlier_cnt, | |
total.s AS total_s, | |
outlier.s AS outlier_s, | |
total.s-outlier.s AS diff_s, | |
total.cnt-outlier.cnt AS diff_cnt, | |
total.avg AS total_avg, | |
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS eliminated_avg, | |
total.avg - IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS diff_avg | |
FROM | |
( | |
SELECT t1.isbn AS isbn, title, cnt, s, avg | |
FROM | |
( | |
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt, SUM(book_rating) AS s | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY isbn | |
HAVING 10 <= COUNT(book_rating) | |
) t1 | |
JOIN | |
( | |
SELECT isbn, v['book_title'] AS title | |
FROM books | |
) t2 | |
ON | |
(t1.isbn=t2.isbn) | |
) total | |
JOIN | |
( | |
SELECT t2.isbn AS isbn, COUNT(book_rating) AS cnt, SUM(book_rating) AS s | |
FROM | |
( | |
SELECT user_id, MIN(book_rating) AS mn, COUNT(book_rating) AS cnt | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND 8 <= MIN(book_rating) | |
) t1 | |
JOIN | |
( | |
SELECT isbn, user_id, book_rating | |
FROM ratings | |
WHERE 0 < book_rating | |
) t2 | |
ON | |
(t1.user_id=t2.user_id) | |
GROUP BY t2.isbn | |
) outlier | |
ON | |
(total.isbn=outlier.isbn) | |
ORDER BY diff_avg DESC | |
LIMIT 20 | |
" | |
-- 5.3.2 毎回6以下の評価しかしないユーザーの影響を強く受けた本TOP20 0<rating, 5<cnt -- | |
td query -w -d book_crossing_dataset -f csv -o ranking_of_books_influenced_by_underestimated_users.csv " | |
SELECT total.isbn, | |
title, | |
total.cnt AS total_cnt, | |
outlier.cnt AS outlier_cnt, | |
total.s AS total_s, | |
outlier.s AS outlier_s, | |
total.s-outlier.s AS diff_s, | |
total.cnt-outlier.cnt AS diff_cnt, | |
total.avg AS total_avg, | |
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS eliminated_avg, | |
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) - total.avg AS diff_avg | |
FROM | |
( | |
SELECT t1.isbn AS isbn, title, cnt, s, avg | |
FROM | |
( | |
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt, SUM(book_rating) AS s | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY isbn | |
HAVING 10 <= COUNT(book_rating) | |
) t1 | |
JOIN | |
( | |
SELECT isbn, v['book_title'] AS title | |
FROM books | |
) t2 | |
ON | |
(t1.isbn=t2.isbn) | |
) total | |
JOIN | |
( | |
SELECT t2.isbn AS isbn, COUNT(book_rating) AS cnt, SUM(book_rating) AS s | |
FROM | |
( | |
SELECT user_id, MAX(book_rating) AS mn, COUNT(book_rating) AS cnt | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND MAX(book_rating) <= 6 | |
) t1 | |
JOIN | |
( | |
SELECT isbn, user_id, book_rating | |
FROM ratings | |
WHERE 0 < book_rating | |
) t2 | |
ON | |
(t1.user_id=t2.user_id) | |
GROUP BY t2.isbn | |
) outlier | |
ON | |
(total.isbn=outlier.isbn) | |
ORDER BY diff_avg DESC | |
LIMIT 20 | |
" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment