Skip to content

Instantly share code, notes, and snippets.

@doryokujin
Created July 2, 2012 05:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save doryokujin/3031217 to your computer and use it in GitHub Desktop.
Save doryokujin/3031217 to your computer and use it in GitHub Desktop.
Book-Crossing Dataset
-- 5.1.1 毎回9以上の評価しかしないユーザー,毎回5以下の評価しかしないユーザー,いつも同じ評価しかしないユーザー --
-- (0<rating, 5<cnt) --
td query -w -d book_crossing_dataset "
SELECT rating_type, COUNT(*) As cnt
FROM
(
SELECT user_id, MIN(book_rating) AS stat, COUNT(book_rating) AS cnt, '∀Over 9' AS rating_type
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND 9 <= MIN(book_rating)
UNION ALL
SELECT user_id, MAX(book_rating) AS stat, COUNT(book_rating) AS cnt, '∀Under 5' AS rating_type
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND MAX(book_rating) <= 5
UNION ALL
SELECT user_id, CAST(VARIANCE(book_rating) AS INT) AS stat, COUNT(book_rating) AS cnt, '∀Const' AS rating_type
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND VARIANCE(book_rating) = 0
) t1
GROUP BY rating_type
ORDER BY rating_type
"
-- 5.1.2 毎回8以上の評価しかしないユーザー,毎回6以下の評価しかしないユーザー --
-- (0<rating, 5<cnt) --
td query -w -d book_crossing_dataset "
SELECT rating_type, COUNT(*) As cnt
FROM
(
SELECT user_id, MIN(book_rating) AS stat, COUNT(book_rating) AS cnt, '∀Over 8' AS rating_type
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND 8 <= MIN(book_rating)
UNION ALL
SELECT user_id, MAX(book_rating) AS stat, COUNT(book_rating) AS cnt, '∀Under 6' AS rating_type
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND MAX(book_rating) <= 6
) t1
GROUP BY rating_type
ORDER BY rating_type
"
-- 5.2.1 いつも同じ評価しかしないユーザーの評価分布 (0<rating, 5<cnt) --
td query -w -d book_crossing_dataset "
SELECT book_rating, COUNT(*) AS cnt
FROM
(
SELECT user_id, CAST(VARIANCE(book_rating) AS INT) AS var, COUNT(book_rating) AS cnt, AVG(book_rating) AS book_rating
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND VARIANCE(book_rating) = 0
) t1
GROUP BY book_rating
ORDER BY book_rating
"
-- 5.2.2 いつも同じ評価しかしないユーザーの年代分布 0<rating, 5<cnt --
td query -w -d book_crossing_dataset "
SELECT o1.generation, cnt, total_cnt, ROUND(cnt/total_cnt*100*100)/100 AS rate
FROM
(
SELECT generation, COUNT(*) AS cnt
FROM
(
SELECT user_id, CAST(VARIANCE(book_rating) AS INT) AS var, COUNT(book_rating) AS cnt, AVG(book_rating) AS book_rating
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND VARIANCE(book_rating) = 0
) t1
JOIN
(
SELECT user_id, CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation
FROM users
WHERE 10 < IF(age='NULL',0,CAST(age AS INT))
AND IF(age='NULL',0,CAST(age AS INT)) <= 65
) t2
ON
(t1.user_id=t2.user_id)
GROUP BY generation
) o1
JOIN
(
SELECT CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation, COUNT(*) as total_cnt
FROM users
WHERE 10 <IF(age='NULL',0,CAST(age AS INT)) <= 65
AND IF(age='NULL',0,CAST(age AS INT)) <= 65
GROUP BY CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5
) o2
ON
(o1.generation=o2.generation)
ORDER BY generation
"
-- 5.3.1 毎回8以上の評価しかしないユーザーの影響を強く受けた本TOP20 0<rating, 5<cnt --
td query -w -d book_crossing_dataset -f csv -o ranking_of_books_influenced_by_overestimated_users.csv "
SELECT total.isbn,
title,
total.cnt AS total_cnt,
outlier.cnt AS outlier_cnt,
total.s AS total_s,
outlier.s AS outlier_s,
total.s-outlier.s AS diff_s,
total.cnt-outlier.cnt AS diff_cnt,
total.avg AS total_avg,
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS eliminated_avg,
total.avg - IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS diff_avg
FROM
(
SELECT t1.isbn AS isbn, title, cnt, s, avg
FROM
(
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt, SUM(book_rating) AS s
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
HAVING 10 <= COUNT(book_rating)
) t1
JOIN
(
SELECT isbn, v['book_title'] AS title
FROM books
) t2
ON
(t1.isbn=t2.isbn)
) total
JOIN
(
SELECT t2.isbn AS isbn, COUNT(book_rating) AS cnt, SUM(book_rating) AS s
FROM
(
SELECT user_id, MIN(book_rating) AS mn, COUNT(book_rating) AS cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND 8 <= MIN(book_rating)
) t1
JOIN
(
SELECT isbn, user_id, book_rating
FROM ratings
WHERE 0 < book_rating
) t2
ON
(t1.user_id=t2.user_id)
GROUP BY t2.isbn
) outlier
ON
(total.isbn=outlier.isbn)
ORDER BY diff_avg DESC
LIMIT 20
"
-- 5.3.2 毎回6以下の評価しかしないユーザーの影響を強く受けた本TOP20 0<rating, 5<cnt --
td query -w -d book_crossing_dataset -f csv -o ranking_of_books_influenced_by_underestimated_users.csv "
SELECT total.isbn,
title,
total.cnt AS total_cnt,
outlier.cnt AS outlier_cnt,
total.s AS total_s,
outlier.s AS outlier_s,
total.s-outlier.s AS diff_s,
total.cnt-outlier.cnt AS diff_cnt,
total.avg AS total_avg,
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS eliminated_avg,
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) - total.avg AS diff_avg
FROM
(
SELECT t1.isbn AS isbn, title, cnt, s, avg
FROM
(
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt, SUM(book_rating) AS s
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
HAVING 10 <= COUNT(book_rating)
) t1
JOIN
(
SELECT isbn, v['book_title'] AS title
FROM books
) t2
ON
(t1.isbn=t2.isbn)
) total
JOIN
(
SELECT t2.isbn AS isbn, COUNT(book_rating) AS cnt, SUM(book_rating) AS s
FROM
(
SELECT user_id, MAX(book_rating) AS mn, COUNT(book_rating) AS cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND MAX(book_rating) <= 6
) t1
JOIN
(
SELECT isbn, user_id, book_rating
FROM ratings
WHERE 0 < book_rating
) t2
ON
(t1.user_id=t2.user_id)
GROUP BY t2.isbn
) outlier
ON
(total.isbn=outlier.isbn)
ORDER BY diff_avg DESC
LIMIT 20
"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment