Skip to content

Instantly share code, notes, and snippets.

@doryokujin
Created July 2, 2012 13:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save doryokujin/3033351 to your computer and use it in GitHub Desktop.
Save doryokujin/3033351 to your computer and use it in GitHub Desktop.
Book-Crossing Dataset
-- 5.4.1 一冊当たりの評価数が多いブックTOP20 --
td query -w -d book_crossing_dataset "
SELECT t1.isbn AS isbn, title, year_of_publication AS year, rating_cnt, ROUND(avg*100)/100 AS avg, ROUND(stddev*100)/100 AS stddev
FROM
(
SELECT isbn, COUNT(book_rating) AS rating_cnt, AVG(book_rating) AS avg, STDDEV_SAMP(book_rating) AS stddev
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
) t1
JOIN
(
SELECT isbn,
book_title AS title,
year_of_publication
FROM books
) t2
ON
(t1.isbn=t2.isbn)
ORDER BY rating_cnt DESC
LIMIT 20
"
-- 5.4.2 評価数 TOP5 の評価分布 WHERE 0<Rating --
td query -w -d book_crossing_dataset -f csv -o book_dist_of_top5_review_cnt.csv "
SELECT title, book_rating, cnt, total_cnt, ROUND(cnt/total_cnt*1000)/10 AS rate
FROM
(
SELECT t1.isbn AS isbn, book_rating, COUNT(*) AS cnt, total_cnt
FROM ratings t1
JOIN
(
SELECT isbn, COUNT(book_rating) AS total_cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
ORDER BY total_cnt DESC
LIMIT 5
) t2
ON
(t1.isbn=t2.isbn)
WHERE 0 < book_rating
GROUP BY t1.isbn, book_rating, total_cnt
) o1
JOIN
(
SELECT isbn,
book_title AS title
FROM books
) o2
ON
(o1.isbn=o2.isbn)
ORDER BY total_cnt DESC, book_rating
"
-- 5.5.1 平均が高かったブックベスト20 (50<=cnt) --
td query -w -d book_crossing_dataset "
SELECT t1.isbn AS isbn, title, year_of_publication AS year, ROUND(avg*100)/100 AS avg, cnt, ROUND(stddev*100)/100 AS stddev
FROM
(
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt, STDDEV_SAMP(book_rating) AS stddev
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
HAVING 50 <= COUNT(book_rating)
) t1
JOIN
(
SELECT isbn,
year_of_publication,
book_title AS title
FROM books
) t2
ON
(t1.isbn=t2.isbn)
ORDER BY avg DESC
LIMIT 20
"
-- 5.5.2 評価平均値 TOP5 の評価分布 WHERE 0<Rating --
td query -w -d book_crossing_dataset -f csv -o book_dist_of_top5_review_avg.csv "
SELECT title, book_rating, avg, cnt, total_cnt, ROUND(cnt/total_cnt*1000)/10 AS rate
FROM
(
SELECT t1.isbn AS isbn, book_rating, ROUND(avg/100)*100 AS avg, COUNT(*) AS cnt, total_cnt
FROM ratings t1
JOIN
(
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS total_cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
HAVING 50 <= COUNT(book_rating)
ORDER BY avg DESC
LIMIT 5
) t2
ON
(t1.isbn=t2.isbn)
WHERE 0 < book_rating
GROUP BY t1.isbn, book_rating, total_cnt, avg
) o1
JOIN
(
SELECT isbn,
book_title AS title
FROM books
) o2
ON
(o1.isbn=o2.isbn)
ORDER BY avg DESC, book_rating
"
-- 5.6.1 年代別:平均が高かったブックTOP1 (50<=cnt) --
td query -w -d book_crossing_dataset "
SELECT o1.year_of_publication AS year, title, ROUND(max_this_year*100)/100 AS avg, cnt, ROUND(stddev*100)/100 AS stddev
FROM
(
SELECT year_of_publication, MAX(avg) as max_this_year
FROM
(
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
HAVING 50 <= COUNT(book_rating)
) t1
JOIN
(
SELECT isbn,
year_of_publication
FROM books
) t2
ON
(t1.isbn=t2.isbn)
WHERE 1994 <= year_of_publication
AND year_of_publication <= 2004
GROUP BY year_of_publication
) o1
JOIN
(
SELECT year_of_publication, title, avg, cnt, stddev
FROM
(
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt, STDDEV_SAMP(book_rating) AS stddev
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
HAVING 50 <= COUNT(book_rating)
) t1
JOIN
(
SELECT isbn,
year_of_publication,
book_title AS title
FROM books
) t2
ON
(t1.isbn=t2.isbn)
WHERE 1994 <= year_of_publication
AND year_of_publication <= 2004
) o2
ON
(o1.max_this_year=o2.avg AND o1.year_of_publication=o2.year_of_publication)
ORDER BY year DESC
"
-- 5.6.2 年代別:平均が高かったブックTOP1の分布 (50<=cnt) --
td query -w -d book_crossing_dataset -f csv -o book_dist_of_top1_review_avg_by_year.csv "
SELECT o1.year_of_publication AS year, title, ROUND(max_this_year*100)/100 AS avg, booK_rating, rating_cnt, cnt, ROUND(rating_cnt/cnt*1000)/10 AS rate
FROM
(
SELECT year_of_publication, MAX(avg) as max_this_year
FROM
(
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
HAVING 50 <= COUNT(book_rating)
) t1
JOIN
(
SELECT isbn,
year_of_publication
FROM books
) t2
ON
(t1.isbn=t2.isbn)
WHERE 1994 <= year_of_publication
AND year_of_publication <= 2004
GROUP BY year_of_publication
) o1
JOIN
(
SELECT year_of_publication, t1.isbn AS isbn, title, avg, cnt, stddev
FROM
(
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt, STDDEV_SAMP(book_rating) AS stddev
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
HAVING 50 <= COUNT(book_rating)
) t1
JOIN
(
SELECT isbn,
year_of_publication,
v['book_title'] AS title
FROM books
) t2
ON
(t1.isbn=t2.isbn)
WHERE 1994 <= year_of_publication
AND year_of_publication <= 2004
) o2
ON
(o1.max_this_year=o2.avg AND o1.year_of_publication=o2.year_of_publication)
JOIN
(
SELECT isbn, book_rating, COUNT(*) AS rating_cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn, book_rating
) o3
ON
(o2.isbn=o3.isbn)
ORDER BY year DESC, book_rating
"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment