Last active
April 22, 2016 08:24
-
-
Save cavedave/48cb7d25330874af6b2b21c97ae1fa9d to your computer and use it in GitHub Desktop.
Code for comparing the ratings of books with their cover ratings. Blogpost on the analysis http://liveatthewitchtrials.blogspot.ie/2016/04/can-you-judge-book-by-its-cover.html Original dataset https://www.reddit.com/r/datasets/comments/3jiu4m/dataset_judgey_700k_ratings_for_100_book_covers/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compare Goodreads book ratings with crowd-sourced cover ratings.
#
# Reads "output.csv", prints pairwise correlations between the rating and
# popularity columns, draws two base-graphics scatter plots, and saves a
# ggplot2 scatter plot of book rating vs. cover rating to "Cover.png".
library(ggplot2)

# Columns in output.csv:
# book_id title rating num_ratings num_reviews book_id x.x x.y cover_rating
mydata <- read.csv("output.csv", encoding = "UTF-8", header = TRUE)

# Columns are accessed through with() instead of attach(), so nothing is
# left on the search path after the script finishes.

# Number of ratings vs. number of reviews. The x variable is num_ratings so
# that the plot matches its title and the correlation printed right below.
with(mydata, plot(num_ratings, num_reviews,
                  main = "Number of Ratings Number Reviews"))
with(mydata, cor(num_ratings, num_reviews))

# Headline figure: book rating vs. cover rating.
with(mydata, cor(rating, cover_rating))

# Remaining pairwise correlations.
with(mydata, cor(rating, num_ratings))
with(mydata, cor(rating, num_reviews))
with(mydata, cor(num_ratings, cover_rating))
with(mydata, cor(num_reviews, cover_rating))

# Values recorded from an earlier interactive session (kept for reference):
#   cor(num_ratings, num_reviews)   0.9597442
#   cor(rating, cover_rating)       0.1609114
#   cor(book_id, book_id2)          1
#   cor(rating, num_ratings)        0.2141307
#   cor(rating, num_reviews)        0.2658916
#   cor(num_ratings, cover_rating)  0.3059627
#   cor(num_reviews, cover_rating)  0.3307553

with(mydata, plot(rating, cover_rating, main = "Judging a book by its cover"))

# ggplot2 version of the same scatter plot. x is the book rating and y is the
# cover rating, so the axis labels follow that order.
p1 <- ggplot(mydata, aes(x = rating, y = cover_rating)) +
  geom_point() +
  labs(x = "Avg Book Rating", y = "Avg Cover Rating") +
  ggtitle("Can you Judge a Book by its Cover?") +
  theme(plot.title = element_text(lineheight = .8, face = "bold")) +
  # The 0.16 in the label is the rating/cover_rating correlation noted above.
  annotate("text", x = 3, y = 1.5,
           label = "Correlation book and cover ratings is 0.16")
print(p1)

# Name the plot explicitly rather than relying on last_plot().
ggsave("Cover.png", plot = p1, width = 10, height = 10, dpi = 300)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'nokogiri'

# Scrape title, average rating and rating/review counts for a fixed list of
# Goodreads book ids. One comma-separated line is printed to stdout per book.

# Goodreads book ids to fetch.
book_ids = [1301,2165,2657,5246,7672,10210,14497,18342,19501,22034,22628,22635,33600,41865,43035,80467,105578,201114,297673,405784,439965,468328,469410,471381,505576,581332,825139,898885,1576816,1850833,1868343,1971304,1988626,2767052,3189111,3565645,4312691,4922079,5632446,6263078,6442769,6609765,6691280,6837103,6883046,7031817,7381740,8135213,8331609,8491980,8694331,8697395,9627849,9661374,9778945,10308500,10644930,10866398,12032357,12291438,12875355,13449852,13579626,13589182,15793049,15797397,15994634,16131077,16158542,16278318,17255444,17333230,17737039,18050053,18077769,18225810,18296030,18302455,18310201,18652002,18667945,19542841,20263206,20575425,20706269,20821614,20892558,20897517,20980987,21416678,21944886,22232035,22465597,22674531,22822858,23014725,23214378,23492682,23747672,24490083]

book_ids.each do |book_id|
  # missing 22635,22034,22628
  print "#{book_id}, "

  # URI.open is used because Kernel#open no longer accepts URLs through
  # open-uri on Ruby 3.0 and later.
  doc = Nokogiri::HTML(URI.open("https://www.goodreads.com/book/show/#{book_id}"))

  # Every <h1> on the page is printed, whitespace-trimmed.
  doc.xpath('//h1').each do |heading|
    print "#{heading.text.strip}, "
  end

  # NOTE(review): '.itemprop' selects elements whose *class* is "itemprop";
  # in the sample markup below itemprop is an attribute, so this selector
  # presumably matches nothing. Kept as-is to leave the output unchanged.
  #<span class="average" itemprop="ratingValue">4.23</span>
  doc.css('.itemprop').each do |element|
    print "#{element.text}, "
  end

  # Average rating.
  #<span class="average" itemprop="ratingValue">4.23</span>
  doc.css('.average').each do |rating|
    print "#{rating.text}, "
  end

  # Rating / review counts.
  #<span class="value-title" title="62333" itemprop="ratingCount">62,333 Ratings</span>
  doc.css('.value-title').each do |count|
    print "#{count.text}, "
  end

  puts ""
  # Pause between requests.
  sleep(5)
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
book_id | title | rating | num_ratings | num_reviews | book_idCover | num_rating | total_rating | cover_rating | |
---|---|---|---|---|---|---|---|---|---|
1301 | Moneyball: The Art of Winning an Unfair Game | 4.23 | 62333 | 3613 | 1301 | 5248 | 16110.3 | 3.069798018 | |
2165 | The Old Man and the Sea | 3.69 | 459620 | 13320 | 2165 | 1569 | 5643 | 3.596558317 | |
2657 | To Kill a Mockingbird | 4.24 | 2807869 | 62159 | 2657 | 1564 | 6320.1 | 4.040984655 | |
5246 | Ethan Frome | 3.32 | 67479 | 3644 | 5246 | 2289 | 7615.3 | 3.326911315 | |
7672 | Congo | 3.52 | 120930 | 1399 | 7672 | 1998 | 5984.7 | 2.995345345 | |
10210 | Jane Eyre | 4.08 | 1135632 | 26931 | 10210 | 1977 | 6983.2 | 3.532220536 | |
14497 | Neverwhere | 4.16 | 246191 | 12500 | 14497 | 4363 | 15036.5 | 3.446367179 | |
18342 | It | 4.14 | 381656 | 8254 | 18342 | 2021 | 6742.1 | 3.336021771 | |
19501 | Eat Pray Love | 3.48 | 1072053 | 46874 | 19501 | 2764 | 9088.3 | 3.288096961 | |
22034 | The Godfather (Mario Puzo's Mafia) | 4.35 | 241113 | 4787 | 22034 | 2791 | 10563.9 | 3.78498746 | |
22628 | The Perks of Being a Wallflower | 4.2 | 813120 | 41549 | 22628 | 1641 | 5353.4 | 3.262279098 | |
22635 | Darwin on Trial | 3.89 | 1199 | 73 | 22635 | 47444 | 133162.5 | 2.80673004 | |
33600 | Shantaram | 4.25 | 90669 | 8927 | 33600 | 2425 | 7815.7 | 3.222969072 | |
41865 | Twilight | 3.56 | 3385599 | 90011 | 41865 | 1874 | 5351.1 | 2.855442903 | |
43035 | White Fang | 3.91 | 96681 | 2473 | 43035 | 2064 | 7071.2 | 3.425968992 | |
80467 | Him Her Him Again the End of Him | 2.85 | 1541 | 378 | 80467 | 2833 | 8920.9 | 3.148923403 | |
105578 | One Night at the Call Center | 2.45 | 38228 | 1090 | 105578 | 5103 | 14454.1 | 2.832471095 | |
201114 | Sold | 4.16 | 32631 | 4071 | 201114 | 2075 | 6974 | 3.360963855 | |
297673 | The Brief Wondrous Life of Oscar Wao | 3.87 | 154969 | 14842 | 297673 | 1942 | 6419.6 | 3.305664264 | |
405784 | Fangland | 2.62 | 871 | 176 | 405784 | 5013 | 13938.9 | 2.780550569 | |
439965 | The Twins of Tribeca | 2.89 | 457 | 51 | 439965 | 2629 | 6735.8 | 2.562114873 | |
468328 | Notting Hell | 2.83 | 693 | 112 | 468328 | 50292 | 119860.6 | 2.383293566 | |
469410 | Urien’s Voyage | 3.4 | 75 | 9 | 469410 | 1730 | 5492.6 | 3.174913295 | |
471381 | You Have to Kiss a Lot of Frogs | 2.84 | 974 | 86 | 471381 | 4829 | 12179.6 | 2.522178505 | |
505576 | Top Girls | 3.55 | 3784 | 127 | 505576 | 2537 | 6785 | 2.674418605 | |
581332 | The Third Translation | 2.53 | 367 | 60 | 581332 | 3000 | 9122.1 | 3.0407 | |
825139 | Wedding Season | 2.82 | 661 | 94 | 825139 | 1943 | 4908 | 2.525990736 | |
898885 | Loving Frank | 3.71 | 69363 | 8400 | 898885 | 2979 | 9040 | 3.034575361 | |
1576816 | American Nerd: The Story of My People | 2.98 | 1313 | 316 | 1576816 | 2290 | 7465.2 | 3.259912664 | |
1850833 | Beautiful Children | 2.91 | 1681 | 408 | 1850833 | 1553 | 4762.7 | 3.066773986 | |
1868343 | The Spa | 2.81 | 381 | 73 | 1868343 | 2555 | 6632 | 2.595694716 | |
1971304 | City of Thieves | 4.27 | 63789 | 7230 | 1971304 | 2455 | 8336 | 3.395519348 | |
1988626 | The Pig Did It | 2.87 | 1183 | 316 | 1988626 | 2758 | 7516.2 | 2.725235678 | |
2767052 | The Hunger Games | 4.36 | 4147206 | 147596 | 2767052 | 1779 | 6376.5 | 3.584317032 | |
3189111 | Handcuffs | 3.46 | 418 | 87 | 3189111 | 3149 | 7366.2 | 2.3392188 | |
3565645 | Collections of Nothing | 2.94 | 183 | 61 | 3565645 | 4699 | 14890 | 3.16875931 | |
4312691 | Enclave | 2.75 | 186 | 42 | 4312691 | 4094 | 12331 | 3.011968735 | |
4922079 | One Second After | 3.92 | 25289 | 3767 | 4922079 | 2257 | 6513.6 | 2.885954807 | |
5632446 | Columbine | 4.25 | 40368 | 5253 | 5632446 | 1580 | 5465.6 | 3.459240506 | |
6263078 | Miles to Go | 3.5 | 7892 | 845 | 6263078 | 2028 | 3666.8 | 1.808086785 | |
6442769 | Paper Towns | 3.93 | 569295 | 38093 | 6442769 | 3489 | 12326.4 | 3.532932072 | |
6609765 | Out of My Mind | 4.36 | 53538 | 7888 | 6609765 | 3055 | 9629.9 | 3.152176759 | |
6691280 | Mr. Darcy Vampyre | 2.89 | 4287 | 799 | 6691280 | 1508 | 4240.7 | 2.812135279 | |
6837103 | The Kitchen House | 4.16 | 144493 | 13490 | 6837103 | 2941 | 8892.1 | 3.02349541 | |
6883046 | Shadow Tag | 3.35 | 5118 | 1218 | 6883046 | 2056 | 6535.8 | 3.178891051 | |
7031817 | Fat Vampire: A Never Coming of Age Story | 2.86 | 2272 | 591 | 7031817 | 48576 | 115897.5 | 2.385900445 | |
7381740 | The Rehearsal | 3.4 | 2934 | 491 | 7381740 | 1792 | 5485.2 | 3.0609375 | |
8135213 | Justin Bieber: His World | 4.44 | 778 | 36 | 8135213 | 1795 | 2689.4 | 1.498272981 | |
8331609 | Alison Wonderland | 2.43 | 1313 | 282 | 8331609 | 51249 | 125816 | 2.454994244 | |
8491980 | People Who Eat Darkness: The Fate of Lucie Blackman | 3.73 | 7784 | 1037 | 8491980 | 1762 | 4813.5 | 2.73183882 | |
8694331 | Waiting to Live | 4.45 | 56 | 27 | 8694331 | 4742 | 11296.5 | 2.382222691 | |
8697395 | Cleaving: A Story of Marriage Meat and Obsession | 2.4 | 3695 | 964 | 8697395 | 42372 | 129682.9 | 3.0605801 | |
9627849 | Games to Play After Dark | 2.69 | 257 | 70 | 9627849 | 3232 | 9542.3 | 2.952444307 | |
9661374 | Lunch-Box Dream | 2.69 | 249 | 72 | 9661374 | 1690 | 5315.8 | 3.145443787 | |
9778945 | The Shallows: What the Internet Is Doing to Our Brains | 3.83 | 12075 | 1764 | 9778945 | 2812 | 8354.2 | 2.970910384 | |
10308500 | The Tennis Party | 2.82 | 3108 | 363 | 10308500 | 1412 | 3624.9 | 2.567209632 | |
10644930 | 11/22/63 | 4.27 | 237246 | 26385 | 10644930 | 4551 | 15537.8 | 3.414150736 | |
10866398 | A Study in Sherlock: Stories Inspired by the Holmes Canon | 3.75 | 2726 | 346 | 10866398 | 1464 | 4936.4 | 3.371857923 | |
12032357 | Domingo's Angel | 4.62 | 68 | 36 | 12032357 | 52315 | 93514.4 | 1.787525566 | |
12291438 | The Madman’s Daughter | 3.7 | 16486 | 2904 | 12291438 | 5346 | 15793.3 | 2.95422746 | |
12875355 | Death Comes to Pemberley | 3.16 | 64819 | 8138 | 12875355 | 4772 | 15058.7 | 3.155637049 | |
13449852 | 17 | 3.54 | 212 | 10 | 13449852 | 2729 | 7822.6 | 2.866471235 | |
13579626 | Rooms | 3.3 | 8771 | 1439 | 13579626 | 2716 | 9408.3 | 3.464027982 | |
13589182 | Mastery | 4.24 | 9567 | 655 | 13589182 | 2313 | 6712.8 | 2.902204929 | |
15793049 | Farside | 3.11 | 942 | 179 | 15793049 | 3268 | 10571.4 | 3.234822521 | |
15797397 | Salt Sugar Fat: How the Food Giants Hooked Us | 3.96 | 17657 | 2307 | 15797397 | 5141 | 14400.8 | 2.801167088 | |
15994634 | Z: A Novel of Zelda Fitzgerald | 3.8 | 29527 | 3352 | 15994634 | 4003 | 14169.3 | 3.539670247 | |
16131077 | The Shining Girls | 3.48 | 28152 | 3996 | 16131077 | 1823 | 5732.3 | 3.144432255 | |
16158542 | The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics | 4.3 | 91671 | 11798 | 16158542 | 2467 | 7658.6 | 3.104418322 | |
16278318 | Armada | 3.46 | 35747 | 6563 | 16278318 | 3194 | 9626.3 | 3.013869756 | |
17255444 | The Orpheus Descent | 3.22 | 295 | 67 | 17255444 | 2418 | 7655.7 | 3.166129032 | |
17333230 | The Luminaries | 3.68 | 40872 | 5801 | 17333230 | 5686 | 18447.3 | 3.244336968 | |
17737039 | Happier at Home: How I Learned to Pay Attention Cram My Day with What I Love Hold More Tightly Embrace Here and Remember Now | 3.46 | 10116 | 1419 | 17737039 | 4802 | 13354.5 | 2.781028738 | |
18050053 | The Moon Sisters | 3.53 | 1196 | 326 | 18050053 | 4572 | 13179 | 2.882545932 | |
18077769 | Authority | 3.58 | 14963 | 1970 | 18077769 | 2062 | 6504.9 | 3.154655674 | |
18225810 | Sisters | 4.28 | 18554 | 2210 | 18225810 | 3333 | 8970.5 | 2.691419142 | |
18296030 | Don't Look Back | 4.11 | 17639 | 2711 | 18296030 | 1970 | 5789.9 | 2.939035533 | |
18302455 | The Circle | 3.44 | 75782 | 11068 | 18302455 | 2812 | 8921.9 | 3.172795164 | |
18310201 | The Painter | 3.76 | 8664 | 1196 | 18310201 | 3750 | 11734 | 3.129066667 | |
18652002 | The Bees | 3.66 | 14258 | 2760 | 18652002 | 2346 | 7462.8 | 3.181074169 | |
18667945 | #GIRLBOSS | 3.72 | 23047 | 1811 | 18667945 | 2611 | 6962 | 2.666411337 | |
19542841 | More Happy Than Not | 4.1 | 5760 | 1558 | 19542841 | 1425 | 4282 | 3.004912281 | |
20263206 | The Buried Life | 3.29 | 319 | 95 | 20263206 | 3652 | 11512 | 3.152245345 | |
20575425 | Wolf in White Van | 3.73 | 12143 | 1868 | 20575425 | 49512 | 154889.5 | 3.128322427 | |
20706269 | Broken Monsters | 3.6 | 10362 | 1663 | 20706269 | 2887 | 8655 | 2.997921718 | |
20821614 | You | 3.82 | 22806 | 4712 | 20821614 | 2443 | 7032.7 | 2.878714695 | |
20892558 | Lamentation | 4.35 | 5029 | 645 | 20892558 | 2850 | 8238.8 | 2.890807018 | |
20897517 | In the Kingdom of Ice: The Grand and Terrible Polar Voyage of the USS Jeannette | 4.15 | 10639 | 1483 | 20897517 | 2153 | 6594.3 | 3.062842545 | |
20980987 | The Art of Asking; or How I Learned to Stop Worrying and Let People Help | 3.94 | 17979 | 1801 | 20980987 | 2678 | 8057.7 | 3.008849888 | |
21416678 | The Bishop’s Wife | 3.09 | 2404 | 623 | 21416678 | 1574 | 5130.3 | 3.259402795 | |
21944886 | Hand to Mouth: Living in Bootstrap America | 3.45 | 2763 | 543 | 21944886 | 2198 | 6293 | 2.863057325 | |
22232035 | All I Know Now: Wonderings and Reflections on Growing Up Gracefully | 4.2 | 4418 | 668 | 22232035 | 1414 | 4076 | 2.882602546 | |
22465597 | Vanishing Girls | 3.61 | 14435 | 2499 | 22465597 | 4969 | 15585 | 3.136445965 | |
22674531 | I Heart Robot | 3.79 | 62 | 32 | 22674531 | 43646 | 126619.9 | 2.90106539 | |
22822858 | A Little Life | 4.26 | 30388 | 6565 | 22822858 | 44606 | 119150.4 | 2.671174281 | |
23014725 | Delicate Monsters | 3.4 | 513 | 206 | 23014725 | 45903 | 138370.1 | 3.014402109 | |
23214378 | Normal | 3.4 | 2196 | 511 | 23214378 | 4400 | 14496.3 | 3.294613636 | |
23492682 | The Speechwriter: A Brief Education in Politics | 3.45 | 811 | 143 | 23492682 | 6358 | 19706.6 | 3.099496697 | |
23747672 | The Altar Girl: A Prequel | 3.54 | 2741 | 241 | 23747672 | 2024 | 5930.9 | 2.930286561 | |
24490083 | Never Sleep | 3.67 | 42 | 28 | 24490083 | 2029 | 5229.7 | 2.577476589 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Aggregate raw cover votes into one weighted average cover rating per book.
#
# Reads "dataset1.csv" and writes one row per book_id to "foo.csv" with the
# summed vote count (x.x), the summed ratingval * ratingcount (x.y) and their
# ratio (avg).
library(dplyr)

# Columns in dataset1.csv:
# book_id ratingval ratingcount
mydata <- read.csv("dataset1.csv", encoding = "UTF-8", header = TRUE)

# Unweighted mean of ratingval per book (printed for inspection only).
summarise(group_by(mydata, book_id), mean(ratingval, na.rm = TRUE))

# Weight each rating value by the number of votes it received.
mydata <- mutate(mydata, total = ratingval * ratingcount)

# Sum the weighted totals for each book.
totalP <- aggregate(mydata$total, by = list(book_id = mydata$book_id),
                    FUN = sum)
# Sum the number of votes for each book.
totalC <- aggregate(mydata$ratingcount, by = list(book_id = mydata$book_id),
                    FUN = sum)

# Both aggregates name their value column "x", so after the merge the vote
# count is x.x (from totalC) and the weighted total is x.y (from totalP).
(m1 <- merge(totalC, totalP, by = "book_id"))

# Average rating for a cover = weighted total / number of votes.
mydata <- mutate(m1, avg = x.y / x.x)

# Write the table that includes the avg column; writing m1 here would drop
# the average that was just computed.
write.table(mydata, file = "foo.csv", sep = ",", col.names = NA,
            qmethod = "double")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment