Skip to content

Instantly share code, notes, and snippets.

@cavedave
Last active April 22, 2016 08:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cavedave/48cb7d25330874af6b2b21c97ae1fa9d to your computer and use it in GitHub Desktop.
Save cavedave/48cb7d25330874af6b2b21c97ae1fa9d to your computer and use it in GitHub Desktop.
# Exploratory correlations between Goodreads book statistics and cover ratings.
# Results pasted from an interactive session are kept as `# [1] ...` comments
# beneath the call that produced them.
mydata <- read.csv("output.csv", encoding = "UTF-8", header = TRUE)
# columns: book_id title rating num_ratings num_reviews book_id x.x x.y cover_rating

# Columns are reached through with() rather than attach(), so nothing is left
# on the search path for later code to pick up by accident.

# The title and the correlation below are both about num_ratings vs
# num_reviews, so that is what goes on the axes.
with(mydata, plot(num_ratings, num_reviews,
                  main = "Number of Ratings Number Reviews"))
with(mydata, cor(num_ratings, num_reviews))
# [1] 0.9597442

# Headline figure: book rating vs cover rating.
with(mydata, cor(rating, cover_rating))
# [1] 0.1609114

with(mydata, cor(rating, num_ratings))
# [1] 0.2141307
with(mydata, cor(rating, num_reviews))
# [1] 0.2658916
with(mydata, cor(num_ratings, cover_rating))
# [1] 0.3059627
with(mydata, cor(num_reviews, cover_rating))
# [1] 0.3307553

# Recorded sanity check from the original session. It is left as a comment
# because `book_id2` is not among the columns listed above -- TODO confirm the
# name read.csv() gives the second book_id column before re-running it.
# > cor(book_id,book_id2)
# [1] 1

with(mydata, plot(rating, cover_rating, main = "Judging a book by its cover"))
library(ggplot2)
# Scatter plot of book rating (x) against cover rating (y), saved as Cover.png.
p1 <- ggplot(mydata, aes(x = rating, y = cover_rating))
# Axis labels follow the aesthetic mapping: `rating` is the book's rating,
# `cover_rating` is the rating of its cover.
p1 <- p1 + labs(x = "Avg Book Rating", y = "Avg Cover Rating")
p1 <- p1 + ggtitle("Can you Judge a Book by its Cover?") +
  theme(plot.title = element_text(lineheight = 0.8, face = "bold"))
# The correlation in the label is hard-coded; update it if the data changes.
p1 <- p1 + annotate("text", x = 3, y = 1.5,
                    label = "Correlation book and cover ratings is 0.16")
p1 <- p1 + geom_point()
p1
# Pass the plot explicitly so the saved file does not depend on which plot
# happened to be displayed last.
ggsave("Cover.png", plot = p1, width = 10, height = 10, dpi = 300)
require 'open-uri'
require 'nokogiri'
# Goodreads book ids to scrape.
array = [
  1301, 2165, 2657, 5246, 7672, 10210, 14497, 18342, 19501, 22034,
  22628, 22635, 33600, 41865, 43035, 80467, 105578, 201114, 297673, 405784,
  439965, 468328, 469410, 471381, 505576, 581332, 825139, 898885, 1576816, 1850833,
  1868343, 1971304, 1988626, 2767052, 3189111, 3565645, 4312691, 4922079, 5632446, 6263078,
  6442769, 6609765, 6691280, 6837103, 6883046, 7031817, 7381740, 8135213, 8331609, 8491980,
  8694331, 8697395, 9627849, 9661374, 9778945, 10308500, 10644930, 10866398, 12032357, 12291438,
  12875355, 13449852, 13579626, 13589182, 15793049, 15797397, 15994634, 16131077, 16158542, 16278318,
  17255444, 17333230, 17737039, 18050053, 18077769, 18225810, 18296030, 18302455, 18310201, 18652002,
  18667945, 19542841, 20263206, 20575425, 20706269, 20821614, 20892558, 20897517, 20980987, 21416678,
  21944886, 22232035, 22465597, 22674531, 22822858, 23014725, 23214378, 23492682, 23747672, 24490083
]

# For each id, fetch the book page and print one comma-separated line to
# stdout: the id, the text of every <h1>, the text of every `.average`
# element, then the text of every `.value-title` element.
array.each do |book_id|
  # missing 22635,22034,22628
  print "#{book_id}, "

  begin
    # URI.open is required here: bare Kernel#open no longer opens URLs on
    # Ruby 3.0+. The block form closes the response once it has been parsed.
    doc = URI.open("https://www.goodreads.com/book/show/#{book_id}") do |page|
      Nokogiri::HTML(page)
    end
  rescue OpenURI::HTTPError => e
    # One bad page should not abort the whole run: report it on stderr,
    # finish the output line, keep the same pause, and move on.
    warn "skipping #{book_id}: #{e.message}"
    puts ""
    sleep(5)
    next
  end

  doc.xpath('//h1').each do |heading|
    print "#{heading.text.strip}, "
  end

  #<span class="average" itemprop="ratingValue">4.23</span>
  doc.css('.average').each do |rating|
    print "#{rating.text}, "
  end

  #<span class="value-title" title="62333" itemprop="ratingCount">62,333 Ratings</span>
  doc.css('.value-title').each do |count|
    print "#{count.text}, "
  end

  puts ""
  # Pause between requests.
  sleep(5)
end
book_id title rating num_ratings num_reviews book_idCover num_rating total_rating cover_rating
1301 Moneyball: The Art of Winning an Unfair Game 4.23 62333 3613 1301 5248 16110.3 3.069798018
2165 The Old Man and the Sea 3.69 459620 13320 2165 1569 5643 3.596558317
2657 To Kill a Mockingbird 4.24 2807869 62159 2657 1564 6320.1 4.040984655
5246 Ethan Frome 3.32 67479 3644 5246 2289 7615.3 3.326911315
7672 Congo 3.52 120930 1399 7672 1998 5984.7 2.995345345
10210 Jane Eyre 4.08 1135632 26931 10210 1977 6983.2 3.532220536
14497 Neverwhere 4.16 246191 12500 14497 4363 15036.5 3.446367179
18342 It 4.14 381656 8254 18342 2021 6742.1 3.336021771
19501 Eat Pray Love 3.48 1072053 46874 19501 2764 9088.3 3.288096961
22034 The Godfather (Mario Puzo's Mafia) 4.35 241113 4787 22034 2791 10563.9 3.78498746
22628 The Perks of Being a Wallflower 4.2 813120 41549 22628 1641 5353.4 3.262279098
22635 Darwin on Trial 3.89 1199 73 22635 47444 133162.5 2.80673004
33600 Shantaram 4.25 90669 8927 33600 2425 7815.7 3.222969072
41865 Twilight 3.56 3385599 90011 41865 1874 5351.1 2.855442903
43035 White Fang 3.91 96681 2473 43035 2064 7071.2 3.425968992
80467 Him Her Him Again the End of Him 2.85 1541 378 80467 2833 8920.9 3.148923403
105578 One Night at the Call Center 2.45 38228 1090 105578 5103 14454.1 2.832471095
201114 Sold 4.16 32631 4071 201114 2075 6974 3.360963855
297673 The Brief Wondrous Life of Oscar Wao 3.87 154969 14842 297673 1942 6419.6 3.305664264
405784 Fangland 2.62 871 176 405784 5013 13938.9 2.780550569
439965 The Twins of Tribeca 2.89 457 51 439965 2629 6735.8 2.562114873
468328 Notting Hell 2.83 693 112 468328 50292 119860.6 2.383293566
469410 Urien’s Voyage 3.4 75 9 469410 1730 5492.6 3.174913295
471381 You Have to Kiss a Lot of Frogs 2.84 974 86 471381 4829 12179.6 2.522178505
505576 Top Girls 3.55 3784 127 505576 2537 6785 2.674418605
581332 The Third Translation 2.53 367 60 581332 3000 9122.1 3.0407
825139 Wedding Season 2.82 661 94 825139 1943 4908 2.525990736
898885 Loving Frank 3.71 69363 8400 898885 2979 9040 3.034575361
1576816 American Nerd: The Story of My People 2.98 1313 316 1576816 2290 7465.2 3.259912664
1850833 Beautiful Children 2.91 1681 408 1850833 1553 4762.7 3.066773986
1868343 The Spa 2.81 381 73 1868343 2555 6632 2.595694716
1971304 City of Thieves 4.27 63789 7230 1971304 2455 8336 3.395519348
1988626 The Pig Did It 2.87 1183 316 1988626 2758 7516.2 2.725235678
2767052 The Hunger Games 4.36 4147206 147596 2767052 1779 6376.5 3.584317032
3189111 Handcuffs 3.46 418 87 3189111 3149 7366.2 2.3392188
3565645 Collections of Nothing 2.94 183 61 3565645 4699 14890 3.16875931
4312691 Enclave 2.75 186 42 4312691 4094 12331 3.011968735
4922079 One Second After 3.92 25289 3767 4922079 2257 6513.6 2.885954807
5632446 Columbine 4.25 40368 5253 5632446 1580 5465.6 3.459240506
6263078 Miles to Go 3.5 7892 845 6263078 2028 3666.8 1.808086785
6442769 Paper Towns 3.93 569295 38093 6442769 3489 12326.4 3.532932072
6609765 Out of My Mind 4.36 53538 7888 6609765 3055 9629.9 3.152176759
6691280 Mr. Darcy Vampyre 2.89 4287 799 6691280 1508 4240.7 2.812135279
6837103 The Kitchen House 4.16 144493 13490 6837103 2941 8892.1 3.02349541
6883046 Shadow Tag 3.35 5118 1218 6883046 2056 6535.8 3.178891051
7031817 Fat Vampire: A Never Coming of Age Story 2.86 2272 591 7031817 48576 115897.5 2.385900445
7381740 The Rehearsal 3.4 2934 491 7381740 1792 5485.2 3.0609375
8135213 Justin Bieber: His World 4.44 778 36 8135213 1795 2689.4 1.498272981
8331609 Alison Wonderland 2.43 1313 282 8331609 51249 125816 2.454994244
8491980 People Who Eat Darkness: The Fate of Lucie Blackman 3.73 7784 1037 8491980 1762 4813.5 2.73183882
8694331 Waiting to Live 4.45 56 27 8694331 4742 11296.5 2.382222691
8697395 Cleaving: A Story of Marriage Meat and Obsession 2.4 3695 964 8697395 42372 129682.9 3.0605801
9627849 Games to Play After Dark 2.69 257 70 9627849 3232 9542.3 2.952444307
9661374 Lunch-Box Dream 2.69 249 72 9661374 1690 5315.8 3.145443787
9778945 The Shallows: What the Internet Is Doing to Our Brains 3.83 12075 1764 9778945 2812 8354.2 2.970910384
10308500 The Tennis Party 2.82 3108 363 10308500 1412 3624.9 2.567209632
10644930 11/22/63 4.27 237246 26385 10644930 4551 15537.8 3.414150736
10866398 A Study in Sherlock: Stories Inspired by the Holmes Canon 3.75 2726 346 10866398 1464 4936.4 3.371857923
12032357 Domingo's Angel 4.62 68 36 12032357 52315 93514.4 1.787525566
12291438 The Madman’s Daughter 3.7 16486 2904 12291438 5346 15793.3 2.95422746
12875355 Death Comes to Pemberley 3.16 64819 8138 12875355 4772 15058.7 3.155637049
13449852 17 3.54 212 10 13449852 2729 7822.6 2.866471235
13579626 Rooms 3.3 8771 1439 13579626 2716 9408.3 3.464027982
13589182 Mastery 4.24 9567 655 13589182 2313 6712.8 2.902204929
15793049 Farside 3.11 942 179 15793049 3268 10571.4 3.234822521
15797397 Salt Sugar Fat: How the Food Giants Hooked Us 3.96 17657 2307 15797397 5141 14400.8 2.801167088
15994634 Z: A Novel of Zelda Fitzgerald 3.8 29527 3352 15994634 4003 14169.3 3.539670247
16131077 The Shining Girls 3.48 28152 3996 16131077 1823 5732.3 3.144432255
16158542 The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics 4.3 91671 11798 16158542 2467 7658.6 3.104418322
16278318 Armada 3.46 35747 6563 16278318 3194 9626.3 3.013869756
17255444 The Orpheus Descent 3.22 295 67 17255444 2418 7655.7 3.166129032
17333230 The Luminaries 3.68 40872 5801 17333230 5686 18447.3 3.244336968
17737039 Happier at Home: How I Learned to Pay Attention Cram My Day with What I Love Hold More Tightly Embrace Here and Remember Now 3.46 10116 1419 17737039 4802 13354.5 2.781028738
18050053 The Moon Sisters 3.53 1196 326 18050053 4572 13179 2.882545932
18077769 Authority 3.58 14963 1970 18077769 2062 6504.9 3.154655674
18225810 Sisters 4.28 18554 2210 18225810 3333 8970.5 2.691419142
18296030 Don't Look Back 4.11 17639 2711 18296030 1970 5789.9 2.939035533
18302455 The Circle 3.44 75782 11068 18302455 2812 8921.9 3.172795164
18310201 The Painter 3.76 8664 1196 18310201 3750 11734 3.129066667
18652002 The Bees 3.66 14258 2760 18652002 2346 7462.8 3.181074169
18667945 #GIRLBOSS 3.72 23047 1811 18667945 2611 6962 2.666411337
19542841 More Happy Than Not 4.1 5760 1558 19542841 1425 4282 3.004912281
20263206 The Buried Life 3.29 319 95 20263206 3652 11512 3.152245345
20575425 Wolf in White Van 3.73 12143 1868 20575425 49512 154889.5 3.128322427
20706269 Broken Monsters 3.6 10362 1663 20706269 2887 8655 2.997921718
20821614 You 3.82 22806 4712 20821614 2443 7032.7 2.878714695
20892558 Lamentation 4.35 5029 645 20892558 2850 8238.8 2.890807018
20897517 In the Kingdom of Ice: The Grand and Terrible Polar Voyage of the USS Jeannette 4.15 10639 1483 20897517 2153 6594.3 3.062842545
20980987 The Art of Asking; or How I Learned to Stop Worrying and Let People Help 3.94 17979 1801 20980987 2678 8057.7 3.008849888
21416678 The Bishop’s Wife 3.09 2404 623 21416678 1574 5130.3 3.259402795
21944886 Hand to Mouth: Living in Bootstrap America 3.45 2763 543 21944886 2198 6293 2.863057325
22232035 All I Know Now: Wonderings and Reflections on Growing Up Gracefully 4.2 4418 668 22232035 1414 4076 2.882602546
22465597 Vanishing Girls 3.61 14435 2499 22465597 4969 15585 3.136445965
22674531 I Heart Robot 3.79 62 32 22674531 43646 126619.9 2.90106539
22822858 A Little Life 4.26 30388 6565 22822858 44606 119150.4 2.671174281
23014725 Delicate Monsters 3.4 513 206 23014725 45903 138370.1 3.014402109
23214378 Normal 3.4 2196 511 23214378 4400 14496.3 3.294613636
23492682 The Speechwriter: A Brief Education in Politics 3.45 811 143 23492682 6358 19706.6 3.099496697
23747672 The Altar Girl: A Prequel 3.54 2741 241 23747672 2024 5930.9 2.930286561
24490083 Never Sleep 3.67 42 28 24490083 2029 5229.7 2.577476589
library(dplyr)
# Compute a vote-weighted average rating per book from dataset1.csv and write
# the per-book totals plus that average to foo.csv.
mydata <- read.csv("dataset1.csv", encoding = "UTF-8", header = TRUE)
# columns: book_id ratingval ratingcount

# Unweighted mean of the ratingval rows present for each book (printed only).
summarise(group_by(mydata, book_id), mean(ratingval, na.rm = TRUE))

# Per-row contribution to a book's rating total: rating value times the number
# of votes recorded for that value.
mydata <- mutate(mydata, total = ratingval * ratingcount)

# sum the total votes for each book
totalP <- aggregate(mydata$total, by = list(book_id = mydata$book_id),
                    FUN = sum)
# count the total number of votes for each book
totalC <- aggregate(mydata$ratingcount, by = list(book_id = mydata$book_id),
                    FUN = sum)

# Both aggregates name their value column `x`, so merge() suffixes them:
# x.x is the vote count (from totalC), x.y is the rating total (from totalP).
(m1 <- merge(totalC, totalP, by.x = "book_id", by.y = "book_id"))

# get average rating for a cover
mydata <- mutate(m1, avg = x.y / x.x)

# Write the table that carries the `avg` column; writing `m1` would drop it.
write.table(mydata, file = "foo.csv", sep = ",", col.names = NA,
            qmethod = "double")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment