Gist chirayukong/f45fe9e40470f6f2d3775c51ab8a0130 — scrapeGoogleImages
File 1: imageScrape.js (PhantomJS script)
Note: GitHub warns that this file may contain bidirectional Unicode text that could be interpreted or compiled differently than it appears; review it in an editor that reveals hidden Unicode characters.
// --- PhantomJS scraper setup -------------------------------------------
// Target URL: a Google image-search results page, rendered headlessly.
var url = 'https://www.google.de/search?q=Yahoo+logo&source=lnms&tbm=isch&sa=X';

var page = new WebPage();
var fs = require('fs');

// Viewport dimensions restored once scrolling finishes.
var vWidth = 1080;
var vHeight = 1920;

page.viewportSize = { width: vWidth, height: vHeight };

// Scroll through!  `s` is the current scroll offset; `sBase` is the last
// measured document height.
// NOTE(review): this evaluate() runs before page.open(), so it measures the
// blank about:blank page; sc() re-reads scrollHeight after the page loads,
// so the initial value is effectively a placeholder — confirm intent.
var s = 0;
var sBase = page.evaluate(function () {
  return document.body.scrollHeight;
});

page.scrollPosition = { top: sBase, left: 0 };
/**
 * Scrolls the page downward in small steps so lazily-loaded content gets
 * rendered. Re-measures the document height on every tick (the results page
 * grows as images load), grows the viewport along with the scroll offset,
 * and reschedules itself until the offset passes the page height — at which
 * point the viewport is restored to its original size and the loop stops.
 */
function sc() {
  // Pick up any growth of the document since the previous tick.
  var measuredHeight = page.evaluate(function () {
    return document.body.scrollHeight;
  });
  if (measuredHeight !== sBase) {
    sBase = measuredHeight;
  }

  // Scrolled past the bottom: restore the viewport and stop rescheduling.
  if (s > sBase) {
    page.viewportSize = { width: vWidth, height: vHeight };
    return;
  }

  page.scrollPosition = { top: s, left: 0 };
  page.viewportSize = { width: vWidth, height: s };

  // Advance by 5% of the page height, capped at 400px per tick.
  s += Math.min(sBase / 20, 400);
  setTimeout(sc, 110);
}
/**
 * Waits 2.5 seconds for the final render to settle, then dumps the page's
 * HTML to 1.html (overwriting any previous run) and terminates PhantomJS.
 */
function just_wait() {
  var dumpAndQuit = function () {
    fs.write('1.html', page.content, 'w'); // 'w' = overwrite
    phantom.exit();
  };
  setTimeout(dumpAndQuit, 2500);
}
// Load the search page; once it has loaded, arm the delayed HTML dump and
// start the scroll loop. The two timers are independent (2500ms vs 110ms),
// so registration order does not affect behavior.
// NOTE(review): `status` is never checked — a failed load still writes
// 1.html after 2.5s; confirm whether that is intentional.
page.open(url, function (status) {
  just_wait();
  sc();
});
File 2: the R driver script.
Note: GitHub shows the same hidden-Unicode warning for this file; review it in an editor that reveals hidden Unicode characters.
library(plyr) | |
library(reshape2) | |
require(rvest) | |
## Scrape the Google image-search results page for `searchTerm`.
##
## Patches the first line of imageScrape.js with the search URL, renders the
## JS-driven page with PhantomJS (the script dumps the DOM to 1.html), and
## returns one row per <img> element found.
##
## Args:
##   searchTerm: query string. Spaces and other unsafe characters are
##     percent-encoded; reserved characters such as "+" are left untouched,
##     so pre-encoded terms like "Adidas+logo" behave exactly as before.
## Returns:
##   data.frame with columns `images` (img src attributes) and `search`
##   (the search term, repeated).
scrapeJSSite <- function(searchTerm){
  ## Encode the term so queries with spaces don't produce a malformed URL
  ## (URLencode's default reserved=FALSE keeps "+" intact — backward
  ## compatible with already-encoded callers).
  url <- paste0("https://www.google.de/search?q=", URLencode(searchTerm),
                "&source=lnms&tbm=isch&sa=X")

  ## imageScrape.js expects its target URL on line 1; rewrite it in place.
  lines <- readLines("imageScrape.js")
  lines[1] <- paste0("var url ='", url, "';")
  writeLines(lines, "imageScrape.js")

  ## Download website: PhantomJS renders the page and writes 1.html.
  system("phantomjs imageScrape.js")

  pg <- read_html("1.html")
  files <- pg %>% html_nodes("img") %>% html_attr("src")
  df <- data.frame(images = files, search = searchTerm)
  return(df)
}
## Download every image URL in `files` to outPath/<brand>_<i>.jpg.
##
## Args:
##   files:   character vector of image URLs (or data URIs — note that
##            download.file will fail on non-URL src values).
##   brand:   label used as the filename prefix.
##   outPath: target directory; created if it does not exist.
downloadImages <- function(files, brand, outPath="images"){
  ## Create the output directory on first use (the original errored when it
  ## was missing).
  if (!dir.exists(outPath)) dir.create(outPath, recursive = TRUE)
  ## seq_along() is safe for empty input; the original 1:length(files)
  ## yields c(1, 0) when files is empty and attempts a bogus download.
  for (i in seq_along(files)) {
    download.file(files[i],
                  destfile = paste0(outPath, "/", brand, "_", i, ".jpg"),
                  mode = 'wb')
  }
}
### exchange the search terms here!
gg <- scrapeJSSite(searchTerm = "Adidas+logo")
## BUG FIX: the original passed `i` as the brand argument, but `i` is
## undefined at top level (it was only a loop index inside downloadImages),
## so the call errored. Pass the brand label explicitly so files are named
## adidas_1.jpg, adidas_2.jpg, ...
downloadImages(as.character(gg$images), brand = "adidas")
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.