Skip to content

Instantly share code, notes, and snippets.

Created March 1, 2018 19:54
Show Gist options
  • Save anonymous/88482173f7d66b916f206090932c7077 to your computer and use it in GitHub Desktop.
Save anonymous/88482173f7d66b916f206090932c7077 to your computer and use it in GitHub Desktop.
TuCarro.com Images Scrap
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "library(rvest)\nlibrary(dplyr)\n\n# Scrape car listings from carros.tucarro.com.co, download the listing photos\n# and write a CSV pairing each image file with the brand found in its name.\n\n# Get some urls of relevant cars\nnpages <- 30\nfirsts <- seq(1, 48 * npages, 48) # There are 48 posts per page\npage_links <- vector(\"list\", npages)\nfor (i in seq_len(npages)) {\n  link <- paste0(\n    \"https://carros.tucarro.com.co/carros-camionetas/_Desde_\", firsts[i]\n  )\n  links <- read_html(link) %>% html_nodes(\"div a\") %>% html_attr(\"href\")\n  # Keep only hrefs containing MCO- (presumably the listing id; confirm)\n  page_links[[i]] <- unique(links[grep(\"MCO-\", links)])\n  print(firsts[i])\n}\n# Flatten with unlist() rather than rbind(): pages may yield different numbers\n# of links, and rbind() would recycle the shorter vectors into a matrix.\nurls <- as.character(unique(unlist(page_links)))\n\n# Get images links\nlisting_images <- vector(\"list\", length(urls))\nfor (i in seq_along(urls)) {\n  links <- read_html(urls[i]) %>%\n    html_nodes(\".gallery-trigger , img\") %>%\n    html_attr(\"src\")\n  listing_images[[i]] <- unique(links[grep(\"-O.jpg\", links)])\n  print(paste(i, \"out of\", length(urls)))\n}\n# De-duplicate so the same photo is not downloaded or listed twice\nimages <- as.character(unique(unlist(listing_images)))\n\n# Download images\nsetwd(\"/Users/bernardo/Dropbox (ID)/CM Data Science/Car Photos\")\n# download.file() does not create missing directories\ndir.create(\"Images/Originals\", recursive = TRUE, showWarnings = FALSE)\n# Local path = text left after stripping the greedy .*.com/ prefix of the URL.\n# file.path() returns character(0) when there are no images, so the download\n# loop and the CSV below simply end up empty.\nfile_names <- file.path(\"Images/Originals\", gsub(\".*.com/\", \"\", images))\nfor (i in seq_along(images)) {\n  download.file(images[i], file_names[i], quiet = TRUE, mode = \"wb\")\n  print(paste(i, \"out of\", length(images)))\n}\n\n# Create CSV with file name and brand\nfiles <- data.frame(name = file_names, stringsAsFactors = FALSE)\nbrands <- c(\n  \"kia\", \"audi\", \"bmw\", \"chevrolet\", \"renault\", \"toyota\", \"citron\", \"citroen\",\n  \"ford\", \"jeep\", \"mazda\", \"mercedes-benz\", \"nissan\", \"peugeot\", \"suzuki\",\n  \"volkswagen\", \"zotye\", \"willyz\", \"volvo\", \"fiat\", \"subaru\", \"ssangyong\",\n  \"seat\", \"mini-cooper\", \"mg-gt\", \"lexus\", \"land-rover\", \"hyundai\", \"honda\",\n  \"great-wall-wingle\", \"foton-tunland\", \"dodge\", \"faw\", \"daihatsu\"\n)\n# Tag every row from its own file name. Brands are applied in order, so when\n# several brands match one name the last one in the vector wins.\nfiles$brand <- rep(NA_character_, nrow(files))\nfor (b in brands) {\n  files$brand[grepl(b, files$name, fixed = TRUE)] <- b\n}\n# Drop images whose name contains none of the known brands\nfiles <- filter(files, !is.na(brand))\nwrite.csv(files, \"files_brands.csv\")",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"gist": {
"id": "",
"data": {
"description": "TuCarro.com Images Scrap",
"public": true
}
},
"kernelspec": {
"name": "ir",
"display_name": "R",
"language": "R"
},
"language_info": {
"name": "R",
"codemirror_mode": "r",
"pygments_lexer": "r",
"mimetype": "text/x-r-source",
"file_extension": ".r",
"version": "3.4.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment