@stuart-warren · Created April 8, 2019
Scrape car details
package main

import (
    "encoding/csv"
    "fmt"
    "log"
    "os"
    "regexp"
    "strconv"
    "strings"
    "time"

    "github.com/gocolly/colly"
)
// var manufacturers = []string{"audi", "bmw", "fiat", "ford", "honda", "hyundai", "kia", "mazda", "mercedes-benz", "nissan", "seat", "toyota", "vauxhall", "volkswagen", "volvo"}
var manufacturers = []string{"ford", "honda", "hyundai", "kia", "mazda", "nissan", "toyota", "vauxhall", "volkswagen", "volvo"}
// parseDate normalises an availability date such as "January 2006" to its
// year. "Now" is passed through unchanged and anything unparseable becomes
// "Unknown".
func parseDate(date string) string {
    layout := "January 2006"
    if date == "Now" {
        return date
    }
    d, err := time.Parse(layout, date)
    if err != nil {
        return "Unknown"
    }
    return strconv.Itoa(d.Year())
}
func main() {
    fName := "parkers_cars.csv"
    file, err := os.Create(fName)
    if err != nil {
        log.Fatalf("Cannot create file %q: %s\n", fName, err)
    }
    defer file.Close()

    writer := csv.NewWriter(file)
    defer writer.Flush()

    // Write CSV header
    writer.Write([]string{"Make", "Model", "Type", "Spec", "From", "To", "Insurance Group", "Miles Per Tank", "Turning Circle (m)", "Length (mm)", "Width (mm)", "Transmission", "Doors", "Seats", "Luggage Capacity (Litres)", "Url"})
    // Instantiate default collector, restricted to the manufacturer, model,
    // type and spec pages on www.parkers.co.uk
    c := colly.NewCollector(
        colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"),
        colly.AllowedDomains("www.parkers.co.uk"),
        colly.URLFilters(
            regexp.MustCompile("https://www\\.parkers\\.co\\.uk/car-specs/select-manufacturer/"),
            regexp.MustCompile("https://www\\.parkers\\.co\\.uk/("+strings.Join(manufacturers, "|")+")/specs/$"),
            regexp.MustCompile("https://www\\.parkers\\.co\\.uk/("+strings.Join(manufacturers, "|")+")/(.+)/specs/$"),
            regexp.MustCompile("https://www\\.parkers\\.co\\.uk/("+strings.Join(manufacturers, "|")+")/(.+)/(.+)/specs/$"),
            regexp.MustCompile("https://www\\.parkers\\.co\\.uk/("+strings.Join(manufacturers, "|")+")/(.+)/(.+)/(.+)/specs/$"),
        ),
    )

    // Rate-limit requests so the site is not hammered
    c.Limit(&colly.LimitRule{
        DomainGlob:  "*parkers.*",
        Delay:       2 * time.Second,
        RandomDelay: 3 * time.Second,
    })
    // On every a element which has an href attribute call the callback
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        link := e.Attr("href")
        // fmt.Printf("Link found: %q -> %s\n", e.Text, link)
        // Visit the link found on the page; only links matching
        // AllowedDomains and URLFilters are actually followed
        c.Visit(e.Request.AbsoluteURL(link))
    })
c.OnHTML("body", func(e *colly.HTMLElement) {
data := make(map[string]string)
URLPath := e.Request.URL.Path
URLParts := strings.Split(URLPath, "/")
if len(URLParts) != 7 {
return
}
fmt.Println(URLPath)
e.ForEach("tr", func(_ int, etr *colly.HTMLElement) {
data[etr.ChildText("th")] = etr.ChildText("td")
})
writer.Write([]string{
URLParts[1], URLParts[2], URLParts[3], URLParts[4],
parseDate(e.ChildText("span.specs-detail-page__available-dates__from")),
parseDate(e.ChildText("span.specs-detail-page__available-dates__to")),
data["Insurance group"],
strings.TrimSuffix(data["Miles Per Tank"], " miles"),
strings.TrimSuffix(data["Turning Circle"], "m"),
strings.TrimSuffix(data["Length"], "mm"),
strings.TrimSuffix(data["Width"], "mm"),
data["Transmission"],
data["Doors"],
data["Seats"],
strings.TrimSuffix(data["Luggage Capacity"], " litres"),
e.Request.URL.String(),
})
writer.Flush()
})
    // Before making a request print "Visiting ..."
    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL.String())
    })

    // Start the crawl from the manufacturer index page
    err = c.Visit("https://www.parkers.co.uk/car-specs/select-manufacturer/")
    if err != nil {
        log.Fatal(err)
    }
}
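If you want to sanity-check the date handling before letting the crawler loose, a small test in the same package covers the three cases parseDate distinguishes. This is a minimal sketch, not part of the original gist; the file name main_test.go is just a suggestion:

package main

import "testing"

// TestParseDate checks how parseDate is expected to normalise the
// "available from/to" strings scraped from the spec pages.
func TestParseDate(t *testing.T) {
    cases := map[string]string{
        "Now":          "Now",     // cars still on sale are passed through
        "January 2006": "2006",    // month + year collapses to the year
        "garbage":      "Unknown", // anything unparseable is flagged
    }
    for in, want := range cases {
        if got := parseDate(in); got != want {
            t.Errorf("parseDate(%q) = %q, want %q", in, got, want)
        }
    }
}

Run it with go test in the gist's directory (assuming the code lives in a Go module with github.com/gocolly/colly available).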