Created
April 8, 2019 20:43
-
-
Save stuart-warren/470928c169e271574a4ef9b1546a9444 to your computer and use it in GitHub Desktop.
Scrape car details
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/csv" | |
"fmt" | |
"log" | |
"os" | |
"regexp" | |
"strconv" | |
"strings" | |
"time" | |
"github.com/gocolly/colly" | |
) | |
// manufacturers lists the makes whose spec pages the crawler is allowed to
// follow; the names are joined into the URL filter patterns, so each entry
// must be spelled the way it appears in the site's URL path.
//
// Wider list kept for reference (currently disabled):
// "audi", "bmw", "fiat", "ford", "honda", "hyundai", "kia", "mazda",
// "mercedes-benz", "nissan", "seat", "toyota", "vauxhall", "volkswagen", "volvo"
var manufacturers = []string{
	"ford",
	"honda",
	"hyundai",
	"kia",
	"mazda",
	"nissan",
	"toyota",
	"vauxhall",
	"volkswagen",
	"volvo",
}
// parseDate reduces an availability date to the value stored in the CSV.
//
// The literal "Now" is passed through unchanged. A "Month YYYY" string
// (for example "March 2015") yields the four-digit year as a decimal
// string. Anything else, including the empty string, yields "Unknown".
func parseDate(date string) string {
	const monthYear = "January 2006"

	switch parsed, err := time.Parse(monthYear, date); {
	case date == "Now":
		// Checked first: "Now" never parses as a date, but must not be
		// reported as "Unknown".
		return date
	case err != nil:
		return "Unknown"
	default:
		return strconv.Itoa(parsed.Year())
	}
}
func main() { | |
fName := "parkers_cars.csv" | |
file, err := os.Create(fName) | |
if err != nil { | |
log.Fatalf("Cannot create file %q: %s\n", fName, err) | |
return | |
} | |
defer file.Close() | |
writer := csv.NewWriter(file) | |
defer writer.Flush() | |
// Write CSV header | |
writer.Write([]string{"Make", "Model", "Type", "Spec", "From", "To", "Insurance Group", "Miles Per Tank", "Turning Circle (m)", "Length (mm)", "Width (mm)", "Transmission", "Doors", "Seats", "Luggage Capacity (Litres)", "Url"}) | |
// Instantiate default collector | |
c := colly.NewCollector( | |
colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"), | |
colly.AllowedDomains("www.parkers.co.uk"), | |
colly.URLFilters( | |
regexp.MustCompile("https://www\\.parkers\\.co\\.uk/car-specs/select-manufacturer/"), | |
regexp.MustCompile("https://www\\.parkers\\.co\\.uk/("+strings.Join(manufacturers, "|")+")/specs/$"), | |
regexp.MustCompile("https://www\\.parkers\\.co\\.uk/("+strings.Join(manufacturers, "|")+")/(.+)/specs/$"), | |
regexp.MustCompile("https://www\\.parkers\\.co\\.uk/("+strings.Join(manufacturers, "|")+")/(.+)/(.+)/specs/$"), | |
regexp.MustCompile("https://www\\.parkers\\.co\\.uk/("+strings.Join(manufacturers, "|")+")/(.+)/(.+)/(.+)/specs/$"), | |
), | |
) | |
c.Limit(&colly.LimitRule{ | |
DomainGlob: "*parkers.*", | |
Delay: 2 * time.Second, | |
RandomDelay: 3 * time.Second, | |
}) | |
// On every a element which has href attribute call callback | |
c.OnHTML("a[href]", func(e *colly.HTMLElement) { | |
link := e.Attr("href") | |
// Print link | |
// fmt.Printf("Link found: %q -> %s\n", e.Text, link) | |
// Visit link found on page | |
// Only those links are visited which are in AllowedDomains | |
c.Visit(e.Request.AbsoluteURL(link)) | |
}) | |
c.OnHTML("body", func(e *colly.HTMLElement) { | |
data := make(map[string]string) | |
URLPath := e.Request.URL.Path | |
URLParts := strings.Split(URLPath, "/") | |
if len(URLParts) != 7 { | |
return | |
} | |
fmt.Println(URLPath) | |
e.ForEach("tr", func(_ int, etr *colly.HTMLElement) { | |
data[etr.ChildText("th")] = etr.ChildText("td") | |
}) | |
writer.Write([]string{ | |
URLParts[1], URLParts[2], URLParts[3], URLParts[4], | |
parseDate(e.ChildText("span.specs-detail-page__available-dates__from")), | |
parseDate(e.ChildText("span.specs-detail-page__available-dates__to")), | |
data["Insurance group"], | |
strings.TrimSuffix(data["Miles Per Tank"], " miles"), | |
strings.TrimSuffix(data["Turning Circle"], "m"), | |
strings.TrimSuffix(data["Length"], "mm"), | |
strings.TrimSuffix(data["Width"], "mm"), | |
data["Transmission"], | |
data["Doors"], | |
data["Seats"], | |
strings.TrimSuffix(data["Luggage Capacity"], " litres"), | |
e.Request.URL.String(), | |
}) | |
writer.Flush() | |
}) | |
// Before making a request print "Visiting ..." | |
c.OnRequest(func(r *colly.Request) { | |
fmt.Println("Visiting", r.URL.String()) | |
}) | |
err = c.Visit("https://www.parkers.co.uk/car-specs/select-manufacturer/") | |
if err != nil { | |
log.Fatal(err) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment