@Irio
Last active July 31, 2023 20:20
GCP Serverless scrapers
#!/bin/bash
# get_service_url.sh: invoked by Terraform's external data source.
# Reads {"project": ..., "region": ...} as JSON on stdin and polls Cloud Run
# until the "wohnung" service reports a URL, then prints it as JSON.
set -e
eval "$(jq -r '@sh "PROJECT=\(.project) REGION=\(.region)"')"
while true; do
  URL=$(gcloud beta run services describe wohnung \
    --platform managed \
    --project "$PROJECT" \
    --region "$REGION" \
    --format json | jq --raw-output '.status.url // empty')
  if [ -n "$URL" ]; then
    break
  fi
  sleep 5
done
echo "{\"url\": \"$URL\"}"
variable "bucket" {
description = "Google Cloud Storage bucket name"
}
variable "project" {
description = "Google Cloud project ID"
}
variable "region" {
description = "Google Cloud region"
}
variable "cloud_source_repository" {
description = "Google Cloud Source repository name"
}
variable "zone" {
description = "Google Cloud zone, part of the provided region"
}
variable "crontab_schedule" {
description = "Crontab schedule for running scrapers"
}
locals {
container_tag = "gcr.io/${var.project}/wohnung:latest"
}
provider "google" {
credentials = "${file("credentials.json")}"
project = "${var.project}"
region = "${var.region}"
zone = "${var.zone}"
}
resource "google_storage_bucket" "items" {
name = "${var.bucket}"
location = "US"
}
resource "google_app_engine_application" "app" {
project = "${var.project}"
location_id = "${var.region}"
}
resource "google_cloud_scheduler_job" "job" {
name = "run-scrapers"
description = "Trigger scrapers"
schedule = "${var.crontab_schedule}"
time_zone = "Etc/UTC"
http_target {
http_method = "POST"
uri = "${data.external.google_cloud_run_service.result.url}/"
}
depends_on = ["google_app_engine_application.app"]
}
resource "google_cloudbuild_trigger" "default" {
trigger_template {
branch_name = "master"
repo_name = "${var.cloud_source_repository}"
}
substitutions = {
_BUCKET = "${var.bucket}"
}
build {
images = ["${local.container_tag}"]
step {
name = "gcr.io/cloud-builders/docker"
args = ["build", "-t", "${local.container_tag}", "."]
}
step {
name = "gcr.io/cloud-builders/docker"
args = ["push", "${local.container_tag}"]
}
step {
name = "gcr.io/cloud-builders/gcloud"
args = [
"beta", "run", "deploy", "wohnung",
"--region", "${var.region}",
"--image", "${local.container_tag}",
"--update-env-vars", "GCLOUD_BUCKET=$${_BUCKET}",
"--memory", "1Gi",
"--timeout", "10m",
"--platform", "managed",
"--allow-unauthenticated",
]
}
}
provisioner "local-exec" {
command = "bash trigger_build.sh"
}
}
data "external" "google_cloud_run_service" {
depends_on = ["google_cloudbuild_trigger.default"]
program = ["bash", "get_service_url.sh"]
query = {
project = "${var.project}"
region = "${var.region}"
}
}
bucket = "realestate-berlin-254211-items"
cloud_source_repository = "github_irio_wohnung"
crontab_schedule = "44 8 * * *"
project = "realestate-berlin-254211"
region = "us-east1"
zone = "us-east1-b"
#!/bin/bash
# trigger_build.sh: looks up the Cloud Build trigger created by Terraform
# (the repo name is hardcoded to match terraform.tfvars) and runs it once
# against master, so the first deploy happens immediately.
set -e
TRIGGER_ID=$(gcloud alpha builds triggers list \
  --filter='triggerTemplate.repoName:github_irio_wohnung' \
  --format json | jq --raw-output '.[0].id')
gcloud alpha builds triggers run "$TRIGGER_ID" \
  --branch master > /dev/null
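The build kicked off here can then be followed with, for example:

gcloud builds list --limit 1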
# Multi-stage build: compile with Go 1.12 in GOPATH mode, then copy the
# binary into a minimal distroless base image.
FROM golang:1.12 as build
WORKDIR $GOPATH/src/github.com/Irio/wohnung
COPY scraper scraper
COPY main.go .
RUN go get -d -v ./...
RUN go install

FROM gcr.io/distroless/base
COPY --from=build /go/bin/wohnung /
CMD ["/wohnung"]
package main

import (
	"fmt"
	"log"
	"net/http"
	"os"

	scraper "github.com/Irio/wohnung/scraper"
)

func main() {
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		err := r.ParseForm()
		if err != nil {
			log.Println(err)
		}
		// Cloud Scheduler triggers the scrapers with a POST request.
		if r.Method == http.MethodPost {
			fmt.Fprintln(w, scraper.Run(r.Form))
		}
	})

	// Cloud Run injects PORT; default to 8080 for local runs.
	port := os.Getenv("PORT")
	if port == "" {
		port = "8080"
	}
	log.Fatal(http.ListenAndServe(":"+port, nil))
}
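With the server running, a scrape can also be triggered by hand; the form fields match what readConfig in the scraper package accepts:

curl -X POST -d 'platform=ebay_kleinanzeigen' -d 'cache=1' http://localhost:8080/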
package cloudrun

import (
	"time"

	"github.com/gocolly/colly"
)

type EBayKleinanzeigen struct{}

func (EBayKleinanzeigen) parseItem(e *colly.XMLElement) Item {
	selector := "//*[contains(@class, 'ad-listitem')]//*[contains(@class, 'aditem-main')]//a"
	title := e.ChildText(selector)
	url := e.ChildAttr(selector, "href")

	locationNodes := e.ChildTexts("//*[contains(@class, 'aditem-details')]//text()")
	var location string
	if len(locationNodes) > 8 {
		location = locationNodes[6] + " " + locationNodes[8]
	}

	priceString := e.ChildText("//*[contains(@class, 'aditem-details')]//strong")
	price, _ := parsePrice(priceString)

	spaceString := e.ChildText("//*[contains(@class, 'text-module-end')]//*[contains(text(), 'm²')]")
	livingSpace, _ := parseSpace(spaceString)

	roomsString := e.ChildText("//*[contains(@class, 'text-module-end')]//*[contains(text(), 'Zimmer')]")
	rooms, _ := parseFloat(roomsString, " Zimmer")

	return Item{
		title:            title,
		location:         location,
		hasExactLocation: false,
		price:            price,
		livingSpace:      livingSpace,
		rooms:            rooms,
		url:              e.Request.AbsoluteURL(url),
		scrapedAt:        time.Now().UTC(),
	}
}

func (platform EBayKleinanzeigen) NewCollector(config Config) *colly.Collector {
	options := append(
		config.collectorOptions,
		colly.AllowedDomains("www.ebay-kleinanzeigen.de"))
	return colly.NewCollector(options...)
}

func (platform EBayKleinanzeigen) crawl(config Config, exporter Exporter) *colly.Collector {
	c := platform.NewCollector(config)

	// One Item per listing on the results page.
	c.OnXML("//*[contains(@class, 'ad-listitem')]", func(e *colly.XMLElement) {
		item := platform.parseItem(e)
		exporter.write(item)
	})

	// Follow the "next page" link until pagination runs out.
	c.OnXML("//a[contains(@class, 'pagination-next')]", func(e *colly.XMLElement) {
		url := e.Request.AbsoluteURL(e.Attr("href"))
		c.Visit(url)
	})

	c.Visit("https://www.ebay-kleinanzeigen.de/s-wohnung-mieten/berlin/c203l3331")
	return c
}
package cloudrun

import (
	"encoding/csv"
	"log"
	"os"
	"path"
	"reflect"

	"github.com/gocolly/colly"
)

type Exporter interface {
	write(record Item) error
}

type CSVExporter struct {
	writer   *csv.Writer
	fileName string
}

func (exp CSVExporter) write(record Item) error {
	return exp.writer.Write(record.csvRow())
}

// fields derives the CSV header from the Item struct's field names via reflection.
func (CSVExporter) fields() []string {
	val := reflect.ValueOf(&Item{}).Elem()
	names := make([]string, val.NumField())
	for i := 0; i < val.NumField(); i++ {
		names[i] = val.Type().Field(i).Name
	}
	return names
}

func (exp CSVExporter) run(config Config, fn func(Config, Exporter) *colly.Collector) {
	os.MkdirAll(path.Dir(exp.fileName), 0755) // 0755, not 755: file modes are octal
	file, err := os.Create(exp.fileName)
	if err != nil {
		log.Fatalf("Cannot create file %q: %s\n", exp.fileName, err)
		return
	}
	defer file.Close()

	exp.writer = csv.NewWriter(file)
	defer exp.writer.Flush()
	exp.writer.Write(exp.fields())

	collector := fn(config, exp)
	log.Printf("Scraping finished, check file %q for results\n", exp.fileName)
	log.Println(collector)
}
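Given the Item struct defined in the next file, fields() should produce this header row:

title,location,hasExactLocation,price,livingSpace,rooms,url,scrapedAt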
package cloudrun

import (
	"log"
	"net/url"
	"os"
	"path"
	"reflect"
	"strconv"
	"strings"
	"time"

	"github.com/gocolly/colly"
)

type Platform interface {
	crawl(config Config, writer Exporter) *colly.Collector
}

type Item struct {
	title            string
	location         string
	hasExactLocation bool
	price            int
	livingSpace      float64
	rooms            float64
	url              string
	scrapedAt        time.Time
}

type Config struct {
	dataDir          string
	platforms        []Platform
	storage          Storage
	collectorOptions []colly.CollectorOption
}

func (record Item) csvRow() []string {
	return []string{
		record.title,
		record.location,
		strconv.FormatBool(record.hasExactLocation),
		strconv.Itoa(record.price),
		strconv.FormatFloat(record.livingSpace, 'f', -1, 64),
		strconv.FormatFloat(record.rooms, 'f', -1, 64),
		record.url,
		record.scrapedAt.Format(time.RFC3339),
	}
}

func readConfig(params url.Values) Config {
	available := map[string]Platform{
		"ebay_kleinanzeigen": EBayKleinanzeigen{},
		"immobilien_scout":   ImmobilienScout{},
		"immowelt":           Immowelt{},
		"nestpick":           Nestpick{},
	}

	// Default to all platforms; "platform=<name>" narrows the run to one.
	platforms := make([]Platform, 0)
	for name := range available {
		platforms = append(platforms, available[name])
	}

	// "cache=1" makes colly cache responses on disk between runs.
	cache := params.Get("cache") == "1"
	var collectorOptions []colly.CollectorOption
	if cache {
		collectorOptions = append(collectorOptions, colly.CacheDir("cache"))
	}

	platform := params.Get("platform")
	if platform != "" {
		platforms = []Platform{available[platform]}
	}

	bucket, isDefined := os.LookupEnv("GCLOUD_BUCKET")
	if !isDefined {
		log.Fatalln("GCLOUD_BUCKET must be defined")
	}

	// Each run uploads under a timestamped prefix in the bucket.
	date := time.Now().UTC().Format(time.RFC3339)
	storage := GCloudStorage{
		bucket:          bucket,
		destinationPath: date + "/",
	}

	return Config{
		dataDir:          "/tmp/wohnung",
		platforms:        platforms,
		storage:          storage,
		collectorOptions: collectorOptions,
	}
}

func Run(params url.Values) string {
	config := readConfig(params)
	for _, platform := range config.platforms {
		// e.g. "cloudrun.EBayKleinanzeigen" -> "EBayKleinanzeigen.csv"
		fileName := strings.Split(reflect.TypeOf(platform).String(), ".")[1]
		fileName = path.Join(config.dataDir, fileName+".csv")
		exporter := CSVExporter{fileName: fileName}
		exporter.run(config, platform.crawl)
		config.storage.write(fileName)
	}
	return "it works"
}
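Each run therefore leaves one CSV per platform under a timestamped prefix in the bucket; with the variables above, the object names would look roughly like this (illustrative, not actual output):

gs://realestate-berlin-254211-items/2023-07-31T08:44:00Z/EBayKleinanzeigen.csv
gs://realestate-berlin-254211-items/2023-07-31T08:44:00Z/Immowelt.csv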
package cloudrun

import (
	"strconv"
	"strings"
)

// parsePrice converts a price label such as "450 €" into cents.
func parsePrice(valueStr string) (int, error) {
	value, err := parseFloat(valueStr, " €")
	return int(value * 100), err
}

func parseSpace(value string) (float64, error) {
	return parseFloat(value, " m²")
}

// parseFloat strips the unit, drops "." thousands separators and turns the
// German decimal comma into a dot before parsing.
func parseFloat(valueStr string, unit string) (float64, error) {
	replacer := strings.NewReplacer(",", ".", ".", "", unit, "")
	return strconv.ParseFloat(replacer.Replace(valueStr), 64)
}
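These helpers are easy to sanity-check; a hypothetical test file (not part of the gist) could look like:

package cloudrun

import "testing"

func TestParseHelpers(t *testing.T) {
	// "450 €" -> 45000 cents
	if cents, err := parsePrice("450 €"); err != nil || cents != 45000 {
		t.Errorf("parsePrice: got %d, %v", cents, err)
	}
	// "54 m²" -> 54 square metres
	if space, err := parseSpace("54 m²"); err != nil || space != 54.0 {
		t.Errorf("parseSpace: got %f, %v", space, err)
	}
	// German decimal comma: "2,5 Zimmer" -> 2.5 rooms
	if rooms, err := parseFloat("2,5 Zimmer", " Zimmer"); err != nil || rooms != 2.5 {
		t.Errorf("parseFloat: got %f, %v", rooms, err)
	}
}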
package cloudrun

import (
	"context"
	"io"
	"log"
	"os"
	"path"
	"path/filepath"

	cloud_storage "cloud.google.com/go/storage"
)

type Storage interface {
	write(filePath string)
}

type GCloudStorage struct {
	bucket          string
	destinationPath string
}

func (storage GCloudStorage) write(source string) {
	var r io.Reader
	f, err := os.Open(source)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	r = f

	ctx := context.Background()
	if err = storage.upload(ctx, r, source); err != nil {
		log.Fatal(err)
	}
}

func (storage GCloudStorage) upload(ctx context.Context, r io.Reader, source string) error {
	// NewClient picks up Application Default Credentials
	// (the Cloud Run service account in production).
	client, err := cloud_storage.NewClient(ctx)
	if err != nil {
		return err
	}

	bh := client.Bucket(storage.bucket)
	name := path.Join(storage.destinationPath, filepath.Base(source))
	obj := bh.Object(name)

	w := obj.NewWriter(ctx)
	if _, err := io.Copy(w, r); err != nil {
		return err
	}
	if err := w.Close(); err != nil {
		return err
	}
	return nil
}
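The storage type can also be exercised on its own; a minimal sketch, with a placeholder bucket and Application Default Credentials assumed in the environment:

storage := GCloudStorage{bucket: "my-bucket", destinationPath: "2023-07-31/"}
storage.write("/tmp/wohnung/EBayKleinanzeigen.csv") // -> gs://my-bucket/2023-07-31/EBayKleinanzeigen.csv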
@benkoshy
Awesome work. I don't understand exactly what it's doing, but it looks very nice.

@denniskribl
Interesting choice of tech. Nice work :)
