Skip to content

Instantly share code, notes, and snippets.

@soypat
Last active April 18, 2022 21:14
Show Gist options
  • Save soypat/3728d463adeafe282dc9ef757cc59f2f to your computer and use it in GitHub Desktop.
Save soypat/3728d463adeafe282dc9ef757cc59f2f to your computer and use it in GitHub Desktop.
Get SYC component data using a web scraper
package main
import (
"fmt"
"log"
"strconv"
"strings"
wd "github.com/fedesog/webdriver"
)
// Hostnames of the supported shops. main compares the host of the browser's
// current URL against these to pick the matching scraper.
const (
// sycHost selects getSYCArticle.
sycHost = "www.sycelectronica.com.ar"
// mlHost selects getMLArticle.
mlHost = "articulo.mercadolibre.com.ar"
)
// article holds the data scraped from a single product page.
type article struct {
// Title is the product title text.
Title string
// Category is the product category text.
Category string
// SKU is the shop's identifier for the product. main skips articles whose
// SKU is empty.
SKU string
// USD is the price in US dollars.
USD float64
// Tax as a percent. i.e 21.5 is 21.5%
TaxPercent float64
// Availability is the availability text shown on the page.
Availability string
// Image is the src attribute of the product image (only set by getSYCArticle).
Image string
// URL is the URL of the page the article was scraped from.
URL string
}
// excelify renders the article as a single tab-separated row (without a
// trailing newline) suitable for pasting into a spreadsheet. The columns are:
// quantity, title, URL, SKU, USD price, tax percent, category, availability.
func (a article) excelify(quantity int) string {
	columns := []string{
		strconv.Itoa(quantity),
		a.Title,
		a.URL,
		a.SKU,
		a.usdString(),
		a.ivaString(),
		a.Category,
		a.Availability,
	}
	return strings.Join(columns, "\t")
}
// usdString formats the USD price with three decimals, using Decimal in
// place of the decimal point.
func (a article) usdString() string {
	formatted := fmt.Sprintf("%.3f", a.USD)
	// Only the first '.' is swapped; a fixed-point float has at most one.
	return strings.Replace(formatted, ".", Decimal, 1)
}
// ivaString formats the tax percentage with one decimal, using Decimal in
// place of the decimal point.
func (a article) ivaString() string {
	formatted := fmt.Sprintf("%.1f", a.TaxPercent)
	// Only the first '.' is swapped; a fixed-point float has at most one.
	return strings.Replace(formatted, ".", Decimal, 1)
}
// getMLArticle scrapes the MercadoLibre article page currently open in s.
//
// The price shown on the page is read as pesos and converted to USD with
// USD2Pesos. If the price text cannot be parsed, the problem is logged and a
// zero-value article is returned (its empty SKU lets the caller detect this).
func getMLArticle(s *wd.Session) (a article) {
	rawPrice := queryText(s, `span.andes-money-amount__fraction`)
	// NOTE(review): commas are treated as decimal separators; if the page uses
	// '.' to group thousands the parsed amount would be off — confirm against
	// the live markup.
	normalized := strings.ReplaceAll(rawPrice, ",", ".")
	pesos, err := strconv.ParseFloat(normalized, 64)
	if err != nil {
		log.Println("could not parse ML price: ", rawPrice)
		return a
	}
	// Calls inside the literal run in lexical order, so the page is queried
	// in the order the fields are listed.
	a = article{
		USD:          pesos / USD2Pesos,
		Title:        queryText(s, `h1.ui-pdp-title`),
		Availability: queryText(s, `span.ui-pdp-buybox__quantity__available`),
		Category:     queryText(s, `li > a.andes-breadcrumb__link`),
		SKU:          getMLSKU(s),
	}
	// Best effort: the URL is left as returned when the lookup fails.
	a.URL, _ = s.GetUrl()
	return a
}
// getMLSKU extracts the article identifier from the bookmark form on a
// MercadoLibre article page. It takes the form's "action" attribute, splits
// it on "/" and returns the fourth segment. Any failure is logged and yields
// the empty string.
func getMLSKU(s *wd.Session) string {
	// Index of the "/"-separated segment of the action attribute that is
	// returned as the SKU.
	const skuSegment = 3

	form, err := query(s, `div.ui-pdp-bookmark__link-bookmark > form`)
	if err != nil {
		log.Println("unable to find MLSKU element")
		return ""
	}
	action, err := form.GetAttribute("action")
	if err != nil {
		log.Println("could not find attribute for MLSKU element")
		return ""
	}
	segments := strings.Split(action, "/")
	if len(segments) <= skuSegment {
		log.Println("MLSKU attribute could not be parsed")
		return ""
	}
	return segments[skuSegment]
}
// getSYCArticle scrapes the SYC Electronica product page currently open in s.
//
// When the page has no product description element a zero-value article is
// returned. The price line is expected to contain at least two numbers: the
// first is stored as the USD price and the second as the tax percentage. A
// price line that does not match is logged and both fields stay zero.
func getSYCArticle(s *wd.Session) (a article) {
	if _, err := query(s, `div.product_description`); err != nil {
		// Not a product page.
		return a
	}
	a.Category = queryText(s, `div.product_category`)
	a.SKU = queryText(s, `div.product_name`)
	a.Title = queryText(s, `div.product_text`)
	a.Availability = queryText(s, `div[property=availability] > div`)

	priceLine := queryText(s, `div.product_price`)
	// Commas are converted to dots so ParseNumbers can read them as decimals.
	values := ParseNumbers(strings.ReplaceAll(priceLine, ",", "."))
	badPrice := len(values) < 2 || values[0] <= 0
	if !badPrice {
		a.USD = values[0]
		a.TaxPercent = values[1]
	} else {
		log.Printf("got bad price line %q. nums:%g\n", priceLine, values)
	}

	// The image is optional; a missing element simply leaves Image empty.
	if picture, err := query(s, `div.image_selected > a > img`); err == nil {
		a.Image, _ = picture.GetAttribute("src")
	}
	a.URL, _ = s.GetUrl()
	return a
}
// ParseNumbers reads all numbers from a string and returns
// them in the order they are found as floats.
//
// A candidate number starts at a digit or '.', includes a directly preceding
// '-', and extends over the following run of digits, '.', '+', '-', 'e' and
// 'E'. Trailing non-digit characters of that run are trimmed before parsing,
// so a number followed by punctuation or a sign ("12.5." at the end of a
// sentence, "12.50+IVA", "12EUR") is still recognised instead of being
// dropped. Runs that still do not form a valid float (for example "1.2.3" or
// "10-20") are skipped. The result is nil when no number is found.
func ParseNumbers(s string) (nums []float64) {
	start := -1 // byte index where the current candidate begins; -1 if none.
	for i, c := range s {
		isNum := isNumRune(c)
		if start < 0 && (isNum || c == '.') {
			// Start of a number found.
			start = i
			if i > 0 && s[i-1] == '-' {
				// Back up one place so the sign is part of the number.
				start--
			}
		}
		if start < 0 {
			continue
		}
		atEnd := i == len(s)-1
		if isFloatRune(c) && !atEnd {
			// Still inside the candidate.
			continue
		}
		end := i
		if isNum {
			// Only reachable on the last byte: include the final digit.
			end++
		}
		// Trailing '.', signs and exponent markers carry no value but would
		// make ParseFloat reject an otherwise valid number.
		candidate := strings.TrimRight(s[start:end], ".+-eE")
		if num, err := strconv.ParseFloat(candidate, 64); err == nil {
			nums = append(nums, num)
		}
		start = -1 // reset start to begin looking for a new number.
	}
	return nums
}

// isNumRune reports whether r is an ASCII decimal digit.
func isNumRune(r rune) bool { return '0' <= r && r <= '9' }

// isFloatRune reports whether r may appear inside a decimal floating point
// literal: a digit, '.', a sign, or an exponent marker.
func isFloatRune(r rune) bool {
	switch r {
	case '.', 'e', 'E', '+', '-':
		return true
	}
	return isNumRune(r)
}
// Install chromedriver from https://chromedriver.chromium.org/home before running this program.
package main
import (
"bufio"
"log"
"net/url"
"os"
"strconv"
"strings"
wd "github.com/fedesog/webdriver"
)
// Program configuration.
const (
// platform is sent as the "Platform" capability when the browser session is created.
platform = "Linux"
// urlStart is the page the browser navigates to on startup.
urlStart = "https://www.sycelectronica.com.ar/"
// chromedriver required. Get it at https://chromedriver.chromium.org/home
driverPath = "/home/pato/local/bin/chromedriver"
// Decimal is the decimal separator used when formatting numbers for the output rows.
Decimal = ","
// USD2Pesos is the exchange rate (pesos per US dollar) used to convert
// MercadoLibre prices to USD.
USD2Pesos = 200.0
)
// main opens a browser session and then reads quantities from standard
// input, one per line. For every valid quantity it scrapes the article page
// currently open in the browser and appends a tab-separated row to
// articles.tsv. Lines that are not integers, pages on unknown hosts, and
// pages without a SKU are reported and skipped.
func main() {
	driver, err := openBrowser()
	if err != nil {
		log.Fatal(err)
	}
	fp, err := os.OpenFile("articles.tsv", os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0644)
	if err != nil {
		log.Fatal(err)
	}
	defer fp.Close()

	scanner := bufio.NewScanner(os.Stdin)
	log.Println("Navigate to article and enter quantity of articles desired:")
	for scanner.Scan() {
		// Tolerate stray whitespace (e.g. a trailing "\r") around the number.
		quantity, err := strconv.Atoi(strings.TrimSpace(scanner.Text()))
		if err != nil {
			log.Println("invalid input. must be integer")
			continue
		}
		var art article
		host := getHost(driver)
		switch host {
		case sycHost:
			art = getSYCArticle(driver)
		case mlHost:
			art = getMLArticle(driver)
		default:
			log.Println("unknown host or got error at ", host)
			continue
		}
		if art.SKU == "" {
			log.Println("could not get article SKU. Are you on a article page?")
			continue
		}
		log.Println("writing", quantity, art.Title)
		if _, err := fp.WriteString(art.excelify(quantity) + "\n"); err != nil {
			log.Fatal(err)
		}
	}
	// Scanner.Err only reports a read failure once Scan has returned false,
	// so it has to be checked after the loop rather than inside it.
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
}
// openBrowser starts chromedriver from driverPath, creates a browser session
// and navigates it to urlStart.
//
// On success the ready session is returned. On failure the error is returned
// to the caller (instead of terminating the process here), the session is nil,
// and whatever was already started is shut down again so that no chromedriver
// process or browser window is left behind.
func openBrowser() (*wd.Session, error) {
	chromeDriver := wd.NewChromeDriver(driverPath)
	if err := chromeDriver.Start(); err != nil {
		return nil, err
	}
	desired := wd.Capabilities{"Platform": platform}
	required := wd.Capabilities{"Platform": platform}
	session, err := chromeDriver.NewSession(desired, required)
	if err != nil {
		// Best-effort cleanup; the original error is the one worth reporting.
		chromeDriver.Stop()
		return nil, err
	}
	if err := session.Url(urlStart); err != nil {
		// Best-effort cleanup; the original error is the one worth reporting.
		session.Delete()
		chromeDriver.Stop()
		return nil, err
	}
	return session, nil
}
// queryText returns the text of the first element matching querySelector,
// with tab characters removed so the value cannot break a tab-separated row.
// If the element cannot be found or its text cannot be read, the error is
// logged and the empty string is returned.
func queryText(s *wd.Session, querySelector string) string {
	el, err := query(s, querySelector)
	if err != nil {
		log.Println("queryText failed:", err)
		return ""
	}
	str, err := el.Text()
	if err != nil {
		// Previously ignored; report it the same way as a failed lookup.
		log.Println("queryText failed:", err)
		return ""
	}
	// NOTE(review): newlines are kept as-is; confirm whether multi-line
	// element text can reach the output rows.
	return strings.ReplaceAll(str, "\t", "") // eliminate tabular separators
}
// querys returns every element on the current page that matches the CSS
// selector querySelector.
func querys(s *wd.Session, querySelector string) ([]wd.WebElement, error) {
	byCSS := wd.FindElementStrategy("css selector")
	return s.FindElements(byCSS, querySelector)
}
// query looks up a single element on the current page using the CSS selector
// querySelector.
func query(s *wd.Session, querySelector string) (wd.WebElement, error) {
	byCSS := wd.FindElementStrategy("css selector")
	return s.FindElement(byCSS, querySelector)
}
// attrQuery returns the attribute attrName of the element matching
// querySelector. It returns the empty string when the element cannot be found
// or the attribute cannot be read.
func attrQuery(s *wd.Session, attrName, querySelector string) string {
	el, err := query(s, querySelector)
	if err != nil {
		return ""
	}
	if value, err := el.GetAttribute(attrName); err == nil {
		return value
	}
	return ""
}
// mouseClickSelector moves the mouse to the element matching querySelector
// and clicks it. The session is returned unchanged together with the first
// error encountered, if any.
func mouseClickSelector(s *wd.Session, querySelector string) (*wd.Session, error) {
	target, err := query(s, querySelector) // button selector
	if err != nil {
		return s, err
	}
	if err := s.MoveTo(target, 0, 0); err != nil {
		return s, err
	}
	// The zero value of wd.MouseButton is used; presumably the primary
	// (left) button — confirm against the webdriver package.
	var button wd.MouseButton
	return s, s.Click(button)
}
// getHost returns the host part of the URL currently open in s, or the empty
// string if that URL cannot be parsed.
func getHost(s *wd.Session) string {
	// The GetUrl error is deliberately dropped; whatever string comes back is
	// handed to url.Parse.
	current, _ := s.GetUrl()
	if parsed, err := url.Parse(current); err == nil {
		return parsed.Host
	}
	return ""
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment