Skip to content

Instantly share code, notes, and snippets.

@notsobad
Created July 22, 2018 12:30
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save notsobad/e24d6f9fc6f5e54ff91415b93466d50b to your computer and use it in GitHub Desktop.
Save notsobad/e24d6f9fc6f5e54ff91415b93466d50b to your computer and use it in GitHub Desktop.
Page parser using headless Chrome.
const puppeteer = require('puppeteer');

// Listing page to scrape, taken from the first command-line argument.
const url = process.argv[2];

/**
 * Resolves after `ms` milliseconds.
 *
 * Used instead of Puppeteer's numeric `page.waitFor`, so the script does not
 * depend on a page-level wait helper being available.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Collects one record per product list item.
 *
 * This function is handed to `page.evaluate`, so it executes in the page
 * context and must not reference anything from the Node side.
 *
 * @returns {{products: Array<{name: ?string, vendor: ?string, price: ?string, sold: ?string, rate: ?string}>}}
 */
function extractProducts() {
  const blocks = document.querySelectorAll('div.product-list.m-bg-white > ul > li');
  const products = Array.from(blocks, (block) => {
    const name = block.querySelector('.view-info > h1 > a');
    const vendor = block.querySelector('.view-info > div.line.cfg.mt6 > a');
    const price = block.querySelector('.view-price > div > span');
    const rate = block.querySelector('.view-desc > div.line.info.mt6 > div > i');
    const sold = block.querySelector('.view-desc > div.line.cfg.mt6');
    // `node && node.prop` (rather than `?.`) yields null for a missing node,
    // so every key is still present after JSON serialization.
    return {
      name: name && name.innerText,
      vendor: vendor && vendor.innerText,
      price: price && price.innerText,
      sold: sold && sold.innerText,
      // The rating is read from the element's inline style width.
      rate: rate && rate.style.width,
    };
  });
  return { products };
}

(async () => {
  if (!url) {
    // Diagnostics go to stderr so stdout carries nothing but the JSON result.
    console.error('Usage: node <script> <url>');
    process.exitCode = 1;
    return;
  }

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle0' });
    // Extra grace period after the network goes idle before reading the DOM.
    await delay(1500);
    const info = await page.evaluate(extractProducts);
    info['url'] = url;
    console.log(JSON.stringify(info));
  } finally {
    // Always shut the browser down, even when navigation or evaluation fails,
    // so no browser process is left behind.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
#coding=utf8
"""Merge the per-page JSON files under ./out into one tab-separated out.csv.

Each ./out/<i>.json is expected to hold an object with a 'products' list;
every product becomes one row with the columns listed in ``fields``.
"""
import json
import os
import sys
import csv
from mycsv import UnicodeWriter

# Column order of the generated rows.
fields = ['name', 'vendor', 'price', 'sold', 'rate']

# 'wb' plus UnicodeWriter is presumably the Python 2 csv recipe -- confirm
# against the project-local mycsv module before porting to Python 3.
with open('out.csv', 'wb') as csv_out:
    writer = UnicodeWriter(csv_out, delimiter="\t")
    for i in range(1, 466):
        name = './out/%s.json' % i
        if not os.path.isfile(name):
            print("%s not exists" % name)
            continue
        # Close each input file promptly instead of leaking the handle.
        with open(name) as json_in:
            try:
                obj = json.load(json_in)
            except ValueError:
                # An empty or truncated page file must not abort the whole
                # export; report it and move on to the next page.
                print("%s is not valid JSON, skipped" % name)
                continue
        for p in obj['products']:
            # Missing keys and null values both become empty cells.
            row = [p.get(n) or '' for n in fields]
            writer.writerow(row)
            print(p)
# Scrape listing pages 9..465, one page per iteration, saving each page's
# JSON to out/<page>.json while also echoing it to the terminal.
# Make sure the output directory exists; otherwise tee cannot write the files.
mkdir -p out
for i in {9..465}; do
    # timeout ends a scrape that runs longer than 35 seconds.
    timeout 35 node x3.js "https://market.aliyun.com/products/?spm=5176.730005-52734001.0.0.F9SiNi&priceTag=1-&pageIndex=$i" | tee "out/$i.json"
    # Pause between requests.
    sleep 5
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment