some spiders
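Four snippets for scraping maintenance ("baoyang") data from tuhu.cn: a Julia script that flattens the captured detail JSON into maintain.tsv, a Julia script that works out which entries of baoyang_list.json have not been fetched yet, a puppeteer fetcher that captures the GetBaoYangPackages.html response for each remaining entry, and a Scrapy spider that crawls the item.tuhu.cn product pages.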
using JSON2
using OhMyJulia
using ProgressMeter

f = open("maintain.tsv", "w")

# header row
prt(f, "service", "category", "product", "type", "tip", "info", "tags",
       "brand", "order_quantity", "sales_quantity", "comments", "price",
       "marketing_price", "unit", "name", "rate", "remark")

# print `nothing` fields as empty cells
import Base.print
print(::IOStream, ::Nothing) = nothing

# treat missing arrays as empty so the nested loops below simply skip them
box(::Nothing) = []
box(x) = x

@showprogress for line in readlines("baoyang_details.jsons")
    page = JSON2.read(line)
    for category in box(page.products), product in box(category.Items),
        item in box(product.Items), variant in box(item.Products)
        prt(f, page.title, category.CategoryName, product.ZhName, item.ZhName,
               product.SuggestTip, item.DataTip, join(map(x->x.Tag, variant.Tags), ','),
               variant.Product.Brand, variant.Product.OrderQuantity, variant.Product.SalesQuantity,
               variant.Product.CommentTimes, variant.Product.Price, variant.Product.MarketingPrice,
               variant.Product.Unit, variant.Product.DisplayName, variant.Product.RateNumber,
               variant.Product.Remark)
    end
end

close(f)
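prt comes from OhMyJulia, apparently the author's personal utility package. If that package is unavailable, a minimal stand-in, assuming prt just writes its arguments as one tab-separated row, could be:

# hypothetical stand-in for OhMyJulia.prt: assumes it emits one tab-separated line per call,
# rendering `nothing` as an empty cell (which is what the print override above achieves)
prt(io::IO, args...) = println(io, join((x === nothing ? "" : string(x) for x in args), '\t'))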
using JSON2

# detail records fetched so far
a = readlines("baoyang_details.jsons");
a = map(JSON2.read, a);

# full list of pages to fetch
b = JSON2.read(open("baoyang_list.json"))

# keep only the list entries whose page has not been captured yet
set = Set(x.url for x in a)
remaining = [x for x in b if x.href ∉ set]

f = open("baoyang_list_remaining.json", "w")
write(f, JSON2.write(remaining))
close(f)
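This script diffs the pages already captured in baoyang_details.jsons against the full list in baoyang_list.json and writes the still-missing entries to baoyang_list_remaining.json, which the puppeteer fetcher below consumes; rerunning the two in turn picks up pages that were lost to errors.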
const fs = require('fs')

require('puppeteer').launch({ args: ['--no-sandbox'] }).then(async browser => {
    let i = 0

    // open the page, capture the GetBaoYangPackages.html XHR response, append it as one JSON line
    async function getone(title, url) {
        const page = await browser.newPage()
        page.on('response', async res => {
            if (res.url().match(/GetBaoYangPackages\.html/)) {
                fs.appendFile("baoyang_details.jsons", JSON.stringify({
                    title, url, products: await res.json()
                }) + '\n', e => {})
                console.info(`${i++} succeed`)
            }
        })
        await page.goto(url)
        await new Promise((res, rej) => setTimeout(res, 50)) // give the response handler a moment to fire
        await page.close()
    }

    const list = JSON.parse(fs.readFileSync("baoyang_list_remaining.json"))

    // 8 workers pulling from the shared list
    await Promise.all([1,2,3,4,5,6,7,8].map(async () => {
        while (list.length) {
            try {
                const x = list.shift()
                await getone(x.title, x.href)
            } catch (e) { // errors are only logged; missing items are fixed in a later pass
                console.error(e)
            }
        }
    }))

    console.info("finished!")
    await browser.close()
})
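Running the fetcher requires Node.js with puppeteer installed (npm install puppeteer); it reads baoyang_list_remaining.json, works through it with eight concurrent tabs, and appends one JSON line per captured GetBaoYangPackages.html response to baoyang_details.jsons.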
import scrapy


class TuhuSpider(scrapy.Spider):
    name = 'tuhu'
    start_urls = ['https://item.tuhu.cn/List/AP/%s.html' % (i + 1) for i in range(395)]

    def parse_detail(self, res):
        info = res.css('#product_detail .proudct_info')  # the typo is the website's, not mine
        yield {
            "url": res.url,
            "category": res.css(".bread_navi a ::text").getall(),
            "title": info.css('.DisplayName ::text').get(),
            "properties": [{"name": x.css('span::text').get(), "value": x.css('::text').get()} for x in info.css('.properties li')],
            "normal_price": info.css('.normal_price .price strong::text').get(),
            "tuhu_price": info.css('.flashsale_price .price strong::text').get(),
            "number_of_comments": info.css('.buy_person .person_shu::text').get(),
            "rating": res.css('.comment_statistics .num::text').get(),
        }
        # follow the other variants linked from this detail page; scrapy deduplicates requests automatically
        for link in info.css("dd.unit > a ::attr(href)").getall():
            yield scrapy.Request(link, callback=self.parse_detail)

    def parse(self, response):
        # each listing page links to product detail pages
        for link in response.css('.cpli > a:nth-child(1)::attr(href)').getall():
            yield scrapy.Request(link, callback=self.parse_detail)
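The spider can be run without a full Scrapy project, e.g. scrapy runspider tuhu_spider.py -o tuhu_items.jl (the file names here are placeholders); it walks the 395 listing pages of item.tuhu.cn, follows each product link and each variant link found on a detail page, and yields one record per product page.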