some spiders
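Four snippets for scraping maintenance ("baoyang") data from tuhu.cn: a Julia script that flattens the captured detail JSON into maintain.tsv, a Julia script that works out which entries of baoyang_list.json have not been fetched yet, a puppeteer fetcher that captures the GetBaoYangPackages.html response for each remaining entry, and a Scrapy spider that crawls the item.tuhu.cn product pages.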
using JSON2
using OhMyJulia
using ProgressMeter

f = open("maintain.tsv", "w")

# header row
prt(f, "service", "category", "product", "type", "tip", "info", "tags",
       "brand", "order_quantity", "sales_quantity", "comments", "price",
       "marketing_price", "unit", "name", "rate", "remark")

# print `nothing` fields as empty cells
import Base.print
print(::IOStream, ::Nothing) = nothing

# treat missing arrays as empty so the nested loops below simply skip them
box(::Nothing) = []
box(x) = x

@showprogress for line in readlines("baoyang_details.jsons")
    page = JSON2.read(line)
    for category in box(page.products), product in box(category.Items),
        item in box(product.Items), variant in box(item.Products)
        prt(f, page.title, category.CategoryName, product.ZhName, item.ZhName,
               product.SuggestTip, item.DataTip, join(map(x->x.Tag, variant.Tags), ','),
               variant.Product.Brand, variant.Product.OrderQuantity, variant.Product.SalesQuantity,
               variant.Product.CommentTimes, variant.Product.Price, variant.Product.MarketingPrice,
               variant.Product.Unit, variant.Product.DisplayName, variant.Product.RateNumber,
               variant.Product.Remark)
    end
end

close(f)
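prt comes from OhMyJulia, apparently the author's personal utility package. If that package is unavailable, a minimal stand-in, assuming prt just writes its arguments as one tab-separated row, could be:

# hypothetical stand-in for OhMyJulia.prt: assumes it emits one tab-separated line per call,
# rendering `nothing` as an empty cell (which is what the print override above achieves)
prt(io::IO, args...) = println(io, join((x === nothing ? "" : string(x) for x in args), '\t'))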
using JSON2

# detail records fetched so far
a = readlines("baoyang_details.jsons");
a = map(JSON2.read, a);

# full list of pages to fetch
b = JSON2.read(open("baoyang_list.json"))

# keep only the list entries whose page has not been captured yet
set = Set(x.url for x in a)
remaining = [x for x in b if x.href ∉ set]

f = open("baoyang_list_remaining.json", "w")
write(f, JSON2.write(remaining))
close(f)
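This script diffs the pages already captured in baoyang_details.jsons against the full list in baoyang_list.json and writes the still-missing entries to baoyang_list_remaining.json, which the puppeteer fetcher below consumes; rerunning the two in turn picks up pages that were lost to errors.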
const fs = require('fs')

require('puppeteer').launch({ args: ['--no-sandbox'] }).then(async browser => {
    let i = 0

    // open the page, capture the GetBaoYangPackages.html XHR response, append it as one JSON line
    async function getone(title, url) {
        const page = await browser.newPage()
        page.on('response', async res => {
            if (res.url().match(/GetBaoYangPackages\.html/)) {
                fs.appendFile("baoyang_details.jsons", JSON.stringify({
                    title, url, products: await res.json()
                }) + '\n', e => {})
                console.info(`${i++} succeed`)
            }
        })
        await page.goto(url)
        await new Promise((res, rej) => setTimeout(res, 50)) // give the response handler a moment to fire
        await page.close()
    }

    const list = JSON.parse(fs.readFileSync("baoyang_list_remaining.json"))

    // 8 workers pulling from the shared list
    await Promise.all([1,2,3,4,5,6,7,8].map(async () => {
        while (list.length) {
            try {
                const x = list.shift()
                await getone(x.title, x.href)
            } catch (e) { // errors are only logged; missing items are fixed in a later pass
                console.error(e)
            }
        }
    }))

    console.info("finished!")
    await browser.close()
})
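Running the fetcher requires Node.js with puppeteer installed (npm install puppeteer); it reads baoyang_list_remaining.json, works through it with eight concurrent tabs, and appends one JSON line per captured GetBaoYangPackages.html response to baoyang_details.jsons.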
import scrapy


class TuhuSpider(scrapy.Spider):
    name = 'tuhu'
    start_urls = ['https://item.tuhu.cn/List/AP/%s.html' % (i + 1) for i in range(395)]

    def parse_detail(self, res):
        info = res.css('#product_detail .proudct_info')  # the typo is the website's, not mine
        yield {
            "url": res.url,
            "category": res.css(".bread_navi a ::text").getall(),
            "title": info.css('.DisplayName ::text').get(),
            "properties": [{"name": x.css('span::text').get(), "value": x.css('::text').get()} for x in info.css('.properties li')],
            "normal_price": info.css('.normal_price .price strong::text').get(),
            "tuhu_price": info.css('.flashsale_price .price strong::text').get(),
            "number_of_comments": info.css('.buy_person .person_shu::text').get(),
            "rating": res.css('.comment_statistics .num::text').get(),
        }
        # follow the other variants linked from this detail page; scrapy deduplicates requests automatically
        for link in info.css("dd.unit > a ::attr(href)").getall():
            yield scrapy.Request(link, callback=self.parse_detail)

    def parse(self, response):
        # each listing page links to product detail pages
        for link in response.css('.cpli > a:nth-child(1)::attr(href)').getall():
            yield scrapy.Request(link, callback=self.parse_detail)
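The spider can be run without a full Scrapy project, e.g. scrapy runspider tuhu_spider.py -o tuhu_items.jl (the file names here are placeholders); it walks the 395 listing pages of item.tuhu.cn, follows each product link and each variant link found on a detail page, and yields one record per product page.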