Skip to content

Instantly share code, notes, and snippets.

@rafikahmed
Created May 26, 2020 12:48
Show Gist options
  • Save rafikahmed/70a38deb6ff11c2ad02173db6689d1a0 to your computer and use it in GitHub Desktop.
Save rafikahmed/70a38deb6ff11c2ad02173db6689d1a0 to your computer and use it in GitHub Desktop.
class MySpider(scrapy.Spider):
    """Paginate an ASP.NET ``__doPostBack``-driven grid through Splash.

    Each page change is triggered by running the grid's postback JavaScript
    inside Splash, then re-rendering the page and handing the HTML back to
    ``parse``.
    """

    name = 'example'
    # Page counter; pagination stops after page 5.
    current_page = 1

    # Lua script for the Splash 'execute' endpoint: load the page, run the
    # caller-supplied JS (the __doPostBack pagination call), return the HTML.
    # NOTE: the Splash API method is splash:runjs (the original said run_js,
    # which is not a Splash method and would fail the assert).
    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(1))
        assert(splash:runjs(args.js_source))
        assert(splash:wait(1))
        return splash:html()
    end
    '''

    def start_requests(self):
        # Initial request — no js_source needed yet; Splash just renders the
        # first page of the grid.
        yield SplashRequest(
            url='YOUR_URL',
            callback=self.parse,
            endpoint='execute',
            args={'lua_source': self.script},
        )

    def parse(self, response):
        # ... parse your data here ...

        # Pagination: ask Splash to fire the grid's __doPostBack handler for
        # the next page. dont_filter=True because the URL never changes.
        if self.current_page <= 5:
            self.current_page += 1
            js_source = (
                "javascript:__doPostBack('ctl00$MainContent$gvReport',"
                f"'Page${self.current_page}')"
            )
            yield SplashRequest(
                url='YOUR_URL',
                callback=self.parse,
                endpoint='execute',
                args={'lua_source': self.script, 'js_source': js_source},
                dont_filter=True,
            )
-- Splash Lua script for the 'execute' endpoint: render args.url, run the
-- caller-supplied JavaScript (args.js_source), and return the final HTML.
-- (This fragment duplicates the `script` string in the spider above.)
function main(splash, args)
-- disable private mode so sites relying on localStorage render correctly
splash.private_mode_enabled = false
url = args.url
assert(splash:go(url))
-- give the page a moment to load before injecting JS
assert(splash:wait(1))
-- NOTE(review): the Splash API spells this splash:runjs — run_js is not a
-- documented Splash method and this assert likely fails; confirm.
assert(splash:run_js(args.js_source))
-- wait for the postback-triggered re-render to complete
assert(splash:wait(1))
return splash:html()
end
@kazekage92
Copy link

Trying to loop through multiple stocks, but their results get mixed together.

`
import scrapy
from scrapy_splash import SplashRequest

class StockQrSpider(scrapy.Spider):
    """Scrape quarterly-report tables for several stocks from malaysiastock.biz.

    The original version stored ``stock_url`` and ``current_page`` as shared
    class-level state and mutated them from concurrently-scheduled requests,
    which is why results for different stocks got mixed together. Per-request
    state is now carried in ``request.meta`` so each stock paginates
    independently.
    """

    name = 'stock_qr'
    allowed_domains = ['www.malaysiastock.biz']

    # Deprecated shared state, kept for backward compatibility only; the
    # spider no longer reads or writes these (see request.meta instead).
    current_page = 1
    stock_url = ""

    BASE_URL = 'https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode='

    # Splash script for the first page of each stock (no JS injection needed).
    # NOTE: attribute name keeps the original's "inital" typo so any external
    # reference still resolves.
    inital_script = '''
        function main(splash, args)
            splash.private_mode_enabled = true
            assert(splash:go(args.url))
            assert(splash:wait(3))
            return splash:html()
        end
    '''

    # Splash script for subsequent pages: run the grid's __doPostBack JS,
    # wait for the re-render, return the HTML.
    script = '''
        function main(splash, args)
            splash.private_mode_enabled = true
            assert(splash:go(args.url))
            assert(splash:wait(3))
            assert(splash:runjs(args.js_source))
            assert(splash:wait(5))
            return splash:html()
        end
    '''

    def __init__(self, *args, **kwargs):
        # Forward to scrapy.Spider so name/settings wiring still happens.
        super().__init__(*args, **kwargs)
        self.stock_code = ['1155', '6888', '7029']

    def start_requests(self):
        # Initial request per stock — doesn't require the js_source script.
        # Each request carries its own URL and page counter in meta so
        # concurrent responses can't clobber each other.
        for code in self.stock_code:
            stock_url = self.BASE_URL + code
            yield SplashRequest(
                url=stock_url,
                callback=self.parse_qr,
                endpoint='execute',
                args={'lua_source': self.inital_script},
                meta={'stock_url': stock_url, 'current_page': 1},
            )

    def parse_qr(self, response):
        """Yield one item per table row, then request the next grid page."""
        stock_url = response.meta.get('stock_url', response.url)
        current_page = response.meta.get('current_page', 1)

        # The stock name is page-level, not row-level: query it once instead
        # of re-running the same whole-document xpath for every row.
        stockname = response.xpath(
            "//div[contains(@id, 'MainContent_Panel')][1]/div/h1/label/text()").get()

        # All data rows; the last <tr> is the pager, so exclude it.
        rows = response.xpath(
            "//table[contains(@id, 'MainContent_gvReport')]/tbody/tr[position()<last()]")
        for row in rows:
            yield {
                'stock_name': stockname,
                'date': row.xpath(".//td[1]/text()").get(),
                'financial_year': row.xpath(".//td[2]//text()").get(),
                'number': row.xpath(".//td[3]//text()").get(),
                'financial_quarter': row.xpath(".//td[4]//text()").get(),
                'revenue': row.xpath(".//td[5]//text()").get(),
                'profit_before_tax': row.xpath(".//td[6]//text()").get(),
                'net_profit': row.xpath(".//td[7]//text()").get(),
                'earnings_per_share': row.xpath(".//td[8]//text()").get(),
                'dividend': row.xpath(".//td[9]//text()").get(),
                'net_tangible_asset': row.xpath(".//td[10]//text()").get(),
                'User-Agent': response.request.headers['User-Agent'],
            }

        # Last pager cell holds the highest page number.
        last_page = response.xpath(
            "//table[contains(@id, 'MainContent_gvReport')]/tbody/tr[position()=last()]"
            "/td/table/tbody/tr/td[position()=last()]/a/text()").get()

        # Strict '<' fixes the original off-by-one, which requested one page
        # past the end (current_page <= last_page then +1).
        if last_page and current_page < int(last_page):
            next_page = current_page + 1
            js_source = (
                "javascript:__doPostBack('ctl00$MainContent$gvReport',"
                f"'Page${next_page}')"
            )
            # dont_filter=True: the URL is identical for every page of a stock.
            yield SplashRequest(
                url=stock_url,
                callback=self.parse_qr,
                endpoint='execute',
                args={'lua_source': self.script, 'js_source': js_source},
                meta={'stock_url': stock_url, 'current_page': next_page},
                dont_filter=True,
            )

`

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment