Skip to content

Instantly share code, notes, and snippets.

@amirulasyraf88
Created December 31, 2015 16:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amirulasyraf88/55b5938dd220c5d883e7 to your computer and use it in GitHub Desktop.
Save amirulasyraf88/55b5938dd220c5d883e7 to your computer and use it in GitHub Desktop.
Spider Web Crawler [ Mudah.my ]
# Extract the car listings from the first page of http://www.mudah.my/malaysia/cars-for-sale
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from items import NewsItem
# Path of the CSV file that collects the scraped rows (opened in append mode).
filename = "output.csv"
# Column names; their order is the order of the values in every written row.
fields = ["headline", "price", "url"]

# Write the header row only when the file is still empty. The file is opened
# in append mode, so writing the header unconditionally would insert another
# header line in the middle of the data each time this module is loaded.
with open(filename, 'a+') as f:
    f.seek(0, 2)  # move to end-of-file so tell() reflects the current size
    if f.tell() == 0:
        f.write("{}\n".format(';'.join(str(field) for field in fields)))
class BbcSpider(CrawlSpider):
    """Spider that appends car listings from mudah.my to the CSV output file.

    Each response for ``start_urls`` is handled by :meth:`parse`, which
    extracts the listing URL, headline and price and appends one
    ``;``-separated row per listing to the module-level ``filename``.
    """

    name = "mudahspider"
    allowed_domains = ["mudah.my"]
    start_urls = ('http://www.mudah.my/malaysia/cars-for-sale',)

    # NOTE(review): no ``rules`` are defined, and ``parse`` is overridden
    # directly. Scrapy's documentation advises against overriding ``parse``
    # on a CrawlSpider when rules are in use -- confirm that a plain Spider
    # would not be the better base class here.
    def parse(self, response):
        """Append one row per listing found in *response* to the CSV file.

        The values are written in the order given by the module-level
        ``fields`` list, joined with ``;``. Nothing is returned.
        """
        hxs = Selector(response)
        containers = hxs.xpath('//div[@id="ContainerMain"]')
        with open(filename, 'a+') as f:
            for container in containers:
                story = NewsItem()
                # The leading "." keeps each query relative to the current
                # container; a bare "//" would search the whole document.
                story['url'] = container.xpath(
                    './/div[@class="top_params"]/h2[@class="list_title truncate"]/a/@href'
                ).extract()
                story['headline'] = container.xpath(
                    './/div[@class="top_params"]/h2[@class="list_title truncate"]/a/@title'
                ).extract()
                story['price'] = container.xpath(
                    './/div[@class="middle_params"]/div[@class="ads_price"]/meta[@itemprop="price"]/@content'
                ).extract()

                columns = [story[field] for field in fields]
                if len(set(len(column) for column in columns)) > 1:
                    # The three lists are matched up by position only, so a
                    # listing missing one value shifts the remaining rows.
                    self.logger.warning(
                        "Field lists differ in length (%s); rows may be misaligned",
                        ", ".join(
                            "{}={}".format(field, len(column))
                            for field, column in zip(fields, columns)
                        ),
                    )

                # zip() walks every extracted listing, including the last one,
                # and stops at the shortest list instead of raising IndexError.
                for row in zip(*columns):
                    f.write("{}\n".format(';'.join(str(value) for value in row)))  # write items
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment