Skip to content

Instantly share code, notes, and snippets.

@conredwang
Created November 11, 2016 21:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save conredwang/a49b7d9e52b4444e3fe2c69ba5b72556 to your computer and use it in GitHub Desktop.
Save conredwang/a49b7d9e52b4444e3fe2c69ba5b72556 to your computer and use it in GitHub Desktop.
Scraping Skyscrapers data using Scrapy. Code 2 of 3. pipelines.py
# -*- coding: utf-8 -*-
# author : Conred Wang
# title : Web scraping using Scrapy. Code 2 of 3. pipelines.py
class WriteItemPipeline(object):
    """Scrapy item pipeline that appends each item to a pipe-delimited file.

    Every processed item becomes one line in ``tall.txt``: the fields listed
    in ``_FIELDS``, in that order, separated by ``|`` and terminated by a
    newline. The output is pure ASCII; characters that cannot be represented
    in ASCII are dropped.
    """

    # Output columns, in the order they are written.
    # 'blgUrl' is deliberately not exported; append it here when debugging.
    _FIELDS = (
        'hgtRank', 'hgtFeet', 'blgName', 'blgCity', 'blgCountry',
        'blgFloor', 'blgPurpose', 'isMultiPurpose',
        'forOffice', 'forResidential', 'forHotel', 'forRetail',
        'yrPropose', 'yrStart', 'yrComplete',
        'tmProposeStart', 'tmProposeComplete', 'tmStartComplete',
    )

    def __init__(self):
        self.filename = 'tall.txt'
        # Set by open_spider(); None while no output file is open.
        self.file = None

    def open_spider(self, spider):
        """Open (and truncate) the output file in binary mode."""
        self.file = open(self.filename, 'wb')

    def close_spider(self, spider):
        """Close the output file if it was opened."""
        # Guard so a spider that failed before open_spider() does not raise
        # AttributeError on shutdown.
        if self.file is not None:
            self.file.close()
            self.file = None

    @staticmethod
    def _to_ascii(value):
        """Return ``value`` as ASCII bytes, dropping non-ASCII characters.

        Scraped text (building names, cities, ...) may contain characters
        such as u'\\u2016' or u'\\u2022'. Calling str() on those raises
        UnicodeEncodeError on Python 2, and mixing str with bytes raises
        TypeError on Python 3, so every field goes through this single
        conversion instead.
        """
        text_type = type(u'')
        if isinstance(value, bytes):
            value = value.decode('utf-8', 'ignore')
        elif not isinstance(value, text_type):
            # Numbers, booleans, None, ...: use their text representation.
            value = text_type(value)
        return value.encode('ascii', 'ignore')

    def process_item(self, item, spider):
        """Write ``item`` as one ``|``-separated line and return it unchanged.

        Raises whatever ``item[name]`` raises (``KeyError`` for a plain
        dict) when one of the ``_FIELDS`` entries is missing.
        """
        columns = [self._to_ascii(item[name]) for name in self._FIELDS]
        self.file.write(b'|'.join(columns) + b'\n')
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment