Skip to content

Instantly share code, notes, and snippets.

@harrywang
Created September 11, 2019 13:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save harrywang/d19dc2092014327c05b0a18ee3147546 to your computer and use it in GitHub Desktop.
Save harrywang/d19dc2092014327c05b0a18ee3147546 to your computer and use it in GitHub Desktop.
from scrapy.item import Item, Field
from scrapy.loader.processors import MapCompose, TakeFirst
from datetime import datetime
def remove_quotes(text):
    """Return *text* without the curly double quotes wrapping it.

    Only the characters U+201C (left) and U+201D (right) are removed, and
    only from the two ends of the string; quotes inside the text are kept.
    """
    curly_quotes = u'\u201c' + u'\u201d'
    stripped = text.strip(curly_quotes)
    return stripped
def convert_date(text):
    """Parse a date string such as ``March 14, 1879`` into a ``datetime``.

    Whitespace around the value is ignored, since scraped text frequently
    carries stray spaces or newlines.

    Args:
        text: date written as "<full month name> <day>, <year>".

    Returns:
        A naive ``datetime.datetime`` at midnight of the given day.

    Raises:
        ValueError: if the trimmed text does not match ``%B %d, %Y``.
    """
    # strptime rejects leading/trailing whitespace, so trim before parsing
    return datetime.strptime(text.strip(), '%B %d, %Y')
def parse_location(text):
    """Return the location from a string such as ``in Ulm, Germany``.

    Only the leading "in " is removed; the remainder is returned as-is and
    can be parsed further into city, state, country, etc. by the caller.

    Args:
        text: location text, normally prefixed with "in ".

    Returns:
        The text without its "in " prefix, or the text unchanged when the
        prefix is absent.
    """
    prefix = 'in '
    # Slice only when the prefix is really there; blindly dropping three
    # characters would eat the start of an un-prefixed location.
    if text.startswith(prefix):
        return text[len(prefix):]
    return text
class QuoteItem(Item):
    """Item describing one scraped quote together with its author details.

    Every field except ``tags`` declares an input processor that cleans each
    extracted value and a ``TakeFirst`` output processor, which returns the
    first value instead of the whole list.
    """

    # Quote text with the surrounding curly quotes removed.
    quote_content = Field(
        output_processor=TakeFirst(),
        input_processor=MapCompose(remove_quotes),
    )

    # Author name, whitespace-trimmed.
    author_name = Field(
        output_processor=TakeFirst(),
        input_processor=MapCompose(str.strip),
    )

    # Birthday text (e.g. "March 14, 1879") converted by convert_date.
    author_birthday = Field(
        output_processor=TakeFirst(),
        input_processor=MapCompose(convert_date),
    )

    # Birth place text (e.g. "in Ulm, Germany") cleaned by parse_location.
    author_bornlocation = Field(
        output_processor=TakeFirst(),
        input_processor=MapCompose(parse_location),
    )

    # Author biography, whitespace-trimmed.
    author_bio = Field(
        output_processor=TakeFirst(),
        input_processor=MapCompose(str.strip),
    )

    # No processors are declared for tags.
    tags = Field()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment