Created
September 11, 2019 13:42
-
-
Save harrywang/d19dc2092014327c05b0a18ee3147546 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.item import Item, Field | |
from scrapy.loader.processors import MapCompose, TakeFirst | |
from datetime import datetime | |
def remove_quotes(text):
    """Strip curly double quotes from both ends of *text*.

    Only the Unicode left (U+201C) and right (U+201D) double quotation
    marks are removed. Whitespace, ASCII quotes and any quote characters
    in the middle of the string are left untouched.
    """
    # str.strip treats its argument as a set of characters, so either
    # quote mark is removed from either end, however many times it repeats.
    return text.strip(u'\u201c\u201d')
def convert_date(text):
    """Parse a date written like ``March 14, 1879`` into a ``datetime``.

    The expected layout is full month name, day of month, a comma, then a
    four-digit year. The result is a ``datetime.datetime`` (not a
    ``datetime.date``) whose time component is midnight.

    Raises:
        ValueError: if *text* does not match the ``'%B %d, %Y'`` format.
    """
    # NOTE: %B matches the month name of the current locale.
    return datetime.strptime(text, '%B %d, %Y')
def parse_location(text):
    """Remove the leading ``"in "`` from a born-location string.

    For example ``"in Ulm, Germany"`` becomes ``"Ulm, Germany"``. The rest
    of the string is returned as-is; it can be parsed further into city,
    state, country, etc. if needed.

    A string that does not start with ``"in "`` is returned unchanged
    rather than losing its first three characters.
    """
    prefix = 'in '
    if text.startswith(prefix):
        return text[len(prefix):]
    return text
class QuoteItem(Item):
    """Scraped quote together with details about its author.

    Each field declares the processors an item loader applies to it:
    ``input_processor`` runs on the extracted values as they are added,
    and ``output_processor`` produces the value finally stored on the item.
    """

    # Quote text with the surrounding curly quotes stripped.
    quote_content = Field(
        input_processor=MapCompose(remove_quotes),
        # TakeFirst stores a single value instead of the whole list.
        output_processor=TakeFirst()
    )
    # Author name with leading/trailing whitespace removed.
    author_name = Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst()
    )
    # Birthday parsed from text such as "March 14, 1879" by convert_date.
    author_birthday = Field(
        input_processor=MapCompose(convert_date),
        output_processor=TakeFirst()
    )
    # Birthplace text after parse_location has processed it.
    author_bornlocation = Field(
        input_processor=MapCompose(parse_location),
        output_processor=TakeFirst()
    )
    # Author biography with leading/trailing whitespace removed.
    author_bio = Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst()
    )
    # No processors declared here, so the loader's defaults apply;
    # presumably this keeps the full list of tags -- confirm against the
    # loader used by the spider.
    tags = Field()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment