Skip to content

Instantly share code, notes, and snippets.

@jluczak
Last active August 3, 2017 13:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jluczak/c760f0e34f26a9d1a35697464004ce53 to your computer and use it in GitHub Desktop.
Save jluczak/c760f0e34f26a9d1a35697464004ce53 to your computer and use it in GitHub Desktop.
Scrape Google Experts
import scrapy
from scrapy.item import Item,Field
class ExpertItem(scrapy.Item):
    """Record holding the data scraped from a single expert profile page."""

    # Headline information shown at the top of the profile.
    name = scrapy.Field()
    # NOTE(review): "tangline" looks like a misspelling of "tagline"; the
    # key is kept as-is because the spider assigns to it under this name.
    tangline = scrapy.Field()

    # `file_urls` receives the profile image URLs from the spider; `files`
    # is the companion field Scrapy's FilesPipeline uses for its results.
    file_urls = scrapy.Field()
    files = scrapy.Field()

    # Location and social profile links.
    city = scrapy.Field()
    gplus = scrapy.Field()
    twitter = scrapy.Field()
    linkedin = scrapy.Field()

    # Free-text biography fragments and the list of skills.
    bio = scrapy.Field()
    skills = scrapy.Field()
# Scrapy settings for the "expert" project.
BOT_NAME = "expert"

# Existing spiders are looked up in, and new spiders are generated into,
# the same package.
NEWSPIDER_MODULE = "expert.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Honour the robots.txt rules of the crawled site.
ROBOTSTXT_OBEY = True

# Directory used by the files pipeline enabled below.
FILES_STORE = "expert_photos"

# Enable Scrapy's built-in files pipeline with order value 1.
ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}
from scrapy.spiders import CrawlSpider, Rule
from expert.items import ExpertItem
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
class MySpider(CrawlSpider):
    """Crawl the Google Developers Experts listing and scrape each profile.

    Starting from the experts listing page, every link extracted from the
    ``experts-body`` element is requested and passed to :meth:`parse_item`.
    Links on the profile pages themselves are not followed.
    """

    name = "expert"
    allowed_domains = ["developers.google.com"]
    start_urls = ["https://developers.google.com/experts/"]

    rules = (
        Rule(
            LxmlLinkExtractor(
                restrict_xpaths=".//*[@id='experts-body']//a",
            ),
            callback='parse_item',
            follow=False,
        ),
    )

    def parse_item(self, response):
        """Build one ``ExpertItem`` from a downloaded profile page.

        Args:
            response: The profile page response produced by the crawl rule.

        Yields:
            ExpertItem: The populated item. ``name``, ``tangline``,
            ``file_urls``, ``bio`` and ``skills`` are lists of strings;
            ``city``, ``gplus``, ``twitter`` and ``linkedin`` are a single
            string or ``None`` when the selector matches nothing.
        """
        # The response exposes .css() directly, so no explicit Selector
        # wrapper is needed.
        item = ExpertItem()
        item['name'] = response.css('.profile-header h2::text').extract()
        item['tangline'] = response.css('h2.main-title::text').extract()

        # The files pipeline turns every entry into a request, which needs
        # an absolute URL. urljoin resolves relative and protocol-relative
        # sources against the page URL and leaves absolute URLs unchanged.
        item['file_urls'] = [
            response.urljoin(src)
            for src in response.css('img.profile-img::attr(src)').extract()
        ]

        item['city'] = response.css('.location::text').extract_first()
        item['gplus'] = response.css('a.gplus::attr(href)').extract_first()
        item['twitter'] = response.css('a.twitter::attr(href)').extract_first()
        item['linkedin'] = response.css('a.linkedin::attr(href)').extract_first()

        # Only the third to fifth paragraph text nodes are kept.
        # NOTE(review): the slice positions presumably depend on the page
        # layout; confirm against a live profile page.
        item['bio'] = response.css('.profile-row p::text').extract()[2:5]
        item['skills'] = response.css('.profile-row li::text').extract()
        yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment