Skip to content

Instantly share code, notes, and snippets.

@IaroslavR
Created September 11, 2017 17:44
Show Gist options
  • Save IaroslavR/5052889cf76cb083ddc3246bc2e66826 to your computer and use it in GitHub Desktop.
Save IaroslavR/5052889cf76cb083ddc3246bc2e66826 to your computer and use it in GitHub Desktop.
import logging
import re
from lawyers.spiders.common import CommonSpider, kayessian_intersection
# Literal prefix that marks a "languages" line on a profile page.
_LANGUAGES_PREFIX = 'Speaks'


def _parse_languages(texts):
    """Return the remainder of every entry that starts with 'Speaks'.

    Only the leading 'Speaks' prefix is removed. The previous implementation
    used ``str.strip('Speaks')``, which treats its argument as a *set of
    characters* and trims both ends, so it also ate trailing letters
    (e.g. 'Speaks Japanese' became ' Japan').

    Whitespace following the prefix is left untouched, as before.
    """
    return [
        text[len(_LANGUAGES_PREFIX):]
        for text in texts
        if text.startswith(_LANGUAGES_PREFIX)
    ]


class LawyerSpider(CommonSpider):
    """Spider configuration for the Robertsons (HK) people directory.

    The class only declares data (start URL, XPath expressions, constant
    field values); how these attributes are consumed is defined by
    ``CommonSpider``, which is not visible here.
    NOTE(review): presumably each ``fields`` entry's 'xpath' is evaluated
    against a profile page and the optional 'parser' post-processes the
    extracted values -- confirm against CommonSpider.
    """

    name = 'robertsonshk'
    start_urls = ('http://www.robertsonshk.com/en/people', )

    # Constant values shared by every scraped profile.
    common_fields = {
        'firm': 'Robertsons (HK)',
        'office': 'Hong Kong'
    }

    # Per-profile field definitions, keyed by output field name.
    fields = {
        'name': {
            'xpath': "string(.//h1)",
        },
        'title': {
            'xpath': "string(.//span[@class='people_title'])",
        },
        'picture_url': {
            'xpath': "string(.//main//img[@class='img-responsive']/@src)",
        },
        'phone': {
            # Text of the parent of the first span mentioning 'Tel'.
            'xpath': "string((.//span[contains(., 'Tel')]/..)[1]/text())",
        },
        'email': {
            # String value of the second <script> sibling that follows a
            # span mentioning 'Email'.
            # NOTE(review): presumably the address is emitted by an inline
            # script -- confirm against the live page.
            'xpath': "string((.//span[contains(., 'Email')]/following-sibling::script)[2])",
        },
        'vcard_url': {
            'xpath': "string(.//a[contains(., 'Download vCard')]/@href)",
        },
        'overview': {'xpath': ".//div[@class='wysiwyg_content']"},
        'selected_client_work': {
            # Built from two node-set expressions: siblings following the
            # 'section_title' div and siblings preceding the 'list-unstyled'
            # list. The combination itself is done by the project helper.
            'xpath': kayessian_intersection(
                "/*//div[@class='section_title']/following-sibling::*",
                "/*//ul[@class='list-unstyled']/preceding-sibling::*",
                first_if_empty=False
            )
        },
        'languages': {
            # Text nodes after the 'EXPERTISE' heading's parent; the parser
            # keeps only those starting with 'Speaks'.
            'xpath': ".//div/div/h2[.='EXPERTISE']/../following-sibling::*//text()",
            'parser': _parse_languages
        },
        'admissions': {
            'xpath': ".//div/div/h2[.='QUALIFICATIONS']/../following-sibling::*"
        }
    }

    # Links on the listing page that point at individual profiles, and the
    # expression that extracts the URL from each such link.
    pagination_items = ".//a[contains(@href, '/people/')]"
    profile_url = "string(./@href)"
# Logger named after this module. Its own threshold is set to DEBUG, so the
# logger itself does not filter out any DEBUG-or-higher records.
logger = logging.getLogger(name=__name__)
logger.setLevel(level='DEBUG')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment