Skip to content

Instantly share code, notes, and snippets.

@b5
Created June 17, 2019 18:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save b5/67e4ade3b2325df9478d1dacd09fad97 to your computer and use it in GitHub Desktop.
load("http.star", "http")
load("bsoup.star", "bsoup")
baseUrl = "http://www.lawhelp.org"
def download(ctx):
    """Scrape the full legal-help directory, following pagination.

    Fetches the first directory page, collects the remaining page links
    from its pagination block, and concatenates the extracted rows from
    every page.

    Args:
        ctx: Qri transform context (unused here, required by the hook).

    Returns:
        A list of 9-element row lists (see the schema comment above
        extract_resource_index).
    """
    # First page also supplies the pagination links.
    soup, rows = fetch_page(baseUrl + '/dc/find-legal-help/directory/')
    pages = page_links(soup)
    for page in pages:
        _, new_rows = fetch_page(page)
        rows = rows + new_rows
    return rows
def transform(ds, ctx):
    """Qri transform entry point: attach the schema and scraped body.

    Args:
        ds: the dataset being transformed; receives structure and body.
        ctx: transform context; ctx.download holds the result of download().
    """
    ds.set_structure(structure)
    ds.set_body(ctx.download)
def fetch_page(url):
    """Fetch one directory page and extract its resource rows.

    Args:
        url: absolute URL of a directory listing page.

    Returns:
        (soup, rows) — the parsed page (so callers can read pagination)
        and the list of extracted 9-element rows.
    """
    soup = get_soup(url)
    # Each <li> inside <ul class="listing"> is one resource entry.
    rows = soup.find('ul', {'class':'listing'}).find_all('li')
    rows = [extract_resource_index(row) for row in rows]
    return (soup, rows)
def page_links(soup):
    """Collect absolute URLs for the remaining directory pages.

    Reads the children of the pagination <div>, keeping elements that have
    a non-empty href and are not the "next" control (which would duplicate
    a numbered page link).

    Args:
        soup: parsed bsoup document for a directory page.

    Returns:
        List of absolute page URLs.
    """
    els = soup.find('div', { 'class' : 'pagination' }).contents()
    return [baseUrl + el.attrs()['href'] for el in els if
        el.attrs().get('href', '') != '' and el.attrs().get('class', '') != 'next']
# schema:
# 0 name
# 1 description
# 2 locality
# 3 street_address
# 4 postal_code
# 5 telephone
# 6 website
# 7 resource_url
# 8 updated
def extract_resource_index(soup):
    """Extract one resource row from a directory <li> element.

    Produces a fixed 9-slot list matching the schema comment above:
    name, description, locality, street_address, postal_code, telephone,
    website, resource_url, updated. Missing fields stay ''.

    Args:
        soup: bsoup element for one <li> in the directory listing.

    Returns:
        A 9-element list of strings.
    """
    data = [''] * 9
    h3 = soup.find('h3')
    if h3:
        link = h3.find('a')
        # resource_url:
        data[7] = baseUrl + link.attrs()['href'].strip()
        # name:
        data[0] = link.get_text().strip()
        # fetch details from sub page
        details = get_resource_details(data[7])
        data[8] = details['updated'].strip()
        data[1] = details["description"].strip()
    sa = soup.find('span', { 'class': 'street-address' })
    if sa:
        # street_address:
        data[3] = sa.get_text().strip()
    loc = soup.find('span', { 'class': 'locality'})
    if loc:
        # locality
        data[2] = loc.get_text().strip()
    pc = soup.find('span', {'class': 'postal-code'})
    if pc:
        # postal_code
        data[4] = pc.get_text().strip()
    tel = soup.find('div', {'class': 'tel'})
    if tel:
        # telephone
        data[5] = tel.get_text().strip()
    site = soup.find('div', { 'class': 'wrap-all' })
    if site:
        # website
        data[6] = site.find('a').attrs()['href']
    return data
def get_resource_details(rel):
    """Fetch a resource's detail page and pull description + update date.

    Args:
        rel: absolute URL of the resource's detail page.

    Returns:
        Dict with 'updated' and 'description' keys; values are '' when the
        corresponding element is absent.
    """
    soup = get_soup(rel)
    details = {
        'updated': '',
        'description': '',
    }
    profile = soup.find('div', { 'id': 'profile-tab' })
    if profile:
        sections = profile.find_all('div', { 'class': 'section'})
        # The first section holds the prose description paragraphs.
        if len(sections) > 0:
            details['description'] = "\n".join([p.get_text() for p in sections[0].find_all('p')])
    card = soup.find('div', { 'class': 'vcard' })
    if card:
        # The fourth child of the vcard carries the "Last Review and Update:" label.
        if len(card.contents()) >= 4:
            details['updated'] = card.contents()[3].get_text().replace('Last Review and Update:', '').strip()
    return details
def get_soup(url):
    """GET a URL and return its body parsed by bsoup.

    Sends a browser-like User-Agent because some sites block default
    scraper agents. Logs each URL fetched for progress visibility.

    Args:
        url: absolute URL to fetch.

    Returns:
        A bsoup document for the response body.
    """
    print(url, "\n")
    res = http.get(url, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0",
    })
    return bsoup.parseHtml(res.body())
# Dataset structure: each body row is a fixed-order array of 9 strings.
# Fixed: 'resource_url ' had a stray trailing space in the title, which
# would not match the schema comment or downstream column lookups.
structure = {
    'format': 'json',
    'strict': True,
    'schema': {
        'type': 'array',
        'items': {
            'type': 'array',
            'items': [
                { 'title': 'name', 'type': 'string' },
                { 'title': 'description', 'type': 'string' },
                { 'title': 'locality', 'type': 'string' },
                { 'title': 'street_address', 'type': 'string' },
                { 'title': 'postal_code', 'type': 'string' },
                { 'title': 'telephone', 'type': 'string' },
                { 'title': 'website', 'type': 'string' },
                { 'title': 'resource_url', 'type': 'string' },
                { 'title': 'updated', 'type': 'string' },
            ]
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment