Skip to content

Instantly share code, notes, and snippets.

@b5
Created June 17, 2019 18:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save b5/67e4ade3b2325df9478d1dacd09fad97 to your computer and use it in GitHub Desktop.
load("http.star", "http")
load("bsoup.star", "bsoup")
baseUrl = "http://www.lawhelp.org"
def download(ctx):
    """Scrape the full legal-help directory, following pagination.

    Fetches the first directory page, collects the remaining page links
    from its pagination block, and concatenates the extracted rows from
    every page.

    Args:
        ctx: Qri transform context (unused here, required by the hook).

    Returns:
        A list of 9-element row lists (see the schema comment above
        extract_resource_index).
    """
    # First page also supplies the pagination links.
    soup, rows = fetch_page(baseUrl + '/dc/find-legal-help/directory/')
    pages = page_links(soup)
    for page in pages:
        _, new_rows = fetch_page(page)
        rows = rows + new_rows
    return rows
def transform(ds, ctx):
    """Qri transform entry point: attach the schema and scraped body.

    Args:
        ds: the dataset being transformed; receives structure and body.
        ctx: transform context; ctx.download holds the result of download().
    """
    ds.set_structure(structure)
    ds.set_body(ctx.download)
def fetch_page(url):
    """Fetch one directory page and extract its resource rows.

    Args:
        url: absolute URL of a directory listing page.

    Returns:
        (soup, rows) — the parsed page (so callers can read pagination)
        and the list of extracted 9-element rows.
    """
    soup = get_soup(url)
    # Each <li> inside <ul class="listing"> is one resource entry.
    rows = soup.find('ul', {'class':'listing'}).find_all('li')
    rows = [extract_resource_index(row) for row in rows]
    return (soup, rows)
def page_links(soup):
    """Collect absolute URLs for the remaining directory pages.

    Reads the children of the pagination <div>, keeping elements that have
    a non-empty href and are not the "next" control (which would duplicate
    a numbered page link).

    Args:
        soup: parsed bsoup document for a directory page.

    Returns:
        List of absolute page URLs.
    """
    els = soup.find('div', { 'class' : 'pagination' }).contents()
    return [baseUrl + el.attrs()['href'] for el in els if
        el.attrs().get('href', '') != '' and el.attrs().get('class', '') != 'next']
# schema:
# 0 name
# 1 description
# 2 locality
# 3 street_address
# 4 postal_code
# 5 telephone
# 6 website
# 7 resource_url
# 8 updated
def extract_resource_index(soup):
    """Extract one resource row from a directory <li> element.

    Produces a fixed 9-slot list matching the schema comment above:
    name, description, locality, street_address, postal_code, telephone,
    website, resource_url, updated. Missing fields stay ''.

    Args:
        soup: bsoup element for one <li> in the directory listing.

    Returns:
        A 9-element list of strings.
    """
    data = [''] * 9
    h3 = soup.find('h3')
    if h3:
        link = h3.find('a')
        # resource_url:
        data[7] = baseUrl + link.attrs()['href'].strip()
        # name:
        data[0] = link.get_text().strip()
        # fetch details from sub page
        details = get_resource_details(data[7])
        data[8] = details['updated'].strip()
        data[1] = details["description"].strip()
    sa = soup.find('span', { 'class': 'street-address' })
    if sa:
        # street_address:
        data[3] = sa.get_text().strip()
    loc = soup.find('span', { 'class': 'locality'})
    if loc:
        # locality
        data[2] = loc.get_text().strip()
    pc = soup.find('span', {'class': 'postal-code'})
    if pc:
        # postal_code
        data[4] = pc.get_text().strip()
    tel = soup.find('div', {'class': 'tel'})
    if tel:
        # telephone
        data[5] = tel.get_text().strip()
    site = soup.find('div', { 'class': 'wrap-all' })
    if site:
        # website
        data[6] = site.find('a').attrs()['href']
    return data
def get_resource_details(rel):
    """Fetch a resource's detail page and pull description + update date.

    Args:
        rel: absolute URL of the resource's detail page.

    Returns:
        Dict with 'updated' and 'description' keys; values are '' when the
        corresponding element is absent.
    """
    soup = get_soup(rel)
    details = {
        'updated': '',
        'description': '',
    }
    profile = soup.find('div', { 'id': 'profile-tab' })
    if profile:
        sections = profile.find_all('div', { 'class': 'section'})
        # The first section holds the prose description paragraphs.
        if len(sections) > 0:
            details['description'] = "\n".join([p.get_text() for p in sections[0].find_all('p')])
    card = soup.find('div', { 'class': 'vcard' })
    if card:
        # The fourth child of the vcard carries the "Last Review and Update:" label.
        if len(card.contents()) >= 4:
            details['updated'] = card.contents()[3].get_text().replace('Last Review and Update:', '').strip()
    return details
def get_soup(url):
    """GET a URL and return its body parsed by bsoup.

    Sends a browser-like User-Agent because some sites block default
    scraper agents. Logs each URL fetched for progress visibility.

    Args:
        url: absolute URL to fetch.

    Returns:
        A bsoup document for the response body.
    """
    print(url, "\n")
    res = http.get(url, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0",
    })
    return bsoup.parseHtml(res.body())
# Dataset structure: each body row is a fixed-order array of 9 strings.
# Fixed: 'resource_url ' had a stray trailing space in the title, which
# would not match the schema comment or downstream column lookups.
structure = {
    'format': 'json',
    'strict': True,
    'schema': {
        'type': 'array',
        'items': {
            'type': 'array',
            'items': [
                { 'title': 'name', 'type': 'string' },
                { 'title': 'description', 'type': 'string' },
                { 'title': 'locality', 'type': 'string' },
                { 'title': 'street_address', 'type': 'string' },
                { 'title': 'postal_code', 'type': 'string' },
                { 'title': 'telephone', 'type': 'string' },
                { 'title': 'website', 'type': 'string' },
                { 'title': 'resource_url', 'type': 'string' },
                { 'title': 'updated', 'type': 'string' },
            ]
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment