mihi-tr/Kenya-Gazette

## Kenya-Gazette
from pyquery import PyQuery
from AlchemyAPI import AlchemyAPI
from itertools import ifilter
import re
import lxml

# Default page to scrape; the __main__ block passes it to scrape().
url="http://www.kenyalaw.org/klr/index.php?id=441"

# Module-wide AlchemyAPI client used by ner().
# NOTE(review): loadAPIKey presumably reads the key from the file
# "api-key.txt" -- confirm against the AlchemyAPI SDK.
ao=AlchemyAPI()
ao.loadAPIKey("api-key.txt")

def properties(el):
  """Convert an XML element into plain Python data.

  An element with children becomes a dict mapping each child's tag to the
  converted child; a leaf element becomes its text (None when it has none).
  When several children share a tag the last one wins, because the result
  is a dict keyed by tag.

  Args:
    el: an ElementTree-style element (lxml or xml.etree).

  Returns:
    A nested dict of tag -> value, or the element's text for a leaf.
  """
  # len(el) and direct iteration replace the deprecated getchildren() and
  # the lxml-only iterchildren(); both forms visit the same direct children.
  if len(el):
    return dict((child.tag, properties(child)) for child in el)
  return el.text

def ner(txt):
  """Run AlchemyAPI named-entity extraction on a piece of text.

  Args:
    txt: the text to analyse. Em dashes (U+2014) are removed from the copy
      sent to the API; the returned "text" value is the unmodified input.

  Returns:
    A dict {"text": txt, "entities": [...]} in which each entity is the
    properties() conversion of an <entity> element found in the XML reply.
  """
  # `import lxml` alone does not load the lxml.etree submodule; import it
  # explicitly instead of relying on another module having done so.
  from lxml import etree
  results=ao.TextGetRankedNamedEntities(txt.replace(u"\u2014",""))
  reply=etree.fromstring(results)
  return {"text": txt,
          "entities": [properties(e) for e in reply.xpath("//entity")]}

def id(txt):
  """Return an identifier for a piece of notice text.

  If the text begins with one or more non-digit characters followed by
  digits, that first run of digits is returned. Otherwise the SHA-1 hex
  digest of the text is returned.

  Args:
    txt: the notice text.

  Returns:
    A string: either the digit run or a 40-character hex digest.
  """
  import hashlib
  m=re.match("^[^0-9]+([0-9]+).*?",txt)
  if m:
    return m.group(1)
  # hashlib hashes bytes. Encode text explicitly so non-ASCII characters
  # (e.g. U+2014) do not raise; byte strings are hashed unchanged.
  data=txt if isinstance(txt, bytes) else txt.encode("utf-8")
  return hashlib.sha1(data).hexdigest()

def scrape(url,el="#inside-1"):
  """Fetch a page and extract named entities from its appointment notices.

  The text of the element(s) selected by `el` is split on the marker
  "GAZETTE NOTICE"; only the pieces containing " appoint" are kept.

  Args:
    url: address handed to PyQuery.
    el: selector of the element whose text is examined.

  Returns:
    A dict mapping id(piece) to ner(piece) for every kept piece. A later
    piece replaces an earlier one that produced the same id.
  """
  page=PyQuery(url)
  pieces=page(el).text().split("GAZETTE NOTICE")
  found={}
  for piece in pieces:
    if not re.search(" appoint",piece):
      continue
    # Compute the key before calling ner(), as the original pair did.
    key=id(piece)
    found[key]=ner(piece)
  return found


if __name__=="__main__":
  # Script entry point: scrape the default page and print the result as JSON.
  import json
  # The call form prints the same output on Python 2 for a single argument
  # and is also valid syntax on Python 3.
  print(json.dumps(scrape(url)))
	from pyquery import PyQuery
	from AlchemyAPI import AlchemyAPI
	from itertools import ifilter
	import re
	import lxml

	# Default page to scrape; the __main__ block passes it to scrape().
	url="http://www.kenyalaw.org/klr/index.php?id=441"

	# Module-wide AlchemyAPI client used by ner().
	# NOTE(review): loadAPIKey presumably reads the key from the file
	# "api-key.txt" -- confirm against the AlchemyAPI SDK.
	ao=AlchemyAPI()
	ao.loadAPIKey("api-key.txt")

	def properties(el):
		"""Convert an XML element into plain Python data.

		An element with children becomes a dict mapping each child's tag to
		the converted child; a leaf element becomes its text (None when it
		has none). When several children share a tag the last one wins,
		because the result is a dict keyed by tag.

		Args:
			el: an ElementTree-style element (lxml or xml.etree).

		Returns:
			A nested dict of tag -> value, or the element's text for a leaf.
		"""
		# Body indentation restored. len(el) and direct iteration replace the
		# deprecated getchildren() and the lxml-only iterchildren().
		if len(el):
			return dict((child.tag, properties(child)) for child in el)
		return el.text

	def ner(txt):
		"""Run AlchemyAPI named-entity extraction on a piece of text.

		Args:
			txt: the text to analyse. Em dashes (U+2014) are removed from the
				copy sent to the API; the returned "text" value is the
				unmodified input.

		Returns:
			A dict {"text": txt, "entities": [...]} in which each entity is
			the properties() conversion of an <entity> element found in the
			XML reply.
		"""
		# Body indentation restored. `import lxml` alone does not load the
		# lxml.etree submodule, so import it explicitly.
		from lxml import etree
		results=ao.TextGetRankedNamedEntities(txt.replace(u"\u2014",""))
		reply=etree.fromstring(results)
		return {"text": txt,
			"entities": [properties(e) for e in reply.xpath("//entity")]}

	def id(txt):
		"""Return an identifier for a piece of notice text.

		If the text begins with one or more non-digit characters followed by
		digits, that first run of digits is returned. Otherwise the SHA-1 hex
		digest of the text is returned.

		Args:
			txt: the notice text.

		Returns:
			A string: either the digit run or a 40-character hex digest.
		"""
		import hashlib
		m=re.match("^[^0-9]+([0-9]+).*?",txt)
		if m:
			return m.group(1)
		# hashlib hashes bytes. Encode text explicitly so non-ASCII characters
		# (e.g. U+2014) do not raise; byte strings are hashed unchanged.
		data=txt if isinstance(txt, bytes) else txt.encode("utf-8")
		return hashlib.sha1(data).hexdigest()

	def scrape(url,el="#inside-1"):
		"""Fetch a page and extract named entities from its appointment notices.

		The text of the element(s) selected by `el` is split on the marker
		"GAZETTE NOTICE"; only the pieces containing " appoint" are kept.

		Args:
			url: address handed to PyQuery.
			el: selector of the element whose text is examined.

		Returns:
			A dict mapping id(piece) to ner(piece) for every kept piece. A
			later piece replaces an earlier one that produced the same id.
		"""
		# Body indentation restored; the filter is written inline so the
		# function no longer needs itertools.ifilter.
		page=PyQuery(url)
		pieces=page(el).text().split("GAZETTE NOTICE")
		found={}
		for piece in pieces:
			if not re.search(" appoint",piece):
				continue
			# Compute the key before calling ner(), as the original pair did.
			key=id(piece)
			found[key]=ner(piece)
		return found


	if __name__=="__main__":
		# Script entry point: scrape the default page and print the result as
		# JSON. Body indentation restored; the call form of print gives the
		# same output on Python 2 and is valid syntax on Python 3.
		import json
		print(json.dumps(scrape(url)))