Skip to content

Instantly share code, notes, and snippets.

@mihi-tr
Last active December 20, 2015 15:59
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mihi-tr/6158738 to your computer and use it in GitHub Desktop.
Save mihi-tr/6158738 to your computer and use it in GitHub Desktop.
Scraping the Kenya-gazette + AlchemyAPI entity recognition.
from pyquery import PyQuery
from AlchemyAPI import AlchemyAPI
from itertools import ifilter
import re
import lxml
# Default page to scrape; passed to scrape() by the __main__ block.
url="http://www.kenyalaw.org/klr/index.php?id=441"
# Module-wide AlchemyAPI client used by ner().
ao=AlchemyAPI()
# NOTE(review): presumably reads the API key from this file, resolved
# relative to the working directory -- confirm against the AlchemyAPI SDK.
ao.loadAPIKey("api-key.txt")
def properties(el):
    """Recursively convert an XML element into plain Python data.

    A leaf element (one without children) is converted to its ``text``
    (``None`` when the element is empty). An element with children is
    converted to a dict mapping each child's tag to the converted child.

    Because the result is keyed by tag, siblings that share a tag collapse
    into a single entry and the last sibling wins.

    Args:
        el: An lxml / ElementTree-style element.

    Returns:
        The element's text for a leaf, otherwise a dict of
        ``tag -> converted child``.
    """
    # len(el) is the number of direct children and iterating an element
    # yields those children. Both belong to the common ElementTree API,
    # unlike the deprecated getchildren() and the lxml-only iterchildren().
    if len(el):
        return dict((child.tag, properties(child)) for child in el)
    return el.text
def ner(txt):
    """Run AlchemyAPI named-entity recognition over a piece of text.

    Args:
        txt: The text to analyse.

    Returns:
        A dict with the unmodified input under ``"text"`` and, under
        ``"entities"``, a list with one converted ``<entity>`` element
        (see ``properties``) per entity found in the API response.
    """
    # Import the submodule explicitly: a bare ``import lxml`` does not load
    # ``lxml.etree``, so relying on ``lxml.etree`` only works when some other
    # import happens to have loaded it first.
    from lxml import etree

    # Em dashes (U+2014) are removed from the text sent to the API; the
    # "text" value returned to the caller still contains them.
    response = ao.TextGetRankedNamedEntities(txt.replace(u"\u2014", ""))
    tree = etree.fromstring(response)
    return {
        "text": txt,
        "entities": [properties(entity) for entity in tree.xpath("//entity")],
    }
def id(txt):
    """Derive an identifier for a chunk of notice text.

    The identifier is the first run of digits in ``txt``, provided at least
    one non-digit character precedes it. When there is no such run, the
    SHA-1 hex digest of the text is returned instead.

    The name shadows the ``id`` builtin; it is kept because callers in this
    file use it.

    Args:
        txt: The notice text.

    Returns:
        The digit string, or a 40-character SHA-1 hex digest as a fallback.
    """
    m = re.match(r"^[^0-9]+([0-9]+).*?", txt)
    if m:
        return m.group(1)

    import hashlib

    # hashlib only accepts bytes. Encode text explicitly so that non-ASCII
    # characters (e.g. the em dash) do not make the fallback raise.
    data = txt if isinstance(txt, bytes) else txt.encode("utf-8")
    return hashlib.sha1(data).hexdigest()
def scrape(url, el="#inside-1"):
    """Fetch a page and run entity recognition on its appointment notices.

    The text of the elements matching ``el`` is split on the literal
    "GAZETTE NOTICE"; every chunk containing " appoint" is kept.

    Args:
        url: Address of the page, handed to ``PyQuery``.
        el: Selector for the element(s) holding the notices.

    Returns:
        A dict mapping ``id(chunk)`` to ``ner(chunk)`` for each kept chunk.
        Chunks that produce the same id overwrite one another (last wins).
    """
    page = PyQuery(url)
    chunks = page(el).text().split("GAZETTE NOTICE")

    notices = {}
    for chunk in chunks:
        # Only chunks that mention an appointment are of interest.
        if not re.search(" appoint", chunk):
            continue
        key = id(chunk)
        notices[key] = ner(chunk)
    return notices
if __name__ == "__main__":
    # Script entry point: scrape the default page and print the result as JSON.
    import json

    scraped = scrape(url)
    print(json.dumps(scraped))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment