Skip to content

Instantly share code, notes, and snippets.

@jsyeo
Created July 17, 2012 19:04
Show Gist options
  • Save jsyeo/3131310 to your computer and use it in GitHub Desktop.
Save jsyeo/3131310 to your computer and use it in GitHub Desktop.
Collecting metadata of ODJ articles
#!/usr/bin/python -tt
import urllib
from lxml import etree
import simplejson
def main():
    """Scrape the ODJ devotional listing and dump post metadata to feed.json.

    Fetches the category page, extracts each post's title, link and date
    with XPath, and writes one JSON object per line to ``feed.json`` in the
    current working directory.  If the three XPath queries return differing
    numbers of results, the page does not have the structure this script
    expects: nothing is written and a mismatch message is printed instead.
    """
    f = urllib.urlopen("http://ymiblogging.org/category/devotional/odj/")
    try:
        body = f.read()
    finally:
        # Release the connection even if the read fails part-way through.
        f.close()
    html = etree.HTML(body)

    # xpath selection
    titles = html.xpath("//h3/a/text()")
    urls = [elem.attrib["href"] for elem in html.xpath("//h3/a")]
    dates = html.xpath("//div[@class='post-meta']/strong/text()")

    # Compare counts by value: `is` tests object identity, which only
    # happens to work for small integers that CPython caches.
    if not (len(titles) == len(urls) == len(dates)):
        # Single-argument parenthesized print emits the same text whether
        # it is parsed as a Python 2 statement or a Python 3 call.
        print("HTML tag mis-match")
        return

    # The context manager guarantees the file is flushed and closed, even
    # if serialization raises mid-way.
    with open("feed.json", "w") as json_file:
        # One JSON document per line, one line per post.
        for title, url, date in zip(titles, urls, dates):
            odj_post = {"title": title, "url": url, "date": date}
            json_file.write(simplejson.dumps(odj_post))
            json_file.write("\n")
    print("JSON output written to feed.json")
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment