public
Created

Python script to parse The Daily iPad app's JSON and spit out an HTML index

  • Download Gist
parse_daily.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
#!/usr/bin/python
 
import json, sys, re
# Component titles are searched case-insensitively with these patterns to
# decide which per-page field a referenced text belongs to. Headlines are
# recognised by either "Head" or "Hed" appearing in the title.
_FIELD_PATTERNS = (
    ('headline', re.compile(r'Head|Hed', re.IGNORECASE)),
    ('deck', re.compile(r'Deck', re.IGNORECASE)),
    ('byline', re.compile(r'Byline', re.IGNORECASE)),
)


def _index_text_content(contents):
    """Return a dict mapping each 'text' content item's identifier to its text.

    Items in ``contents['content']`` whose 'type' is not 'text' are ignored.
    """
    index = {}
    for item in contents['content']:
        if item['type'] == 'text':
            index[item['identifier']] = item['text']
    return index


def _collect_page_fields(page, content_index, fields_by_url):
    """Record headline/deck/byline text referenced by the page's components.

    For every component in every layout of ``page``, the first entry of its
    'contentRefs' list is looked up in ``content_index``. If found, newlines
    in the text are replaced by spaces and the text is stored in
    ``fields_by_url[field][page['webURL']]`` for each field whose pattern
    matches the component's title. A later match overwrites an earlier one
    for the same URL.
    """
    web_url = page['webURL']
    for layout in page['layouts']:
        for component in layout['components']:
            refs = component.get('contentRefs')
            if not refs:
                # No 'contentRefs' key, or an empty list: nothing to look up.
                continue
            content_id = refs[0]['contentID']
            if content_id not in content_index:
                continue
            title = component.get('title') or ''
            text = content_index[content_id].replace('\n', ' ')
            # No break: a title matching several patterns fills several fields.
            for field, pattern in _FIELD_PATTERNS:
                if pattern.search(title):
                    fields_by_url[field][web_url] = text


def render_index(contents):
    """Yield, line by line, an HTML index of the parsed JSON document.

    Each section in ``contents['sections']`` produces an ``<h3>`` heading
    followed by an ``<ol>`` with one ``<li>`` per listed page. A page is
    listed when its 'displayOnCarousel' value is true and it has a 'webURL'.
    The link text is the page's own 'headline' if present, else a headline
    found in its components, else the page's 'title'. A byline and a deck
    (or, failing that, the page's 'openingLine') follow when available.

    NOTE(review): the text this script was recovered from had lost all
    indentation, so the nesting was reconstructed; in particular, pages
    without a 'webURL' are assumed to be skipped entirely -- confirm.
    NOTE(review): values are inserted into the HTML without escaping.
    """
    content_index = _index_text_content(contents)
    # Kept across pages and sections, keyed by URL, so pages sharing a URL
    # also share whatever fields were found for it.
    fields_by_url = dict((field, {}) for field, _ in _FIELD_PATTERNS)
    headlines = fields_by_url['headline']
    decks = fields_by_url['deck']
    bylines = fields_by_url['byline']

    for section in contents['sections']:
        yield '<h3>' + section['name'] + '</h3>'
        yield '<ol>'
        for page in section['pages']:
            # JSON ``true`` decodes to Python ``True``.
            if page.get('displayOnCarousel') is not True:
                continue
            if 'webURL' not in page:
                # Without a URL there is nothing to link to; skipping also
                # avoids reusing the previous page's URL by accident.
                continue
            web_url = page['webURL']
            _collect_page_fields(page, content_index, fields_by_url)

            yield '<li>'
            if 'headline' in page:
                link_text = page['headline']
            elif web_url in headlines:
                link_text = headlines[web_url]
            else:
                link_text = page['title']
            yield '<a href="' + web_url + '">' + link_text + '</a>'

            if web_url in bylines:
                yield bylines[web_url]
            if web_url in decks:
                yield '&mdash; %s' % (decks[web_url],)
            elif 'openingLine' in page:
                yield '&mdash; %s' % (page['openingLine'],)
            yield '</li>'
        yield '</ol>'


def main(argv=None):
    """Read the JSON file named by ``argv[1]`` and print its HTML index.

    ``argv`` defaults to ``sys.argv``. Returns 0 on success, or 2 after
    writing a usage message to stderr when no file name was given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write('usage: parse_daily.py FILE.json\n')
        return 2
    # The context manager closes the file even if JSON parsing fails.
    with open(argv[1], 'r') as f:
        contents = json.load(f)
    for line in render_index(contents):
        # Single-argument form behaves the same as a Python 2 print
        # statement and as the Python 3 print function.
        print(line)
    return 0


if __name__ == '__main__':
    sys.exit(main())

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.