Skip to content

Instantly share code, notes, and snippets.

@risatrix
Last active March 30, 2016 21:28
Show Gist options
  • Save risatrix/e2adf8ce1efbc9fb14e0 to your computer and use it in GitHub Desktop.
Save risatrix/e2adf8ce1efbc9fb14e0 to your computer and use it in GitHub Desktop.
Ugly Soup for You!
class Command(NoArgsCommand):
    """Audit flatpages and write the selected ones to a CSV file.

    Each flatpage's HTML content is parsed for a byline block
    (``<ul class="meta">``). Pages that have no such block and whose URL
    does not contain ``test`` are written to ``actual_page_audit.csv`` in
    the current working directory, one row per page.
    """

    help = "Output flatpage info as a csv"
    # Executed once, when the class body runs (i.e. when the module loads).
    print('hello, flatpage')

    def handle(self, **options):
        """Write the audit CSV with columns Name, Authors, Date, URL.

        The date comes from the byline when one is present; otherwise it
        falls back to a year (2010-2016) found in the page URL, or stays
        empty. Edit the final ``if`` to audit a different subset of pages.
        """
        # Reference material for building other page filters:
        # http://stackoverflow.com/questions/8949145/filter-django-database-for-field-containing-any-value-in-an-array/8949234
        # http://stackoverflow.com/questions/2171159/queryset-for-non-empty-textfield
        # Good example of flatpage that looks like an article:
        # https://www.texastribune.org/admin/flatpages/flatpage/598/

        # Years looked for in the URL when the byline gives no date.
        url_years = ('2010', '2011', '2012', '2013', '2014', '2015', '2016')

        # 'wb' is the mode the Python 2 csv module expects. The context
        # manager guarantees the file is flushed and closed, even on error.
        with open('actual_page_audit.csv', 'wb') as csv_file:
            out = csv.writer(csv_file)
            out.writerow(("Name", "Authors", "Date", "URL"))

            for page in FlatPage.objects.all():
                date = ''
                author_string = ''

                # Look for a byline.
                soup = BeautifulSoup(page.content)
                meta = soup.find("ul", {"class": "meta"})
                if meta is not None:
                    # The date is taken from the first <li> without a class;
                    # a byline may lack one, so do not assume it exists.
                    date_item = meta.find("li", {"class": None})
                    if date_item is not None:
                        date = date_item.text
                    author_string = ', '.join(
                        link.text for link in meta.find_all("a")
                    )

                abs_url = page.get_absolute_url()

                # No date in the byline: fall back to a year in the URL.
                # There is deliberately no break, so the last matching year
                # in url_years wins.
                if date == '':
                    for year in url_years:
                        if year in abs_url:
                            date = year

                is_test = "test" in abs_url

                # Use this condition to filter whatever type of pages.
                # NOTE: as written only byline-less pages are kept, so the
                # Authors column is always empty in the output.
                if meta is None and not is_test:
                    row = [
                        page.title,
                        author_string,
                        date,
                        'http://www.texastribune.org' + abs_url,
                    ]
                    out.writerow(
                        [unicode(item).encode('utf8') for item in row]
                    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment