# risatrix/soup.py

## soup.py
class Command(NoArgsCommand):
    """Management command that audits flatpages into a CSV file.

    ``handle`` walks every ``FlatPage``, looks for a ``ul`` element with
    class "meta" (the byline) in the page content, and writes one row
    (title, authors, date, full URL) to ``actual_page_audit.csv`` for each
    page that has no byline and whose URL does not contain "test".

    NOTE(review): ``csv``, ``BeautifulSoup``, ``FlatPage`` and
    ``NoArgsCommand`` must be imported by the module; those imports are not
    visible in this part of the file.
    """

    help = "Output flatpage info as a csv"

    # Path of the CSV report, relative to the current working directory.
    output_path = 'actual_page_audit.csv'
    # Prefix put in front of each page's absolute URL in the report.
    site_root = 'http://www.texastribune.org'
    # Years searched for in a page URL when the byline yields no date.
    url_years = ('2010', '2011', '2012', '2013', '2014', '2015', '2016')

    # Runs once, when the class body is executed (i.e. when the module is
    # imported), not on every call to handle().
    print('hello, flatpage')

    def handle(self, **options):
        """Write the flatpage audit CSV.

        Args:
            **options: keyword options passed in by the caller; unused.
        """
        # 'wb' is the mode the Python 2 csv module expects. The with-block
        # closes the file even if a page raises part-way through the loop.
        with open(self.output_path, 'wb') as csv_file:
            out = csv.writer(csv_file)
            out.writerow(("Name", "Authors", "Date", "URL"))

            for p in FlatPage.objects.all():
                has_byline, author_string, date = self._read_byline(p.content)
                abs_url = p.get_absolute_url()

                # No date in the byline: fall back to a year in the URL.
                if date == '':
                    date = self._year_from_url(abs_url)

                # Filter for the kind of page being audited. Right now only
                # pages without a byline are kept, so "Authors" is always
                # empty and "Date" can only come from the URL. To audit other
                # page types, test e.g. ``'interactive' in abs_url`` or
                # ``p.extra_js != u''`` here instead.
                if has_byline or 'test' in abs_url:
                    continue

                row = [
                    p.title,
                    author_string,
                    date,
                    self.site_root + abs_url,
                ]
                # Encode explicitly: the Python 2 csv writer works on bytes.
                out.writerow([unicode(item).encode('utf8') for item in row])

    @staticmethod
    def _read_byline(content):
        """Return ``(found, authors, date)`` for the byline in *content*.

        ``found`` is True when a ``ul`` with class "meta" exists. ``authors``
        is the comma-joined text of every link inside it, and ``date`` is the
        text of its first class-less ``li``. Missing pieces come back as ''.
        """
        meta = BeautifulSoup(content).find("ul", {"class": "meta"})
        if meta is None:
            return False, '', ''

        # A byline without a class-less <li> used to crash the whole run with
        # AttributeError; treat it as "no date" instead.
        date_item = meta.find("li", {"class": None})
        date = date_item.text if date_item is not None else ''
        authors = ', '.join(link.text for link in meta.find_all("a"))
        return True, authors, date

    def _year_from_url(self, abs_url):
        """Return the last entry of ``url_years`` found in *abs_url*, or ''."""
        found = ''
        for year in self.url_years:
            # No break: when several years match, the latest one listed wins.
            if year in abs_url:
                found = year
        return found
	class Command(NoArgsCommand):
		"""Management command that audits flatpages into a CSV file.

		``handle`` walks every ``FlatPage``, looks for a ``ul`` element with
		class "meta" (the byline) in the page content, and writes one row
		(title, authors, date, full URL) to ``actual_page_audit.csv`` for each
		page that has no byline and whose URL does not contain "test".

		NOTE(review): ``csv``, ``BeautifulSoup``, ``FlatPage`` and
		``NoArgsCommand`` must be imported by the module; those imports are
		not visible in this part of the file.
		"""

		help = "Output flatpage info as a csv"

		# Path of the CSV report, relative to the current working directory.
		output_path = 'actual_page_audit.csv'
		# Prefix put in front of each page's absolute URL in the report.
		site_root = 'http://www.texastribune.org'
		# Years searched for in a page URL when the byline yields no date.
		url_years = ('2010', '2011', '2012', '2013', '2014', '2015', '2016')

		# Runs once, each time this class body is executed.
		print('hello, flatpage')

		def handle(self, **options):
			"""Write the flatpage audit CSV.

			Args:
				**options: keyword options passed in by the caller; unused.
			"""
			# 'wb' is the mode the Python 2 csv module expects. The with-block
			# closes the file even if a page raises part-way through the loop.
			with open(self.output_path, 'wb') as csv_file:
				out = csv.writer(csv_file)
				out.writerow(("Name", "Authors", "Date", "URL"))

				for p in FlatPage.objects.all():
					has_byline, author_string, date = self._read_byline(p.content)
					abs_url = p.get_absolute_url()

					# No date in the byline: fall back to a year in the URL.
					if date == '':
						date = self._year_from_url(abs_url)

					# Filter for the kind of page being audited. Right now
					# only pages without a byline are kept, so "Authors" is
					# always empty and "Date" can only come from the URL. To
					# audit other page types, test e.g.
					# ``'interactive' in abs_url`` or ``p.extra_js != u''``
					# here instead.
					if has_byline or 'test' in abs_url:
						continue

					row = [
						p.title,
						author_string,
						date,
						self.site_root + abs_url,
					]
					# Encode explicitly: the Python 2 csv writer works on bytes.
					out.writerow([unicode(item).encode('utf8') for item in row])

		@staticmethod
		def _read_byline(content):
			"""Return ``(found, authors, date)`` for the byline in *content*.

			``found`` is True when a ``ul`` with class "meta" exists.
			``authors`` is the comma-joined text of every link inside it, and
			``date`` is the text of its first class-less ``li``. Missing
			pieces come back as ''.
			"""
			meta = BeautifulSoup(content).find("ul", {"class": "meta"})
			if meta is None:
				return False, '', ''

			# A byline without a class-less <li> used to crash the whole run
			# with AttributeError; treat it as "no date" instead.
			date_item = meta.find("li", {"class": None})
			date = date_item.text if date_item is not None else ''
			authors = ', '.join(link.text for link in meta.find_all("a"))
			return True, authors, date

		def _year_from_url(self, abs_url):
			"""Return the last entry of ``url_years`` found in *abs_url*, or ''."""
			found = ''
			for year in self.url_years:
				# No break: when several years match, the latest one listed wins.
				if year in abs_url:
					found = year
			return found