Skip to content

Instantly share code, notes, and snippets.

@jheasly
Created March 1, 2012 21:55
Show Gist options
  • Save jheasly/1953501 to your computer and use it in GitHub Desktop.
Save jheasly/1953501 to your computer and use it in GitHub Desktop.
Unpickle a day's story records and insert them into a Django PostgreSQL database.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import tagfile, datetime, sys, re, pickle, psycopg2
from os import environ
sys.path.append('/directory/nuther_directory')
sys.path.append('/directory/nuther_directory/project_directory')
environ['DJANGO_SETTINGS_MODULE'] = 'project_directory.settings'
from turin.models import Story, Image, Mugline
from django.contrib.sites.models import Site
from django.conf import settings
from django.core.mail import send_mail
from django.template.defaultfilters import slugify, striptags
from django.db import connection
from django.db.utils import DatabaseError
from django.db import utils
# channel_text keys that may supply a story's slug, in priority order:
# main() builds the slug from the first key in this list that has a
# non-empty value.
# Note: main() only imports stories that carry 'Text' plus at least one of
# these keys, so when you add an element here make sure the story filter
# in main() accepts the same key.
slug_list = [
'Headline',
'Deck',
'Header',
'StorySig',
'Mugline',
'FurnitureText',
'QuickRead',
]
# Matches an HTML entity such as "&AMP;" so that it can be lower-cased again
# after _clean_text() has upper-cased the whole string.
_ENTITY_RE = re.compile('&([^;]+);')

# (channel_text key, Story attribute) pairs that are copied onto the story
# unchanged when the key is present.
_STORY_TEXT_FIELDS = (
    ('Header', 'header'),
    ('Deck', 'deck'),
    ('FurnitureText', 'furniture_text'),
    ('Text', 'text'),
    ('Byline', 'byline'),
    ('QuickRead', 'quickread'),
    ('Pullquote', 'pullquote'),
    ('StorySig', 'story_sig'),
    ('Tagline', 'tagline'),
)


def _clean_text(raw):
    """Normalise slug source text before it is handed to slugify().

    Strips markup, removes runs of two or more capital 'H' characters,
    turns carriage returns into spaces, upper-cases the result and finally
    lower-cases any HTML entities again (e.g. '&AMP;' -> '&amp;').
    """
    text = striptags(raw)
    text = re.sub('H[H]+', '', text)
    text = text.replace('\r', ' ')
    text = text.strip().upper()
    return _ENTITY_RE.sub(lambda match: match.group(0).lower(), text)


def _run_date_from_argv(argv):
    """Return (YYYYMMDD string, datetime.date) for the paper to import.

    Uses argv[1] when given (expected as eight digits, YYYYMMDD; a
    malformed value raises ValueError), otherwise today's date.
    """
    if len(argv) > 1:
        stamp = argv[1]
        run_date = datetime.date(int(stamp[:4]), int(stamp[4:6]), int(stamp[6:]))
        return stamp, run_date
    today = datetime.date.today()
    return today.strftime("%Y%m%d"), today


def _first_paragraph_child(html, css_class):
    """Return the first child of the first <p class=css_class>, or None.

    None is returned both when no such paragraph exists and when the
    paragraph is empty, so callers never index into an empty list.
    """
    paragraphs = BeautifulSoup(html).findAll('p', {'class': css_class})
    if paragraphs and paragraphs[0].contents:
        return paragraphs[0].contents[0]
    return None


def _slug_disambiguators(story_name, text):
    """Return the headings to append to a slug to make it unique.

    Stories whose slug comes from a story sig (digests, letters,
    scoreboards, calendars) would otherwise share a slug, so the first
    heading found inside the story text is appended.  The result is in the
    order the suffixes must be applied.
    """
    if not text:
        return []

    headings = []

    # Digests, except obituaries.
    if '<p class="Headlines-DigestHead">' in text and 'cr.obit' not in story_name:
        headings.append(_first_paragraph_child(text, 'Headlines-DigestHead'))

    # Append the headline of the first letter to the
    # 'letters-in-the-editors-mailbag' slug; on Saturdays there would
    # otherwise be two files with the same slug.
    if '<p class="Editorial-Edit_LetterHead">' in text:
        headings.append(_first_paragraph_child(text, 'Editorial-Edit_LetterHead'))

    # Scoreboards: prefer the reverse-bar standing head, fall back to the
    # first agate sports head.
    is_scoreboard = 'board' in story_name and (
        '<p class="StandingHeads-Style_8_ReverseBar_Inline">' in text
        or '<p class="AgateSports-Head_1">Standings</p>' in text
    )
    if is_scoreboard:
        heading = _first_paragraph_child(text, 'StandingHeads-Style_8_ReverseBar_Inline')
        if heading is None and '<p class="AgateSports-Head_1">' in text:
            heading = _first_paragraph_child(text, 'AgateSports-Head_1')
        headings.append(heading)

    # Calendars.
    if '<p class="AgateNews-Agate_News_Head10">' in text and 'calendar' in story_name:
        headings.append(_first_paragraph_child(text, 'AgateNews-Agate_News_Head10'))

    return [heading for heading in headings if heading is not None]


def _divs_of_class(html, css_class):
    """Parse html and return every <div> whose class is css_class."""
    return BeautifulSoup(html).findAll('div', css_class)


def _attach_channel_items(story, text_dict):
    """Create the related rows of `story` described by one channel_text dict.

    These are the text areas that can occur more than once per story, which
    is why they are read from the individual dicts rather than from the
    consolidated one.
    """
    # NOTE(review): channel '5' feeds image_set and channel '15' feeds
    # mug_set; the file name is read from the second child of each div --
    # confirm against the exporter's markup.
    if '5' in text_dict:
        for div in _divs_of_class(text_dict['5'], '5'):
            story.image_set.create(filename=div.contents[1].string.strip())

    # Photo credits are stored in the caption table alongside captions.
    for key in ('Caption', 'PhotoCredit'):
        if key in text_dict:
            for div in _divs_of_class(text_dict[key], key):
                story.caption_set.create(caption=div.prettify())

    if '15' in text_dict:
        for div in _divs_of_class(text_dict['15'], '15'):
            story.mug_set.create(filename=div.contents[1].string.strip())

    if 'Mugline' in text_dict:
        for div in _divs_of_class(text_dict['Mugline'], 'Mugline'):
            # Join every text node under the div, skipping bare newlines.
            pieces = [
                node.strip()
                for node in div.recursiveChildGenerator()
                if isinstance(node, unicode) and node != '\n'
            ]
            story.mugline_set.create(text=' '.join(pieces))

    if 'QuickFacts' in text_dict:
        for div in _divs_of_class(text_dict['QuickFacts'], 'QuickFacts'):
            story.quickfact_set.create(text=div.prettify())


def main():
    """Load one day's pickled story records and insert them into the database.

    The date comes from the first command-line argument (YYYYMMDD) or
    defaults to today.  If the day's pickle file cannot be opened a
    notification e-mail is sent and nothing is imported.

    For every record that has a 'Text' area plus at least one of the keys in
    slug_list, a Story is saved together with its images, captions, mugs,
    muglines and quick facts.

    Raises:
        DatabaseError: re-raised unchanged (after naming the offending
            story on stdout) when saving a story fails.
    """
    my_eight_digit_date, date_from_script = _run_date_from_argv(sys.argv)
    pickle_path = "/path/to/imported/pickle/file/pickled/" + my_eight_digit_date + ".pickle"

    try:
        pickle_file = open(pickle_path, "r")
    except IOError:  # something bad happened; no file to open
        message = '%s; no paper to insert today.' % my_eight_digit_date
        send_mail("Paper import didn't happen", message, 'recipient@email.com',
                  ['recipient@email.com'], fail_silently=False)
        return

    # SECURITY: pickle.load() can execute arbitrary code; only ever point
    # this script at pickle files produced by our own exporter.
    try:
        rec_list = pickle.load(pickle_file)
    finally:
        # Close straight away so the handle is released even if loading or
        # anything later in the import fails.
        pickle_file.close()

    for current_rec in rec_list:
        # Each record has each story's text areas in a list of dictionaries
        # under the key 'channel_text'.  For convenience, we consolidate all
        # the text areas in one dictionary.
        # Problem is that we can't use the consolidated dictionary for all
        # items, as there can be more than one Mug, Mugline, QuickFact, etc.
        # But it is good for the one-per-story text areas.
        consolidated_dict = {}
        for item in current_rec['channel_text']:
            consolidated_dict.update(item)

        # Filter out unwanted stories: we need body text plus at least one
        # text area a slug can be built from.
        if 'Text' not in consolidated_dict:
            continue
        if not any(key in consolidated_dict for key in slug_list):
            continue

        story_instance = Story(
            filename=current_rec['storyName'],
            run_date=date_from_script,
            turin_id=current_rec['storyId'],
            cms_id=current_rec['cmsStoryId'],
            page=int(current_rec['pageNum']),
            section_letter=current_rec['letter'],
            page_set=current_rec['pageSetName'],
            lede=False,
            live=True,
            length=str(current_rec['totalDepth']),
            # Missing/None values are stored as empty strings.
            author=current_rec['author'] or u'',
            origin=current_rec['origin'] or u'',
            subcategory=current_rec['subCategory'],
            # A missing or None seoURL becomes ''
            # (hack for Blue Chip subtlety, see bb.bz.propsales.0401).
            seo_url=current_rec.get('seoURL') or u'',
        )

        if 'Headline' in consolidated_dict:
            headline = striptags(consolidated_dict['Headline'])
            headline = headline.replace('\n', ' ')
            # Repair a UTF-8 e-acute that was mis-decoded as two characters.
            headline = headline.replace(u'\xc3\xa9', u'\xe9')
            # The cleaned headline is written back because it may also be
            # the slug source below.
            consolidated_dict['Headline'] = headline
            story_instance.headline = headline.strip()

        for key, attribute in _STORY_TEXT_FIELDS:
            if key in consolidated_dict:
                setattr(story_instance, attribute, consolidated_dict[key])

        # Figger out what the slug should be: first non-empty candidate wins.
        for DT_element in slug_list:
            slug_source = consolidated_dict.get(DT_element, '')
            if slug_source:
                # Trim the slug at 200 characters; FurnitureText in
                # particular can produce slugs several hundred characters
                # long (e.g. a whole "how to publish an anniversary" notice).
                story_instance.slug = slugify(_clean_text(slug_source))[:200]
                break

        # Series of hacks to create unique slugs for stories that get their
        # slugs from story sigs, which aren't unique enough.
        story_text = consolidated_dict.get('Text', '')
        for heading in _slug_disambiguators(current_rec['storyName'], story_text):
            story_instance.slug = (story_instance.slug + '-' + slugify(heading))[:200]

        try:
            story_instance.save()
        except psycopg2.ProgrammingError:
            print('Bad story: %s' % (current_rec['storyName'],))
            print('The slug: %s' % (story_instance.slug,))
            print('Consolidated dict: %s' % (consolidated_dict,))
            print(connection.queries)
            print('seo_url: %s' % (story_instance.seo_url,))
            # Nothing was saved, so there is no story to attach related
            # rows to; move on instead of looking up a story with id None.
            # NOTE(review): on PostgreSQL the failed statement may leave the
            # transaction aborted -- confirm whether a rollback is needed.
            continue
        except DatabaseError:
            print('Bad story: %s' % (current_rec['storyName'],))
            raise

        s = Story.objects.get(id=story_instance.id)
        print('Story %s saved.' % (s.filename,))

        # Make sure there's no out-of-sync between what was successfully
        # saved and the current_rec.
        if 'channel_text' in current_rec and current_rec['storyName'] == s.filename:
            for text_dict in current_rec['channel_text']:
                _attach_channel_items(s, text_dict)
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment