Created
March 1, 2012 21:55
-
-
Save jheasly/1953501 to your computer and use it in GitHub Desktop.
Unpickle the day's story records and insert them into a Django PostgreSQL database.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
from BeautifulSoup import BeautifulSoup | |
import tagfile, datetime, sys, re, pickle, psycopg2 | |
from os import environ | |
sys.path.append('/directory/nuther_directory') | |
sys.path.append('/directory/nuther_directory/project_directory') | |
environ['DJANGO_SETTINGS_MODULE'] = 'project_directory.settings' | |
from turin.models import Story, Image, Mugline | |
from django.contrib.sites.models import Site | |
from django.conf import settings | |
from django.core.mail import send_mail | |
from django.template.defaultfilters import slugify, striptags | |
from django.db import connection | |
from django.db.utils import DatabaseError | |
from django.db import utils | |
# Text-area names that may supply a story's slug, listed in priority order:
# the first one holding a non-empty value wins.
# Note: main() only imports a story when it carries at least one of these
# elements, so its story filter must stay in step with this list.
slug_list = [
    'Headline', 'Deck', 'Header', 'StorySig',
    'Mugline', 'FurnitureText', 'QuickRead',
]
# One-per-story text areas copied unchanged onto the Story instance:
# (text-area name, Story attribute name).
_PLAIN_TEXT_FIELDS = (
    ('Header', 'header'),
    ('Deck', 'deck'),
    ('FurnitureText', 'furniture_text'),
    ('Text', 'text'),
    ('Byline', 'byline'),
    ('QuickRead', 'quickread'),
    ('Pullquote', 'pullquote'),
    ('StorySig', 'story_sig'),
    ('Tagline', 'tagline'),
)

# Matches an HTML entity; used to lower-case entities again after the
# surrounding text has been upper-cased (e.g. &AMP; => &amp;).
_ENTITY_RE = re.compile('&([^;]+);')


def _clean_text(q):
    """Normalise a text area before it is slugified.

    Strips tags, removes runs of two or more 'H' characters, turns carriage
    returns into spaces, trims and upper-cases the text, then lower-cases
    any HTML entities that the upper-casing touched.
    """
    q = striptags(q)
    q = re.sub('H[H]+', '', q)
    q = q.replace('\r', ' ')
    q = q.strip().upper()
    return _ENTITY_RE.sub(lambda match: match.group(0).lower(), q)


def _append_heading_to_slug(story, html, css_class):
    """Append the first <p class=css_class> heading in html to story.slug.

    The combined slug is trimmed to 200 characters. Returns True when a
    matching paragraph exists (even an empty one, which adds nothing) and
    False otherwise.
    """
    paragraphs = BeautifulSoup(html).findAll('p', {'class': css_class})
    if not paragraphs:
        return False
    # Guard against an empty <p></p>, which has no first child to slugify.
    if paragraphs[0].contents:
        heading = paragraphs[0].contents[0]
        story.slug = (story.slug + '-' + slugify(heading))[:200]
    return True


def _disambiguate_slug(story, story_name, text):
    """Make slugs unique for stories whose slug source is not unique enough.

    Digests, letters to the editor, scoreboards and calendars get the first
    matching heading found in the story text appended to their slug.
    """
    if not text:
        return

    # Digests (but not obituaries).
    if ('<p class="Headlines-DigestHead">' in text
            and 'cr.obit' not in story_name):
        _append_heading_to_slug(story, text, 'Headlines-DigestHead')

    # Append headline of first letter to 'letters-in-the-editors-mailbag'
    # slug on Saturdays, when otherwise there'd be two files with same slug.
    if '<p class="Editorial-Edit_LetterHead">' in text:
        _append_heading_to_slug(story, text, 'Editorial-Edit_LetterHead')

    # Scoreboards: prefer the reverse-bar standing head, fall back to the
    # first agate sports head.
    is_scoreboard = 'board' in story_name and (
        '<p class="StandingHeads-Style_8_ReverseBar_Inline">' in text
        or '<p class="AgateSports-Head_1">Standings</p>' in text
    )
    if is_scoreboard:
        found = _append_heading_to_slug(
            story, text, 'StandingHeads-Style_8_ReverseBar_Inline')
        if not found and '<p class="AgateSports-Head_1">' in text:
            _append_heading_to_slug(story, text, 'AgateSports-Head_1')

    # Calendars.
    if ('<p class="AgateNews-Agate_News_Head10">' in text
            and 'calendar' in story_name):
        _append_heading_to_slug(story, text, 'AgateNews-Agate_News_Head10')


def _build_story(current_rec, consolidated_dict, run_date):
    """Create (without saving) a Story from one record and its text areas.

    consolidated_dict is the record's one-per-story text areas merged into a
    single dict; its 'Headline' entry is replaced by the cleaned headline.
    """
    story = Story(
        filename=current_rec['storyName'],
        run_date=run_date,
        turin_id=current_rec['storyId'],
        cms_id=current_rec['cmsStoryId'],
        page=int(current_rec['pageNum']),
        section_letter=current_rec['letter'],
        page_set=current_rec['pageSetName'],
        lede=False,
        live=True,
        length=str(current_rec['totalDepth']),
        author=current_rec['author'] or u'',
        origin=current_rec['origin'] or u'',
        subcategory=current_rec['subCategory'],
        # A missing or empty seoURL is stored as an empty string
        # (hack for Blue Chip subtlety, see bb.bz.propsales.0401).
        seo_url=current_rec.get('seoURL') or u'',
    )

    if 'Headline' in consolidated_dict:
        headline = striptags(consolidated_dict['Headline'])
        headline = headline.replace('\n', ' ')
        # Repair a doubly-encoded e-acute.
        headline = headline.replace(u'\xc3\xa9', u'\xe9')
        consolidated_dict['Headline'] = headline
        story.headline = headline.strip()

    for key, attr in _PLAIN_TEXT_FIELDS:
        if key in consolidated_dict:
            setattr(story, attr, consolidated_dict[key])

    # Figure out what the slug should be: the first non-empty element of
    # slug_list wins. Trim at 200 characters to cope with paragraph-long
    # FurnitureText values that would otherwise produce enormous slugs.
    for element in slug_list:
        slug_source = consolidated_dict.get(element, '')
        if slug_source:
            story.slug = slugify(_clean_text(slug_source))[:200]
            break

    return story


def _divs(text_dict, key):
    """Return the <div class=key> tags in text_dict[key], or [] if absent."""
    if key not in text_dict:
        return []
    return BeautifulSoup(text_dict[key]).findAll('div', key)


def _save_attachments(story, channel_text):
    """Create the related rows of a saved story.

    Walks every text-area dict of the record, because a story can carry more
    than one image, caption, mug, mugline or quick fact.
    """
    for text_dict in channel_text:
        for div in _divs(text_dict, '5'):
            story.image_set.create(filename=div.contents[1].string.strip())
        for div in _divs(text_dict, 'Caption'):
            story.caption_set.create(caption=div.prettify())
        # PhotoCredits are stored as captions too.
        for div in _divs(text_dict, 'PhotoCredit'):
            story.caption_set.create(caption=div.prettify())
        for div in _divs(text_dict, '15'):
            story.mug_set.create(filename=div.contents[1].string.strip())
        for div in _divs(text_dict, 'Mugline'):
            # Join every text node under the div, skipping bare newlines.
            mugline_content = ' '.join(
                node.strip() for node in div.recursiveChildGenerator()
                if isinstance(node, unicode) and node != '\n'
            )
            story.mugline_set.create(text=mugline_content)
        for div in _divs(text_dict, 'QuickFacts'):
            story.quickfact_set.create(text=div.prettify())


def main():
    """Import one day's pickled story records into the database.

    The day is taken from the first command-line argument (YYYYMMDD) and
    defaults to today. If that day's pickle file cannot be opened, a
    notification e-mail is sent and nothing is imported. Each record that has
    a 'Text' area plus at least one slug_list element is saved as a Story,
    followed by its images, captions, mugs, muglines and quick facts.

    Raises:
        DatabaseError: re-raised when saving a story fails, after the
            offending story name has been printed.
    """
    if len(sys.argv) > 1:
        my_eight_digit_date = sys.argv[1]
        date_from_script = datetime.date(
            int(my_eight_digit_date[:4]),
            int(my_eight_digit_date[4:6]),
            int(my_eight_digit_date[6:]),
        )
    else:
        date_from_script = datetime.date.today()
        my_eight_digit_date = date_from_script.strftime("%Y%m%d")

    try:
        b = open("/path/to/imported/pickle/file/pickled/" + my_eight_digit_date + ".pickle", "r")
    except IOError:  # something bad happened; no file to open
        message = '%s; no paper to insert today.' % my_eight_digit_date
        send_mail('Paper import didn\'t happen', message, 'recipient@email.com', ['recipient@email.com',], fail_silently=False)
        return

    # NOTE(review): unpickling can execute arbitrary code; this is only safe
    # while the pickle directory is written by a trusted process -- confirm.
    # The context manager closes the file even when unpickling fails.
    with b:
        rec_list = pickle.load(b)

    for current_rec in rec_list:
        # Each record has each story's text areas in a list of dictionaries
        # under 'channel_text'. For convenience, consolidate them into one
        # dictionary. That is only good for the one-per-story text areas:
        # there can be more than one Mug, Mugline, QuickFact, etc., and those
        # are read from the individual dictionaries when attachments are
        # saved.
        consolidated_dict = {}
        for item in current_rec['channel_text']:
            consolidated_dict.update(item)

        # Filter out unwanted stories: body text plus something to build a
        # slug from are both required.
        if 'Text' not in consolidated_dict:
            continue
        if not any(key in consolidated_dict for key in slug_list):
            continue

        story_instance = _build_story(
            current_rec, consolidated_dict, date_from_script)
        _disambiguate_slug(
            story_instance,
            current_rec['storyName'],
            consolidated_dict.get('Text', ''),
        )

        try:
            story_instance.save()
        except psycopg2.ProgrammingError:
            print('Bad story: %s' % current_rec['storyName'])
            print('The slug: %s' % story_instance.slug)
            print('Consolidated dict: %s' % (consolidated_dict,))
            print(connection.queries)
            print('seo_url: %s' % story_instance.seo_url)
            # Nothing was saved, so there is no Story row to attach images
            # or captions to; move on instead of looking up id None.
            continue
        except DatabaseError:
            print('Bad story: %s' % current_rec['storyName'])
            # Bare raise keeps the original exception and traceback.
            raise

        s = Story.objects.get(id=story_instance.id)
        print('Story %s saved.' % s.filename)

        # Make sure there's no out-of-sync between what was successfully
        # saved and the current_rec.
        if current_rec['storyName'] == s.filename:
            _save_attachments(s, current_rec['channel_text'])
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment