Created
March 1, 2012 21:54
-
-
Save jheasly/1953486 to your computer and use it in GitHub Desktop.
Jython script to connect to Caché database, export stories to a Python pickle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env /opt/local/bin/jython | |
# -*- coding: utf-8 -*- | |
from java.lang import * | |
from java.sql import * | |
def main(): | |
import sys, datetime, cPickle, codecs | |
sys.path.append('/opt/local/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/site-packages/') | |
from BeautifulSoup import BeautifulSoup | |
if len(sys.argv) > 1: | |
eight_digit_date = sys.argv[1][:4] + "-" + sys.argv[1][4:6] + "-" + sys.argv[1][6:] | |
date_from_script = datetime.date(int(sys.argv[1][:4]), int(sys.argv[1][4:6]), int(sys.argv[1][6:])) | |
else: | |
today = datetime.date.today() | |
eight_digit_date = today.strftime("%Y-%m-%d") | |
date_from_script = today | |
outfile_eight_digit_date = eight_digit_date.replace('-','') | |
out_tag = codecs.open('/tmp/' + outfile_eight_digit_date + '.pickle', 'w', "utf-8") | |
sql = '''SELECT sty.storyId, cmsStory.Id, sty.storyname, api.pagesetname, api.letter, api.pagenum, totalDepth, author, origin, subcategoryid, seourl | |
FROM dbo.addbpageinfo api, dbo.storypageelements spe, dbo.story sty, dt_cms_schema.CMSStory | |
WHERE api.logicalPageId = spe.logicalPagesId | |
AND sty.storyId = spe.storyId | |
AND subCategoryId <> 0 | |
AND NOT api.code = 'TMC' | |
AND (sty.statusId = 1018 or sty.statusId = 1019 or sty.statusId = 10) | |
AND cast (rundate as date) = cast ('%s' as date) | |
AND (numLines > 1 or words > 5) | |
AND (SELECT sum(isOnLayout) FROM dbo.storyelement WHERE storyid = sty.storyId) > 0 | |
AND story->storyid=sty.storyid | |
GROUP BY sty.Id ORDER BY api.pageNum desc,totalDepth ASC''' % eight_digit_date | |
Class.forName("com.intersys.jdbc.CacheDriver").newInstance() | |
con = DriverManager.getConnection('jdbc:Cache://subdomain.domainname.com:60401/cms', '[username here]', '[password here]') | |
stmt = con.createStatement() | |
rs = stmt.executeQuery(sql) | |
# rs = | |
# storyId cmsId storyName pageSet letter page length author origin , subCatId, seoUrl | |
# 13071467 26030171 a10.int.haiti.0321 MAIN NEWS A 10 555.123779 RG WFM 51 ELECTION-PRESIDENT-EARTHQUAKE-HAITIAN-HAITIANS | |
out_list = [] | |
while (rs.next()): | |
if rs.getString(11): | |
seoURL = rs.getString(11).encode('utf8') | |
else: | |
seoURL = rs.getString(11) | |
print rs.getString(1), rs.getString(2), rs.getString(3), rs.getString(4), rs.getString(5), rs.getString(6), rs.getString(7), rs.getString(8), rs.getString(9), rs.getString(10), seoURL | |
channel_text_list = [] | |
out_record = {} | |
tstmt = con.createStatement() | |
tsql = '''SELECT channelText, xmlTagsId | |
FROM storyElement | |
WHERE storyId = %u | |
AND isOnLayout=1 | |
AND xmltagsid IN (1, 2, 3, 4, 7, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 23, 24, 430717) | |
AND ((SELECT count(*) FROM storyElement WHERE storyId = %u AND isOnLayout=1 AND xmltagsid =3) = 1)''' % (int(rs.getString(1)), int(rs.getString(1))) | |
trs = tstmt.executeQuery(tsql) | |
while (trs.next()): | |
if trs.getString(1): | |
soup = BeautifulSoup(trs.getString(1)) | |
try: | |
print trs.getString(1)[:50] | |
div_class = soup.findAll('div')[0]['class'] | |
print ' ', div_class | |
text_string = trs.getString(1) | |
# | |
# http://codespeak.net/svn/pypy/dist/lib-python/modified-2.5.2/encodings/iso8859_8.py | |
# | |
text_string = text_string.replace(u'\xa0', '') # NO-BREAK SPACE, from table at above page | |
text_string = text_string.replace(u'\xad', '') # SOFT HYPHEN, from table at above page | |
text_string = text_string.replace(u'&', '&') # SOFT HYPHEN, from table at above page | |
text_string = text_string.encode('utf8') | |
channel_text_list.append({div_class: text_string}) | |
except TypeError: | |
print '>>> Element', trs.getString(2), 'missing from', rs.getString(3) | |
out_record = {'channel_text': channel_text_list } | |
out_record.update({ | |
'storyId': rs.getString(1), | |
'cmsStoryId': rs.getString(2), | |
'storyName': rs.getString(3), | |
'pageSetName': rs.getString(4), | |
'letter': rs.getString(5), | |
'pageNum': rs.getString(6), | |
'totalDepth': rs.getString(7), | |
'author': rs.getString(8), | |
'origin': rs.getString(9), | |
'subCategory': rs.getString(10), | |
'seoURL': seoURL, | |
}) | |
print out_record | |
out_list.append(out_record) | |
cPickle.dump(out_list, out_tag) | |
tstmt.close() | |
trs.close() | |
rs.close() | |
stmt.close() | |
con.close() | |
out_tag.close() | |
if __name__ == "__main__" : main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment