Skip to content

Instantly share code, notes, and snippets.

@jheasly
Created March 1, 2012 21:54
Show Gist options
  • Save jheasly/1953486 to your computer and use it in GitHub Desktop.
Save jheasly/1953486 to your computer and use it in GitHub Desktop.
Jython script to connect to Caché database, export stories to a Python pickle
#! /usr/bin/env /opt/local/bin/jython
# -*- coding: utf-8 -*-
from java.lang import *
from java.sql import *
def main():
import sys, datetime, cPickle, codecs
sys.path.append('/opt/local/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/site-packages/')
from BeautifulSoup import BeautifulSoup
if len(sys.argv) > 1:
eight_digit_date = sys.argv[1][:4] + "-" + sys.argv[1][4:6] + "-" + sys.argv[1][6:]
date_from_script = datetime.date(int(sys.argv[1][:4]), int(sys.argv[1][4:6]), int(sys.argv[1][6:]))
else:
today = datetime.date.today()
eight_digit_date = today.strftime("%Y-%m-%d")
date_from_script = today
outfile_eight_digit_date = eight_digit_date.replace('-','')
out_tag = codecs.open('/tmp/' + outfile_eight_digit_date + '.pickle', 'w', "utf-8")
sql = '''SELECT sty.storyId, cmsStory.Id, sty.storyname, api.pagesetname, api.letter, api.pagenum, totalDepth, author, origin, subcategoryid, seourl
FROM dbo.addbpageinfo api, dbo.storypageelements spe, dbo.story sty, dt_cms_schema.CMSStory
WHERE api.logicalPageId = spe.logicalPagesId
AND sty.storyId = spe.storyId
AND subCategoryId <> 0
AND NOT api.code = 'TMC'
AND (sty.statusId = 1018 or sty.statusId = 1019 or sty.statusId = 10)
AND cast (rundate as date) = cast ('%s' as date)
AND (numLines > 1 or words > 5)
AND (SELECT sum(isOnLayout) FROM dbo.storyelement WHERE storyid = sty.storyId) > 0
AND story->storyid=sty.storyid
GROUP BY sty.Id ORDER BY api.pageNum desc,totalDepth ASC''' % eight_digit_date
Class.forName("com.intersys.jdbc.CacheDriver").newInstance()
con = DriverManager.getConnection('jdbc:Cache://subdomain.domainname.com:60401/cms', '[username here]', '[password here]')
stmt = con.createStatement()
rs = stmt.executeQuery(sql)
# rs =
# storyId cmsId storyName pageSet letter page length author origin , subCatId, seoUrl
# 13071467 26030171 a10.int.haiti.0321 MAIN NEWS A 10 555.123779 RG WFM 51 ELECTION-PRESIDENT-EARTHQUAKE-HAITIAN-HAITIANS
out_list = []
while (rs.next()):
if rs.getString(11):
seoURL = rs.getString(11).encode('utf8')
else:
seoURL = rs.getString(11)
print rs.getString(1), rs.getString(2), rs.getString(3), rs.getString(4), rs.getString(5), rs.getString(6), rs.getString(7), rs.getString(8), rs.getString(9), rs.getString(10), seoURL
channel_text_list = []
out_record = {}
tstmt = con.createStatement()
tsql = '''SELECT channelText, xmlTagsId
FROM storyElement
WHERE storyId = %u
AND isOnLayout=1
AND xmltagsid IN (1, 2, 3, 4, 7, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 23, 24, 430717)
AND ((SELECT count(*) FROM storyElement WHERE storyId = %u AND isOnLayout=1 AND xmltagsid =3) = 1)''' % (int(rs.getString(1)), int(rs.getString(1)))
trs = tstmt.executeQuery(tsql)
while (trs.next()):
if trs.getString(1):
soup = BeautifulSoup(trs.getString(1))
try:
print trs.getString(1)[:50]
div_class = soup.findAll('div')[0]['class']
print ' ', div_class
text_string = trs.getString(1)
#
# http://codespeak.net/svn/pypy/dist/lib-python/modified-2.5.2/encodings/iso8859_8.py
#
text_string = text_string.replace(u'\xa0', '') # NO-BREAK SPACE, from table at above page
text_string = text_string.replace(u'\xad', '') # SOFT HYPHEN, from table at above page
text_string = text_string.replace(u'&amp;', '&') # SOFT HYPHEN, from table at above page
text_string = text_string.encode('utf8')
channel_text_list.append({div_class: text_string})
except TypeError:
print '>>> Element', trs.getString(2), 'missing from', rs.getString(3)
out_record = {'channel_text': channel_text_list }
out_record.update({
'storyId': rs.getString(1),
'cmsStoryId': rs.getString(2),
'storyName': rs.getString(3),
'pageSetName': rs.getString(4),
'letter': rs.getString(5),
'pageNum': rs.getString(6),
'totalDepth': rs.getString(7),
'author': rs.getString(8),
'origin': rs.getString(9),
'subCategory': rs.getString(10),
'seoURL': seoURL,
})
print out_record
out_list.append(out_record)
cPickle.dump(out_list, out_tag)
tstmt.close()
trs.close()
rs.close()
stmt.close()
con.close()
out_tag.close()
if __name__ == "__main__" : main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment