Skip to content

Instantly share code, notes, and snippets.

@psychemedia
Created October 23, 2010 18:30
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save psychemedia/642527 to your computer and use it in GitHub Desktop.
Save psychemedia/642527 to your computer and use it in GitHub Desktop.
import csv
import re
import urllib
import xml.sax.saxutils as saxutils
from urlparse import urljoin, urlparse

import simplejson
# CSV file handed to opmlFromCSV() by the call at the bottom of this script;
# each row is unpacked there as a (url, tag) pair.
fname="homepageurls2.csv"
def opmlFromCSV(fname):
    """Build the OPML file ``test.xml`` from a CSV of (url, tag) rows.

    Every run of consecutive rows sharing the same tag becomes one
    ``<outline>`` group; each row's url cell is passed to handleOPMLitem,
    which appends any feeds it discovers to the currently open group.

    fname -- path of the CSV file to read; rows must have two columns
             (url, tag). Blank rows and rows with a different number of
             columns are reported and skipped instead of aborting the run.
    """
    fout = "test"
    # 'with' guarantees both files are closed even if a row blows up.
    with open(fout + '.xml', 'w') as fo:
        writeOPMLHeadopenBody(fo)
        # "rb" is the mode the Python 2 csv module expects.
        with open(fname, "rb") as src:
            outline_open = False
            curr = None  # sentinel: no group seen yet (a real tag may be '')
            for row in csv.reader(src):
                if not row:
                    continue  # csv.reader yields [] for blank lines
                if len(row) != 2:
                    print("Skipping malformed row %r" % (row,))
                    continue
                url, tag = row
                if not outline_open or curr != tag:
                    if outline_open:
                        closeOPMLoutline(fo)
                    curr = tag
                    openOPMLoutline(fo, tag)
                    outline_open = True
                handleOPMLitem(fo, url)
            # Only close a group that was actually opened, so an empty CSV
            # does not leave a stray </outline> in the output.
            if outline_open:
                closeOPMLoutline(fo)
        closeOPMLbody(fo)
def handleOPMLitem(fo, url):
    """Autodetect feeds for every http(s) URL found in a CSV cell.

    Each URL found in ``url`` is sent to the ``feedautodetect`` YQL query;
    every feed link that comes back is resolved against the page URL with
    checkPathOnFeedURL and handed to handleFeedDetails, which writes the
    OPML entry to ``fo``.

    fo  -- writable file object receiving the OPML output.
    url -- text that may contain zero or more http/https URLs.

    Lookup failures are reported on stdout and the next URL is tried; this
    is a best-effort batch job, so one bad page never stops the run.
    """
    if url == '':
        return
    # Raw string so the backslashes reach the regex engine untouched.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    for page_url in re.findall(url_pattern, url):
        print("testing %s" % page_url)
        try:
            query_url = ('http://query.yahooapis.com/v1/public/yql/psychemedia/feedautodetect?url='
                         + urllib.quote(page_url) + '&format=json')
            data = simplejson.load(urllib.urlopen(query_url))
            query = data['query']
            # Compare numerically: a text comparison against '0' silently
            # fails if the service ever returns count as a JSON number.
            count = int(query['count'])
            if count < 1:
                continue
            print(query)
            links = query['results']['link']
            if count == 1:
                # A single hit arrives as a bare object rather than a list.
                links = [links]
                prefix = '***** '
            else:
                prefix = ''
            for link in links:
                furl = checkPathOnFeedURL(link['href'], page_url)
                print('%s%s %s' % (prefix, furl, link['title']))
                handleFeedDetails(fo, furl)
        except Exception as err:
            # Per-URL boundary: report and carry on with the next URL.
            print('Feed autodetection failed for %s: %r' % (page_url, err))
def checkPathOnFeedURL(furl, o):
    """Return the feed URL ``furl`` as an absolute URL.

    furl -- href taken from a feed link; may be absolute, host-relative
            ('/feed'), protocol-relative ('//host/feed') or
            document-relative ('rss.xml').
    o    -- URL of the page the link was found on, used as the base.

    Absolute URLs and empty strings are returned unchanged. Everything else
    is resolved against ``o``, so the page's own scheme (http or https) is
    kept instead of always forcing 'http://'.
    """
    if not furl or urlparse(furl).scheme:
        return furl
    return urljoin(o, furl)
def writeOPMLHeadopenBody(fo):
    """Write the XML declaration, the OPML <head> section and the opening
    <body> tag to the file object ``fo``."""
    xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>\n'
    # Adjacent literals are joined at compile time into one string.
    opml_preamble = (
        '<opml version="1.0">\n'
        '<head>\n'
        '\t<title>Generated OPML file</title>\n'
        '</head>\n'
        '\t<body>\n'
    )
    fo.write(xml_declaration)
    fo.write(opml_preamble)
def closeOPMLbody(fo):
    """Write the closing </body> and </opml> tags to the file object ``fo``."""
    closing_tags = "</body>\n</opml>"
    fo.write(closing_tags)
def openOPMLoutline(f, t):
    """Write the opening tag of an OPML outline group to ``f``.

    f -- writable file object receiving the OPML output.
    t -- group label, used for both the ``title`` and ``text`` attributes.

    The label is XML-escaped (including double quotes, since it sits inside
    a double-quoted attribute) so tags containing &, < or " still produce
    well-formed OPML.
    """
    label = saxutils.escape(t, {'"': '&quot;'})
    f.write('\t\t<outline title="' + label + '" text="' + label + '">\n')
def closeOPMLoutline(f):
    """Write the closing tag of an OPML outline group to the file object ``f``."""
    closing_tag = '\t\t</outline>\n'
    f.write(closing_tag)
def writeOPMLitem(f, htmlurl, xmlurl, title):
    """Write one self-closing RSS ``<outline>`` entry to ``f``.

    f       -- writable file object receiving the OPML output.
    htmlurl -- value for the ``htmlUrl`` attribute.
    xmlurl  -- value for the ``xmlUrl`` attribute.
    title   -- label used for both the ``text`` and ``title`` attributes.

    All three values are XML-escaped, double quotes included, because they
    are emitted inside double-quoted attributes; an unescaped & in a URL
    query string or a " in a title would make the file unparseable.
    """
    quote_entity = {'"': '&quot;'}
    title = saxutils.escape(title, quote_entity)
    xmlurl = saxutils.escape(xmlurl, quote_entity)
    htmlurl = saxutils.escape(htmlurl, quote_entity)
    f.write('\t\t\t<outline text="' + title + '" title="' + title
            + '" type="rss" xmlUrl="' + xmlurl
            + '" htmlUrl="' + htmlurl + '"/>\n')
def handleFeedDetails(fo, furl, nocomments=True):
    """Look up a feed's title and home page and write its OPML entry.

    The ``feeddetails`` YQL query is run for ``furl``; the first returned
    feed whose link has rel == 'alternate' supplies the title and the HTML
    link, and is written to ``fo`` with writeOPMLitem. Processing stops
    after that first matching feed.

    fo         -- writable file object receiving the OPML output.
    furl       -- absolute URL of the feed.
    nocomments -- when true (the default), feeds whose URL contains
                  '/comments' or whose title starts with 'Comments for'
                  are skipped rather than written.

    Lookup or parsing failures are reported on stdout and the feed is
    skipped; nothing is raised to the caller.
    """
    print("Trying feed url %s" % furl)
    try:
        url = ('http://query.yahooapis.com/v1/public/yql/psychemedia/feeddetails?url='
               + urllib.quote(furl) + '&format=json')
        details = simplejson.load(urllib.urlopen(url))
        feeds = details['query']['results']['feed']
        # NOTE(review): presumably a lone feed can arrive as a bare object
        # (as single 'link' results do in the autodetect query) - confirm.
        # Wrapping it avoids iterating over the dict's keys by mistake.
        if isinstance(feeds, dict):
            feeds = [feeds]
        for entry in feeds:
            link = entry['link']
            if link['rel'] != 'alternate':
                continue
            title = entry['title'].encode('utf-8')
            hlink = link['href']
            print('Using %s %s %s' % (hlink, furl, title))
            is_comment_feed = ('/comments' in furl
                               or title.startswith('Comments for'))
            if not (nocomments and is_comment_feed):
                writeOPMLitem(fo, hlink, furl, title)
            return
    except Exception as err:
        # Best-effort: report the failure instead of hiding it, then move on.
        print('Could not read feed details for %s: %r' % (furl, err))
#-------
# Script entry: runs on execution (and on import) and builds test.xml from
# the CSV named by the module-level fname.
opmlFromCSV(fname)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment