|
#!/usr/bin/env python |
|
import re |
|
import urllib2 |
|
import argparse |
|
from datetime import date, timedelta |
|
from bs4 import BeautifulSoup |
|
import pytumblr |
|
|
|
# Tumblr REST client. The four placeholders below must be replaced with
# real OAuth credentials from https://www.tumblr.com/oauth/apps before use.
CONSUMER_KEY = '<consumer_key>'
CONSUMER_SECRET = '<consumer_secret>'
OAUTH_TOKEN = '<oauth_token>'
OAUTH_SECRET = '<oauth_secret>'

client = pytumblr.TumblrRestClient(
    CONSUMER_KEY,
    CONSUMER_SECRET,
    OAUTH_TOKEN,
    OAUTH_SECRET,
)
|
|
|
# Command-line interface: one optional positional integer — how many days
# in the past the APOD entry to fetch is (omitted means today's entry).
parser = argparse.ArgumentParser()
parser.add_argument(
    'date',
    type=int,
    nargs='?',
    help='Number of days in the past. Leave blank for today.',
)
args = parser.parse_args()
|
|
|
# Resolve the target APOD date: `args.date` days back, or today when the
# argument was omitted (argparse leaves it as None).
days_back = args.date if args.date else 0
apod_date = date.today() - timedelta(days=days_back)
|
|
|
# Per-day APOD pages are named apYYMMDD.html under the site root,
# e.g. http://apod.nasa.gov/apod/ap150101.html for 2015-01-01.
apod_home_url = 'http://apod.nasa.gov/apod/'
apod_today_url = '%sap%s.html' % (apod_home_url, apod_date.strftime('%y%m%d'))
# Scheme-less variant used as the visible link text in the caption.
apod_clean_url = apod_today_url.replace('http://', '')
# Fetch the page and parse it; html5lib tolerates APOD's loose markup.
apod = urllib2.urlopen(apod_today_url).read()
soup = BeautifulSoup(apod, 'html5lib')
|
|
|
# Rewrite every link so it works outside the APOD site.
for a in soup.find_all('a'):
    # Anchors without an href would have raised KeyError below; skip them.
    if not a.has_attr('href'):
        continue
    # BUG FIX: the old test `re.match('^[^http]', href)` used a character
    # CLASS, so any relative href starting with 'h', 't', or 'p'
    # (e.g. 'page.html') was wrongly treated as already absolute and never
    # prefixed. Test the actual prefix instead.
    if not a['href'].startswith('http'):
        a['href'] = apod_home_url + a['href']
    # Get rid of line breaks that show up inside an `href`.
    a['href'] = a['href'].replace('\n', '')
|
|
|
# Strip leading and trailing whitespace inside <b>, <p>, and <i> tags.
# One loop replaces the three previous copy-pasted per-tag loops; the
# operations were identical and independent, so the effect is the same.
# `.string` is only set when the tag wraps a single text node, hence the guard.
for tag in soup.find_all(['b', 'p', 'i']):
    if tag.string:
        tag.string.replace_with(tag.string.strip())
|
|
|
# Entry title: first bold element inside the second <center> block.
title_el = soup.select('center + center > b:nth-of-type(1)')[0]
title = title_el.get_text(strip=True)

# Markdown heading carrying the date of this entry (second paragraph of
# the first <center> block).
date_el = soup.select('center:nth-of-type(1) p:nth-of-type(2)')[0]
dateheading = '# ' + date_el.get_text(strip=True)
|
def _clean_fragment(markup, tag):
    """Remove *tag*'s opening/closing markup from the HTML string *markup*,
    trim it, and collapse runs of newlines and spaces to single spaces.

    Extracted because the explanation block and the credits paragraph went
    through an identical, duplicated five-step pipeline.
    """
    text = markup.replace('<%s>' % tag, '')\
                 .replace('</%s>' % tag, '')\
                 .strip()
    text = re.sub(r'\n+', ' ', text)
    return re.sub(r' +', ' ', text)


# Main explanation block (second <center>) and the credits paragraph.
content1 = _clean_fragment(str(soup.select('center + center')[0]), 'center')
content2 = _clean_fragment(str(soup.select('center + p')[0]), 'p')
|
# Attribution line linking back to the original APOD page.
content3 = '∞ Source: <a href="%s">%s</a>' % (apod_today_url, apod_clean_url)

# Full post caption: heading, explanation, credits, and source link joined
# by blank lines. The byte-string pieces are decoded so everything joins
# as unicode (dateheading already is, via get_text).
caption = '\n\n'.join([
    dateheading,
    content1.decode('utf-8'),
    content2.decode('utf-8'),
    content3.decode('utf-8'),
])
|
|
|
# APOD marks an image day with a linked <a> inside the second paragraph of
# the first <center>; otherwise that slot holds an embedded iframe
# (hopefully a YouTube video).
media_slot = 'center:nth-of-type(1) p:nth-of-type(2)'
anchors = soup.select(media_slot + ' a')

if anchors:
    # Image post: the anchor's target is the full-size picture.
    image = anchors[0]['href']
    client.create_photo('apod', source=image.encode('utf-8'),
                        caption=caption.encode('utf-8'),
                        slug=title.encode('utf-8'),
                        format='markdown')
else:
    # Video post: turn the embed URL into a plain watch URL.
    src = soup.select(media_slot + ' iframe')[0]['src']
    image = src.replace('/embed/', '/watch?v=').replace('?rel=0', '')
    client.create_video('apod', embed=image.encode('utf-8'),
                        caption=caption.encode('utf-8'),
                        slug=title.encode('utf-8'),
                        format='markdown')
|
|
|
# Debug helpers — uncomment to inspect the intermediate values without posting:
# print image + '\n'
# print title + '\n'
# print apod_today_url + '\n'
# print dateheading + '\n'
# print content1 + '\n'
# print content2
# print caption