Skip to content

Instantly share code, notes, and snippets.

@trey
Last active January 7, 2017 17:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save trey/ab607c8ce955e583b1f3db7b3780ae29 to your computer and use it in GitHub Desktop.
Save trey/ab607c8ce955e583b1f3db7b3780ae29 to your computer and use it in GitHub Desktop.
What I used to use to scrape APOD for my Tumblr mirror. This is outdated, but I'm keeping it here for the sake of its history.
#!/usr/bin/env sh
# Run the APOD scraper inside its `apod` virtualenv.
#
# Usage: apod [DAYS_AGO]
#   DAYS_AGO  optional argument forwarded to scrape.py; with no argument
#             (or more than one) scrape.py is run without arguments.
#
# NOTE(review): `source` is a bash/zsh builtin rather than POSIX sh; this
# relies on `sh` being a bash-compatible shell — confirm on the target host.

# Abort if the project directory is missing, so ./scrape.py is never
# resolved relative to some other directory.
cd "$HOME/Code/apod" || exit 1

export WORKON_HOME="$HOME/.virtualenvs"
export VIRTUALENVWRAPPER_PYTHON=/usr/local/bin/python
source /usr/local/bin/virtualenvwrapper.sh
workon apod

if [ $# -eq 1 ]
then
    ./scrape.py "$1"
else
    ./scrape.py
fi

deactivate

Set up a recurring task to run this script every morning at 3am:

crontab -e
0 3 * * *  ~/bin/common/apod

Then go into System Preferences > Energy Saver and set a schedule so the computer is awake when the 3am task runs:

  • Start up or wake Every Day at 2:59 AM
  • Sleep Every Day at 3:01 AM
PyTumblr==0.0.6
beautifulsoup4==4.5.0
html5lib==1.0b10
httplib2==0.9.2
httpretty==0.8.14
oauth2==1.9.0.post1
pep8==1.7.0
six==1.10.0
webencodings==0.5
wsgiref==0.1.2
#!/usr/bin/env python
import re
import urllib2
import argparse
from datetime import date, timedelta
from bs4 import BeautifulSoup
import pytumblr
# Scrape one Astronomy Picture of the Day page and repost it to the `apod`
# Tumblr blog as a photo post (or a video post when the page has no image
# link). Takes one optional positional argument: how many days back to go.

# Tumblr API client; the placeholders must be replaced with real credentials.
client = pytumblr.TumblrRestClient(
    '<consumer_key>',
    '<consumer_secret>',
    '<oauth_token>',
    '<oauth_secret>',
)

parser = argparse.ArgumentParser()
parser.add_argument('date', type=int,
                    nargs='?',
                    help='Number of days in the past. Leave blank for today.')
args = parser.parse_args()

# A missing argument (None) and an explicit 0 both mean "today".
apod_date = date.today() - timedelta(days=args.date or 0)

apod_home_url = 'http://apod.nasa.gov/apod/'
apod_today_url = apod_home_url + 'ap' + apod_date.strftime('%y%m%d') + '.html'
# Same URL without the scheme, used as the visible text of the source link.
apod_clean_url = apod_today_url.replace('http://', '')

# Close the HTTP response explicitly instead of leaking the connection.
response = urllib2.urlopen(apod_today_url)
try:
    apod = response.read()
finally:
    response.close()

soup = BeautifulSoup(apod, 'html5lib')

# Only anchors that actually carry an `href`; subscripting an anchor without
# one would raise KeyError.
for a in soup.find_all('a', href=True):
    # Get rid of line breaks that show up inside an `href`. Done before the
    # prefix test so a leading line break cannot hide an absolute URL.
    href = a['href'].replace('\n', '')
    # Prepend `apod_home_url` to any url that isn't already absolute. A real
    # prefix test is required here: the regex '^[^http]' is a character class
    # and skips every relative link that starts with 'h', 't' or 'p'.
    if not href.startswith(('http://', 'https://')):
        href = apod_home_url + href
    a['href'] = href

# Strip all the leading and trailing whitespace inside tags that hold a
# single string.
for tag in soup.find_all(['b', 'p', 'i']):
    if tag.string:
        tag.string.replace_with(tag.string.strip())


def _flatten_markup(element, tag_name):
    """Return `element` serialized without its bare `tag_name` wrapper tags.

    Runs of newlines and then runs of spaces are collapsed to a single space.
    """
    markup = str(element)\
        .replace('<%s>' % tag_name, '')\
        .replace('</%s>' % tag_name, '')\
        .strip()
    markup = re.sub('\n+', ' ', markup)
    return re.sub(' +', ' ', markup)


title = soup.select('center + center > b:nth-of-type(1)')[0]\
    .get_text(strip=True)

# Second paragraph of the first <center>: its text is used for the date
# heading and it contains the image link or video iframe.
media_selector = 'center:nth-of-type(1) p:nth-of-type(2)'

# Markdown heading with the date of this entry.
dateheading = '# ' + soup.select(media_selector)[0].get_text(strip=True)

content1 = _flatten_markup(soup.select('center + center')[0], 'center')
content2 = _flatten_markup(soup.select('center + p')[0], 'p')
content3 = '&infin; Source: <a href="' + apod_today_url + '">'\
    + apod_clean_url + '</a>'

caption = dateheading +\
    '\n\n' + content1.decode('utf-8') +\
    '\n\n' + content2.decode('utf-8') +\
    '\n\n' + content3.decode('utf-8')

image_links = soup.select(media_selector + ' a')
if image_links:
    # There's an image here.
    image = image_links[0]['href']
    client.create_photo('apod', source=image.encode('utf-8'),
                        caption=caption.encode('utf-8'),
                        slug=title.encode('utf-8'),
                        format='markdown')
else:
    # No image for this one. It's hopefully a YouTube video.
    iframes = soup.select(media_selector + ' iframe')
    if not iframes:
        # Fail with a readable message rather than a bare IndexError.
        raise SystemExit('No image link or video iframe found at '
                         + apod_today_url)
    # Turn the embed URL into a regular watch URL.
    image = iframes[0]['src']\
        .replace('/embed/', '/watch?v=')\
        .replace('?rel=0', '')
    client.create_video('apod', embed=image.encode('utf-8'),
                        caption=caption.encode('utf-8'),
                        slug=title.encode('utf-8'),
                        format='markdown')

# Debugging aids, left disabled:
# print image + '\n'
# print title + '\n'
# print apod_today_url + '\n'
# print dateheading + '\n'
# print content1 + '\n'
# print content2
# print caption
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment