Skip to content

Instantly share code, notes, and snippets.

@hrwgc
Last active December 12, 2015 02:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hrwgc/4700491 to your computer and use it in GitHub Desktop.
Create SQLite archive of NASA image of the day blog #Python #BeautifulSoup #SQLite
import scraperwiki
import urllib2, urllib
import re, os
import sqlite3
import uuid
from bs4 import *
import lxml
import unicodedata
from time import sleep
###
# Rebuild the `nasa` table from scratch on every run: drop any previous
# copy, then create the schema.  One row per Image-of-the-Day article,
# keyed by the article URL.
scraperwiki.sqlite.execute('DROP TABLE IF EXISTS nasa;')
scraperwiki.sqlite.execute('CREATE TABLE IF NOT EXISTS nasa (`article_url` TEXT PRIMARY KEY, `article_title` TEXT, `article_date` TEXT, `article_image` TEXT, `image_caption` TEXT, `article_keywords` TEXT, `parent_page_url` TEXT, `instrument_title` TEXT, `article_contents` BLOB, `credit` TEXT );')
###
# Monthly index page; "m=<month>&y=<year>" query params are appended below.
baseUrl = "http://earthobservatory.nasa.gov/IOTD/index.php?"
###
# Walk every monthly index page from January 2001 through December 2013,
# follow each day's article link, and insert one row per article into the
# `nasa` table created above.
year = 2001
while year <= 2013:
    # range(1, 13) yields months 1..12.  The original range(1, 12)
    # stopped at November, silently skipping every December.
    for month in range(1, 13):
        url = baseUrl + "m=" + str(month) + "&y=" + str(year)
        print(url)
        response = urllib2.urlopen(url)
        html = response.read()
        soup = BeautifulSoup(html)
        # Each day's article is an <a> inside the "grid-mid" container
        # of the monthly index page.
        grid = soup.find('div', attrs={"class": "grid-mid"})
        articles = grid.findAll('a')
        for article in articles:
            record = {}
            record['parent_page_url'] = url
            article_url = article.get('href')
            # The link's parent cell text carries the date; normalize it
            # to "Month Day, Year" and drop stray newlines.
            date_str = article.parent.get_text()
            record['article_date'] = re.sub(
                r'^\n{0,}([^ \n]+)[ ]{1,}([^ ,\n]+),*[ ]{1,}([0-9]{4,4})\n.*$',
                r'\1 \2, \3', date_str).replace('\n', '')
            print(record['article_date'])
            # Fetch and parse the article page itself.
            record['article_url'] = "http://earthobservatory.nasa.gov/IOTD/" + article_url
            response = urllib2.urlopen(record['article_url'])
            html = response.read()
            soup = BeautifulSoup(html)
            # str.replace is literal, so the original .replace('\n|\r', '')
            # never matched anything; strip CR/LF with a real regex.
            record['article_title'] = re.sub(r'[\n\r]', '', soup.title.text).replace(' : Image of the Day', '')
            record['article_keywords'] = ""
            # Instrument name appears to live in the first <dd> of the
            # page's <dl> -- TODO confirm against a live page.
            if soup.dd is not None and soup.dl.dd is not None:
                record['instrument_title'] = soup.dl.dd.text
            else:
                record['instrument_title'] = ""
            credit = soup.find('p', attrs={"class": "credit"})
            record['credit'] = credit.text if credit is not None else ""
            record['article_contents'] = soup.find('div', attrs={"class": "stnd-desc globalimages"}).get_text()
            headimage = soup.find('div', attrs={"class": "headimage-detail"})
            record['article_image'] = headimage.img.get('src')
            record['image_caption'] = headimage.img.get('alt')
            try:
                scraperwiki.sqlite.execute(
                    'INSERT OR IGNORE into nasa values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                    (record['article_url'], record['article_title'],
                     record['article_date'], record['article_image'],
                     record['image_caption'], record['article_keywords'],
                     record['parent_page_url'], record['instrument_title'],
                     record['article_contents'], record['credit']))
                scraperwiki.sqlite.commit()
                print(record['article_title'], record['article_date'], "SUCCESS")
            except Exception as e:
                # The original bare `except:` swallowed everything
                # (including KeyboardInterrupt) and printed only "fail";
                # report which article broke and why.
                print("fail", record['article_url'], e)
            # Be polite to the server between article fetches.
            sleep(0.5)
    year += 1
# Post-load housekeeping: dedupe by title via a unique index, refresh the
# query planner's statistics, and persist everything.
scraperwiki.sqlite.execute('CREATE UNIQUE INDEX IF NOT EXISTS uqtitle ON nasa (`article_title`)')
scraperwiki.sqlite.execute('ANALYZE')
scraperwiki.sqlite.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment