@redshiftzero
Last active February 11, 2017 22:02
Data Refuge SF Bay 2017: Scraper for precipitation data on http://svs.gsfc.nasa.gov/Gallery/GPM.html
from bs4 import BeautifulSoup
import os
import pandas as pd
import time
from tqdm import tqdm
from urllib.request import urlopen
from werkzeug.utils import secure_filename
import wget


def create_list_of_urls_to_scrape(index_page):
    """Get all the links to the detail pages we want to grab data from,
    starting from the index page http://svs.gsfc.nasa.gov/Gallery/GPM.html
    """
    index_html = urlopen(index_page)
    index = BeautifulSoup(index_html, "lxml")
    data_sections = index.find_all('div', attrs={'class': 'section-item'})

    links = []
    for section_item in data_sections:
        for link in section_item.find_all('a'):
            # Split on the '#' character and take the part without anchors
            # because we'll be grabbing everything on the page anyway
            links.append(link['href'].split('#')[0])

    # Remove duplicate links
    links = list(set(links))

    # Save the scraped index so a later run can reuse it without hitting
    # the gallery page again
    df = pd.DataFrame({'Link': links})
    df.to_csv('index_links_scraped.csv')
    return links


def create_subdir(folder_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)


def get_data_links_on_page(page_to_scrape):
    """Get the data links we want on an individual detail page,
    e.g. http://svs.gsfc.nasa.gov/4512
    """
    detail_html = urlopen(page_to_scrape)
    detail = BeautifulSoup(detail_html, "lxml")
    dropdowns = detail.find_all('ul', attrs={'class': 'dropdown-menu'})

    links = []
    for dropdown in dropdowns:
        for link in dropdown.find_all('a'):
            if link.get('href'):
                links.append(link['href'])

    # Remove duplicate links because downloading a big video twice
    # would be annoying af
    links = list(set(links))
    return links


def main():
    base_url = 'http://svs.gsfc.nasa.gov'
    index_route = '/Gallery/GPM.html'
    index_links = create_list_of_urls_to_scrape(base_url + index_route)
    # index_links = pd.read_csv('index_links_scraped.csv')

    for link in tqdm(index_links):
        # Turn the URL path into a safe directory name for this page's files
        folder_name = secure_filename(link)
        create_subdir(folder_name)
        data_links = get_data_links_on_page(base_url + link)

        os.chdir(folder_name)
        for data_link in data_links:
            wget.download(base_url + data_link)
            time.sleep(2)  # I'm not a robot at all
        os.chdir('..')


if __name__ == '__main__':
    main()
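
A note on the commented-out pd.read_csv line in main(): read_csv returns a DataFrame, so iterating over it directly would loop over column names rather than URL paths. A minimal sketch of reloading the saved index correctly, assuming index_links_scraped.csv was written by create_list_of_urls_to_scrape above:

import pandas as pd

# Read back the index written by create_list_of_urls_to_scrape and pull out
# the 'Link' column as a plain Python list, so the loop in main() iterates
# over URL paths instead of over DataFrame column names.
index_links = pd.read_csv('index_links_scraped.csv')['Link'].tolist()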