Last active
February 11, 2017 22:02
-
-
Save redshiftzero/ff43091d62bbcf126c4519128faef965 to your computer and use it in GitHub Desktop.
Data Refuge SF Bay 2017: Scraper for precipitation data on http://svs.gsfc.nasa.gov/Gallery/GPM.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bs4 as BeautifulSoup | |
import os | |
import pandas as pd | |
import time | |
from tqdm import tqdm | |
from urllib.request import urlopen | |
import werkzeug | |
import wget | |
def create_list_of_urls_to_scrape(index_page):
    """Collect the detail-page links listed on the GPM gallery index.

    Fetches ``index_page`` (http://svs.gsfc.nasa.gov/Gallery/GPM.html),
    pulls every anchor inside the ``section-item`` divs, de-duplicates
    them, persists the list to ``index_links_scraped.csv`` and returns it.

    Args:
        index_page: Full URL of the gallery index page.

    Returns:
        list of str: Unique relative links with any ``#anchor`` stripped.
    """
    # Close the HTTP response deterministically instead of leaking the
    # socket until garbage collection (the original never closed it).
    with urlopen(index_page) as index_html:
        index = BeautifulSoup.BeautifulSoup(index_html, "lxml")
    data_sections = index.find_all('div', attrs={'class': 'section-item'})
    links = []
    for section_item in data_sections:
        for link in section_item.find_all('a'):
            # Split on the '#' character and take the part without anchors
            # because we'll be grabbing everything on the page anyway;
            # stripping anchors also lets set() below de-duplicate properly.
            links.append(link['href'].split('#')[0])
    # Remove duplicate links
    links = list(set(links))
    # Persist the link list so a re-run can resume without re-scraping
    # (see the commented-out read_csv line in main()).
    df = pd.DataFrame({'Link': links})
    df.to_csv('index_links_scraped.csv')
    return links
def create_subdir(folder_name):
    """Create directory *folder_name* (with parents) if it does not exist.

    Uses ``exist_ok=True`` so creation is a single atomic call; the original
    ``os.path.exists`` check followed by ``os.makedirs`` could raise
    FileExistsError if the directory appeared between the two calls.

    Args:
        folder_name: Path of the directory to create.
    """
    os.makedirs(folder_name, exist_ok=True)
def get_data_links_on_page(page_to_scrape):
    """Get the data links we want on an individual detail page,
    e.g. http://svs.gsfc.nasa.gov/4512

    Args:
        page_to_scrape: Full URL of the detail page.

    Returns:
        list of str: Unique hrefs found inside the page's
        ``dropdown-menu`` lists.
    """
    # Close the HTTP response deterministically instead of leaking the
    # socket until garbage collection (the original never closed it).
    with urlopen(page_to_scrape) as detail_html:
        detail = BeautifulSoup.BeautifulSoup(detail_html, "lxml")
    dropdowns = detail.find_all('ul', attrs={'class': 'dropdown-menu'})
    links = []
    for dropdown in dropdowns:
        for link in dropdown.find_all('a'):
            # Some anchors have no href attribute at all; skip those.
            if link.get('href'):
                links.append(link['href'])
    # Remove duplicate links because downloading a big video twice
    # would be annoying af
    links = list(set(links))
    return links
def main():
    """Scrape the NASA SVS GPM gallery and download every linked asset.

    Builds the index of detail pages, then for each page downloads all of
    its dropdown-menu data links into a per-page subdirectory, sleeping
    between downloads to be polite to the server.
    """
    # secure_filename lives in werkzeug.utils; the top-level alias used by
    # the original (werkzeug.secure_filename) was removed in Werkzeug 2.1.
    # Support both layouts so the script runs on old and new versions.
    try:
        from werkzeug.utils import secure_filename
    except ImportError:  # very old Werkzeug
        from werkzeug import secure_filename

    base_url = 'http://svs.gsfc.nasa.gov'
    index_route = '/Gallery/GPM.html'
    index_links = create_list_of_urls_to_scrape(base_url + index_route)
    #index_links = pd.read_csv('index_links_scraped.csv')
    for link in tqdm(index_links):
        folder_name = secure_filename(link)
        create_subdir(folder_name)
        data_links = get_data_links_on_page(base_url + link)
        for data_link in data_links:
            # Download straight into the subdirectory via `out=` rather
            # than os.chdir()-ing in and out: in the original, any failed
            # download skipped the chdir('..') and left the process
            # stranded, nesting every later folder one level too deep.
            wget.download(base_url + data_link, out=folder_name)
            time.sleep(2)  # I'm not a robot at all
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment