Last active
February 11, 2017 22:02
-
-
Save redshiftzero/ff43091d62bbcf126c4519128faef965 to your computer and use it in GitHub Desktop.
Data Refuge SF Bay 2017: Scraper for precipitation data on http://svs.gsfc.nasa.gov/Gallery/GPM.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bs4 as BeautifulSoup | |
import os | |
import pandas as pd | |
import time | |
from tqdm import tqdm | |
from urllib.request import urlopen | |
import werkzeug | |
import wget | |
def create_list_of_urls_to_scrape(index_page):
    """Collect the detail-page links listed on the GPM gallery index.

    Fetches ``index_page`` (http://svs.gsfc.nasa.gov/Gallery/GPM.html),
    pulls every anchor inside the ``section-item`` divs, de-duplicates
    them, persists the list to ``index_links_scraped.csv`` and returns it.

    Args:
        index_page: Full URL of the gallery index page.

    Returns:
        list of str: Unique relative links with any ``#anchor`` stripped.
    """
    # Close the HTTP response deterministically instead of leaking the
    # socket until garbage collection (the original never closed it).
    with urlopen(index_page) as index_html:
        index = BeautifulSoup.BeautifulSoup(index_html, "lxml")
    data_sections = index.find_all('div', attrs={'class': 'section-item'})
    links = []
    for section_item in data_sections:
        for link in section_item.find_all('a'):
            # Split on the '#' character and take the part without anchors
            # because we'll be grabbing everything on the page anyway;
            # stripping anchors also lets set() below de-duplicate properly.
            links.append(link['href'].split('#')[0])
    # Remove duplicate links
    links = list(set(links))
    # Persist the link list so a re-run can resume without re-scraping
    # (see the commented-out read_csv line in main()).
    df = pd.DataFrame({'Link': links})
    df.to_csv('index_links_scraped.csv')
    return links
def create_subdir(folder_name):
    """Create directory *folder_name* (with parents) if it does not exist.

    Uses ``exist_ok=True`` so creation is a single atomic call; the original
    ``os.path.exists`` check followed by ``os.makedirs`` could raise
    FileExistsError if the directory appeared between the two calls.

    Args:
        folder_name: Path of the directory to create.
    """
    os.makedirs(folder_name, exist_ok=True)
def get_data_links_on_page(page_to_scrape):
    """Get the data links we want on an individual detail page,
    e.g. http://svs.gsfc.nasa.gov/4512

    Args:
        page_to_scrape: Full URL of the detail page.

    Returns:
        list of str: Unique hrefs found inside the page's
        ``dropdown-menu`` lists.
    """
    # Close the HTTP response deterministically instead of leaking the
    # socket until garbage collection (the original never closed it).
    with urlopen(page_to_scrape) as detail_html:
        detail = BeautifulSoup.BeautifulSoup(detail_html, "lxml")
    dropdowns = detail.find_all('ul', attrs={'class': 'dropdown-menu'})
    links = []
    for dropdown in dropdowns:
        for link in dropdown.find_all('a'):
            # Some anchors have no href attribute at all; skip those.
            if link.get('href'):
                links.append(link['href'])
    # Remove duplicate links because downloading a big video twice
    # would be annoying af
    links = list(set(links))
    return links
def main():
    """Scrape the NASA SVS GPM gallery and download every linked asset.

    Builds the index of detail pages, then for each page downloads all of
    its dropdown-menu data links into a per-page subdirectory, sleeping
    between downloads to be polite to the server.
    """
    # secure_filename lives in werkzeug.utils; the top-level alias used by
    # the original (werkzeug.secure_filename) was removed in Werkzeug 2.1.
    # Support both layouts so the script runs on old and new versions.
    try:
        from werkzeug.utils import secure_filename
    except ImportError:  # very old Werkzeug
        from werkzeug import secure_filename

    base_url = 'http://svs.gsfc.nasa.gov'
    index_route = '/Gallery/GPM.html'
    index_links = create_list_of_urls_to_scrape(base_url + index_route)
    #index_links = pd.read_csv('index_links_scraped.csv')
    for link in tqdm(index_links):
        folder_name = secure_filename(link)
        create_subdir(folder_name)
        data_links = get_data_links_on_page(base_url + link)
        for data_link in data_links:
            # Download straight into the subdirectory via `out=` rather
            # than os.chdir()-ing in and out: in the original, any failed
            # download skipped the chdir('..') and left the process
            # stranded, nesting every later folder one level too deep.
            wget.download(base_url + data_link, out=folder_name)
            time.sleep(2)  # I'm not a robot at all
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment