sakethramanujam/metadata-download.py

## metadata-download.py
import math
import os
import json
import pandas as pd
import requests
from tqdm import tqdm
import os
import json
import tempfile
from datetime import datetime as dt

STATS_URL = "https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&latest=true"

tempdir = tempfile.gettempdir()
tempfp = os.path.join(tempdir, "percy_metadata_state.json")
existing_file_name = ""


def state_exists(path: str = tempfp):
    """
    Checks if the percy_metadata_state file exists
    in temporary directory
    """
    if not os.path.isfile(path):
        return False
    return True


def create_state(path: str = tempfp):
    """
    Creates a new percy_metadata_state.json file
    in temporary directory
    """
    template = {
        "last_updated": "",
        "n_scraped": ""
    }
    with open(path, "w") as f:
        f.write(json.dumps(template))


def read_state(path: str = tempfp):
    """
    Reads new percy_metadata_state.json file
    in temporary directory and gives out number of
    pages that have already been scraped.
    """
    try:
        file = open(path, 'r')
        data = json.load(file)
        return data
    except Exception as e:
        print(f"Exception occured: {e}")


def update_state(update_dict: dict, path: str = tempfp):
    """
    Takes in a dictionary with
    new number of lines and the timestamp and,
    updates percy_metadata_state.json file.
    """
    with open(path, "w") as f:
        f.write(json.dumps(update_dict))


def give_me_time():
    """
    Creates a timestamp with current system time
    used in metadata updation and filenames
    """
    return dt.strftime(dt.now(), '%Y-%m-%d-%H_%M_%S')


def _checkfromfile(filepath: str):
    """
    When there is no state information is available,
    the script asks for filepath of previously downloaded
    metadata csv file and calculates number of pages
    that have already been scraped and creates a state file.
    """
    if not os.path.isfile(filepath):
        print("Oopsie, you sure the file path is right?")
        return None
    else:
        df = pd.read_csv(filepath)
        n = math.ceil(len(df)/50)
        create_state()
        ts = give_me_time()
        update_state({"last_updated": ts,
                      "n_scraped": n})
        return n


def n_pages(where: str = "url"):
    """
    Finds total number of available pages from
    the NASA/JPL website.
    Calls the check from file method in case we need to know
    the number of pages that have already been scraped
    """
    try:
        if where == "url":
            r = requests.get(STATS_URL)
            stats = r.json()
            n = math.ceil(stats["total"]/50)
            return n
        elif where == "file":
            global existing_file_name
            existing_file_name = input(
                "path(\)filename of previously downloaded metadata: ")
            n = _checkfromfile(existing_file_name)
            if not n:
                n_pages(where="file")
            return n
    except Exception as e:
        print(f"An Error Occured in Finding Number of pages: {e}")


def get_image_list(url: str):
    """
    Makes a get request to the images url
    and extracts image list from it's response.
    """
    try:
        r = requests.get(url)
        image_list = r.json()["images"]
        return image_list
    except Exception as e:
        print(e)


def download_metadata(n_pages: int, filename: str = None):
    """
    Once we have an idea about the number of pages
    that are to be downloaded, this method visits
    each of those pages and gets image list from the
    get_image_list method and creates a pandas dataframe,
    i.e., a table and saves it to a file
    """
    dfs = []
    n = n_pages
    progress_bar = tqdm(range(n))
    for page_num in progress_bar:
        progress_bar.set_description(
            "Downloading metadata from page: %d" % page_num)
        url = f"https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&num=50&page={page_num}&order=sol+desc&&&undefined"
        if page_num > 1:
            url = f"https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&num=50&page={page_num}&order=sol+desc&&&extended="
        il = get_image_list(url)
        dfs.append(pd.json_normalize(il, sep="_"))
    df = pd.concat(dfs)
    fn = filename if filename else f"./{give_me_time()}_metadata.csv"
    df.reset_index(drop=True).to_csv(fn, index=False)
    print(f"Metadata has been downloaded to {fn}")
    return fn


def download_update(n_scraped: int, current: int):
    """
    Technically, an update checker.
    Gets total number of pages on the website and
    current number pages from the state file.
    Compares and calculates the number of pages to be downloaded
    and calls metadata_downloader method.
    """
    if current > n_scraped:
        update_dict = {"last_updated": give_me_time(),
                       "n_scraped": current}
        update_state(update_dict=update_dict)
        print("Seems, Percy has some new data for you!, now downloading...")
        to_download = current-n_scraped
        patch_name = download_metadata(n_pages=to_download)
        merge_with_existing(patch_name=patch_name)
    elif current == n_scraped:
        print("There's nothing new to download at this time, come back later!")


def downloader(where: str = "url"):
    """
    This is where everything happens.

    Calls n_pages with url saying,
    that it needs context of total number of pages

    Checks for state, incase state doesn't exist,
    asks if data has already been downloaded.

    If metadata is downloaded, creates state from file information,
    proceeds to download patch only and asks if it needs to merge.

    Else, downloads the entire metadata

    # TODO:
    - Port this script to percy-image-downloader package.
    - Make state information, logging more robust and clear.

    """
    total = n_pages(where="url")
    current = n_pages(where=where)
    state = state_exists()
    if state:
        n_scraped = read_state()["n_scraped"]
        download_update(n_scraped=n_scraped, current=total)
    else:
        inp = (input("have you previously downloaded the metadata? (y/n): ")).lower()
        if inp == "y":
            downloader(where="file")
        elif inp == "n":
            print("Downloading all of the metadata now!")
            download_metadata(n_pages=total, filename="./full-metadata.csv")


def merge_with_existing(patch_name: str):
    decision = str(input(
        "Do you wish to merge this file with existing full metadata file?: ")).lower()
    if decision == "y":
        global existing_file_name
        if not existing_file_name:
            existing_file_name = input(
                "path(\)filename of previously downloaded metadata: ")
        dfs = [pd.read_csv(patch_name), pd.read_csv(existing_file_name)]
        pd.concat(dfs).reset_index(drop=True).to_csv(
            existing_file_name, index=False)
        print(
            "The changes in metadata have been merged with the existing full metadata file.")
    else:
        print("Thank you Earthling, Have a nice sol!")


if __name__ == "__main__":
    downloader()

## requirements.txt
datetime
requests
pandas
tqdm
	import math
	import os
	import json
	import pandas as pd
	import requests
	from tqdm import tqdm
	import os
	import json
	import tempfile
	from datetime import datetime as dt

	STATS_URL = "https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&latest=true"

	tempdir = tempfile.gettempdir()
	tempfp = os.path.join(tempdir, "percy_metadata_state.json")
	existing_file_name = ""


	def state_exists(path: str = tempfp):
	"""
	Checks if the percy_metadata_state file exists
	in temporary directory
	"""
	if not os.path.isfile(path):
	return False
	return True


	def create_state(path: str = tempfp):
	"""
	Creates a new percy_metadata_state.json file
	in temporary directory
	"""
	template = {
	"last_updated": "",
	"n_scraped": ""
	}
	with open(path, "w") as f:
	f.write(json.dumps(template))


	def read_state(path: str = tempfp):
	"""
	Reads new percy_metadata_state.json file
	in temporary directory and gives out number of
	pages that have already been scraped.
	"""
	try:
	file = open(path, 'r')
	data = json.load(file)
	return data
	except Exception as e:
	print(f"Exception occured: {e}")


	def update_state(update_dict: dict, path: str = tempfp):
	"""
	Takes in a dictionary with
	new number of lines and the timestamp and,
	updates percy_metadata_state.json file.
	"""
	with open(path, "w") as f:
	f.write(json.dumps(update_dict))


	def give_me_time():
	"""
	Creates a timestamp with current system time
	used in metadata updation and filenames
	"""
	return dt.strftime(dt.now(), '%Y-%m-%d-%H_%M_%S')


	def _checkfromfile(filepath: str):
	"""
	When there is no state information is available,
	the script asks for filepath of previously downloaded
	metadata csv file and calculates number of pages
	that have already been scraped and creates a state file.
	"""
	if not os.path.isfile(filepath):
	print("Oopsie, you sure the file path is right?")
	return None
	else:
	df = pd.read_csv(filepath)
	n = math.ceil(len(df)/50)
	create_state()
	ts = give_me_time()
	update_state({"last_updated": ts,
	"n_scraped": n})
	return n


	def n_pages(where: str = "url"):
	"""
	Finds total number of available pages from
	the NASA/JPL website.
	Calls the check from file method in case we need to know
	the number of pages that have already been scraped
	"""
	try:
	if where == "url":
	r = requests.get(STATS_URL)
	stats = r.json()
	n = math.ceil(stats["total"]/50)
	return n
	elif where == "file":
	global existing_file_name
	existing_file_name = input(
	"path(\)filename of previously downloaded metadata: ")
	n = _checkfromfile(existing_file_name)
	if not n:
	n_pages(where="file")
	return n
	except Exception as e:
	print(f"An Error Occured in Finding Number of pages: {e}")


	def get_image_list(url: str):
	"""
	Makes a get request to the images url
	and extracts image list from it's response.
	"""
	try:
	r = requests.get(url)
	image_list = r.json()["images"]
	return image_list
	except Exception as e:
	print(e)


	def download_metadata(n_pages: int, filename: str = None):
	"""
	Once we have an idea about the number of pages
	that are to be downloaded, this method visits
	each of those pages and gets image list from the
	get_image_list method and creates a pandas dataframe,
	i.e., a table and saves it to a file
	"""
	dfs = []
	n = n_pages
	progress_bar = tqdm(range(n))
	for page_num in progress_bar:
	progress_bar.set_description(
	"Downloading metadata from page: %d" % page_num)
	url = f"https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&num=50&page={page_num}&order=sol+desc&&&undefined"
	if page_num > 1:
	url = f"https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&num=50&page={page_num}&order=sol+desc&&&extended="
	il = get_image_list(url)
	dfs.append(pd.json_normalize(il, sep="_"))
	df = pd.concat(dfs)
	fn = filename if filename else f"./{give_me_time()}_metadata.csv"
	df.reset_index(drop=True).to_csv(fn, index=False)
	print(f"Metadata has been downloaded to {fn}")
	return fn


	def download_update(n_scraped: int, current: int):
	"""
	Technically, an update checker.
	Gets total number of pages on the website and
	current number pages from the state file.
	Compares and calculates the number of pages to be downloaded
	and calls metadata_downloader method.
	"""
	if current > n_scraped:
	update_dict = {"last_updated": give_me_time(),
	"n_scraped": current}
	update_state(update_dict=update_dict)
	print("Seems, Percy has some new data for you!, now downloading...")
	to_download = current-n_scraped
	patch_name = download_metadata(n_pages=to_download)
	merge_with_existing(patch_name=patch_name)
	elif current == n_scraped:
	print("There's nothing new to download at this time, come back later!")


	def downloader(where: str = "url"):
	"""
	This is where everything happens.

	Calls n_pages with url saying,
	that it needs context of total number of pages

	Checks for state, incase state doesn't exist,
	asks if data has already been downloaded.

	If metadata is downloaded, creates state from file information,
	proceeds to download patch only and asks if it needs to merge.

	Else, downloads the entire metadata

	# TODO:
	- Port this script to percy-image-downloader package.
	- Make state information, logging more robust and clear.

	"""
	total = n_pages(where="url")
	current = n_pages(where=where)
	state = state_exists()
	if state:
	n_scraped = read_state()["n_scraped"]
	download_update(n_scraped=n_scraped, current=total)
	else:
	inp = (input("have you previously downloaded the metadata? (y/n): ")).lower()
	if inp == "y":
	downloader(where="file")
	elif inp == "n":
	print("Downloading all of the metadata now!")
	download_metadata(n_pages=total, filename="./full-metadata.csv")


	def merge_with_existing(patch_name: str):
	decision = str(input(
	"Do you wish to merge this file with existing full metadata file?: ")).lower()
	if decision == "y":
	global existing_file_name
	if not existing_file_name:
	existing_file_name = input(
	"path(\)filename of previously downloaded metadata: ")
	dfs = [pd.read_csv(patch_name), pd.read_csv(existing_file_name)]
	pd.concat(dfs).reset_index(drop=True).to_csv(
	existing_file_name, index=False)
	print(
	"The changes in metadata have been merged with the existing full metadata file.")
	else:
	print("Thank you Earthling, Have a nice sol!")


	if __name__ == "__main__":
	downloader()