@sakethramanujam
Last active March 11, 2021 18:48
Percy Metadata Downloader in Python
import math
import os
import json
import tempfile
from datetime import datetime as dt

import pandas as pd
import requests
from tqdm import tqdm

STATS_URL = "https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&latest=true"
tempdir = tempfile.gettempdir()
tempfp = os.path.join(tempdir, "percy_metadata_state.json")
existing_file_name = ""

def state_exists(path: str = tempfp):
    """
    Checks if the percy_metadata_state.json file exists
    in the temporary directory.
    """
    return os.path.isfile(path)

def create_state(path: str = tempfp):
    """
    Creates a new percy_metadata_state.json file
    in the temporary directory.
    """
    template = {
        "last_updated": "",
        "n_scraped": ""
    }
    with open(path, "w") as f:
        f.write(json.dumps(template))

def read_state(path: str = tempfp):
    """
    Reads the percy_metadata_state.json file from the
    temporary directory and returns the saved state,
    including the number of pages already scraped.
    """
    try:
        with open(path, "r") as f:
            return json.load(f)
    except Exception as e:
        print(f"Exception occurred: {e}")

def update_state(update_dict: dict, path: str = tempfp):
    """
    Takes a dictionary with the new number of scraped pages
    and a timestamp, and updates the percy_metadata_state.json file.
    """
    with open(path, "w") as f:
        f.write(json.dumps(update_dict))

def give_me_time():
    """
    Creates a timestamp from the current system time,
    used when updating the state file and in filenames.
    """
    return dt.strftime(dt.now(), '%Y-%m-%d-%H_%M_%S')

def _checkfromfile(filepath: str):
    """
    When no state information is available, the script asks
    for the path of a previously downloaded metadata CSV file,
    calculates the number of pages that have already been
    scraped, and creates a state file.
    """
    if not os.path.isfile(filepath):
        print("Oopsie, you sure the file path is right?")
        return None
    else:
        df = pd.read_csv(filepath)
        n = math.ceil(len(df) / 50)
        create_state()
        ts = give_me_time()
        update_state({"last_updated": ts,
                      "n_scraped": n})
        return n

def n_pages(where: str = "url"):
    """
    Finds the total number of available pages from the
    NASA/JPL website. Calls _checkfromfile in case we need to
    know the number of pages that have already been scraped.
    """
    try:
        if where == "url":
            r = requests.get(STATS_URL)
            stats = r.json()
            n = math.ceil(stats["total"] / 50)
            return n
        elif where == "file":
            global existing_file_name
            existing_file_name = input(
                "path/filename of previously downloaded metadata: ")
            n = _checkfromfile(existing_file_name)
            if not n:
                # Ask again until a valid file path is given
                return n_pages(where="file")
            return n
    except Exception as e:
        print(f"An error occurred in finding the number of pages: {e}")

def get_image_list(url: str):
    """
    Makes a GET request to the images URL
    and extracts the image list from its response.
    """
    try:
        r = requests.get(url)
        image_list = r.json()["images"]
        return image_list
    except Exception as e:
        print(e)

def download_metadata(n_pages: int, filename: str = None):
    """
    Once we know the number of pages to be downloaded, this
    method visits each of those pages, gets the image list from
    get_image_list, builds a pandas DataFrame (i.e., a table)
    and saves it to a file.
    """
    dfs = []
    n = n_pages
    progress_bar = tqdm(range(n))
    for page_num in progress_bar:
        progress_bar.set_description(
            "Downloading metadata from page: %d" % page_num)
        url = f"https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&num=50&page={page_num}&order=sol+desc&&&undefined"
        if page_num > 1:
            url = f"https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&num=50&page={page_num}&order=sol+desc&&&extended="
        il = get_image_list(url)
        dfs.append(pd.json_normalize(il, sep="_"))
    df = pd.concat(dfs)
    fn = filename if filename else f"./{give_me_time()}_metadata.csv"
    df.reset_index(drop=True).to_csv(fn, index=False)
    print(f"Metadata has been downloaded to {fn}")
    return fn

def download_update(n_scraped: int, current: int):
    """
    Technically, an update checker. Gets the total number of
    pages on the website and the current number of pages from
    the state file, compares them to calculate how many pages
    still need to be downloaded, and calls download_metadata.
    """
    if current > n_scraped:
        update_dict = {"last_updated": give_me_time(),
                       "n_scraped": current}
        update_state(update_dict=update_dict)
        print("Seems Percy has some new data for you! Now downloading...")
        to_download = current - n_scraped
        patch_name = download_metadata(n_pages=to_download)
        merge_with_existing(patch_name=patch_name)
    elif current == n_scraped:
        print("There's nothing new to download at this time, come back later!")

def downloader(where: str = "url"):
    """
    This is where everything happens.
    Calls n_pages with "url" to get the total number of pages.
    Checks for state; in case state doesn't exist, asks whether
    the data has already been downloaded. If metadata has been
    downloaded, creates state from the file information, downloads
    only the new patch and asks whether to merge.
    Otherwise, downloads the entire metadata.
    # TODO:
    # - Port this script to the percy-image-downloader package.
    # - Make state information and logging more robust and clear.
    """
    total = n_pages(where="url")
    current = n_pages(where=where)
    state = state_exists()
    if state:
        n_scraped = read_state()["n_scraped"]
        download_update(n_scraped=n_scraped, current=total)
    else:
        inp = (input("have you previously downloaded the metadata? (y/n): ")).lower()
        if inp == "y":
            downloader(where="file")
        elif inp == "n":
            print("Downloading all of the metadata now!")
            download_metadata(n_pages=total, filename="./full-metadata.csv")

def merge_with_existing(patch_name: str):
    """
    Asks whether the freshly downloaded patch should be merged
    with the existing full metadata file and, if so, appends the
    patch to it.
    """
    decision = str(input(
        "Do you wish to merge this file with the existing full metadata file? (y/n): ")).lower()
    if decision == "y":
        global existing_file_name
        if not existing_file_name:
            existing_file_name = input(
                "path/filename of previously downloaded metadata: ")
        dfs = [pd.read_csv(patch_name), pd.read_csv(existing_file_name)]
        pd.concat(dfs).reset_index(drop=True).to_csv(
            existing_file_name, index=False)
        print(
            "The changes in metadata have been merged with the existing full metadata file.")
    else:
        print("Thank you Earthling, have a nice sol!")

if __name__ == "__main__":
    downloader()

Dependencies:
datetime
requests
pandas
tqdm
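
If the functions are used from another script rather than run interactively, a minimal usage sketch could look like the following. The module name percy_metadata is an assumption, not given in the gist; save the file under whatever name you prefer and adjust the import. Note that datetime is part of the Python standard library, so only requests, pandas and tqdm need to be installed.

# A minimal sketch, assuming the gist is saved as percy_metadata.py
from percy_metadata import n_pages, download_metadata

total = n_pages(where="url")  # total number of 50-image pages currently listed by the API
# Download every page and write the combined table to a single CSV file
download_metadata(n_pages=total, filename="./full-metadata.csv")

Running the file directly (python percy_metadata.py) instead goes through downloader(), which prompts for any input it needs.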

sakethramanujam (Author) commented Mar 11, 2021

Hello again! I have slightly updated the script to allow merging a newly downloaded patch with the existing large file, and I have added comments to each of the methods. Please let me know if I need to write more granular comments.

I will be adding command-line flags soon that will reduce the user's interaction with the script. (Argh, this coursework!)
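
One possible shape for those flags, as a rough argparse sketch; the flag names and defaults below are assumptions, not part of the current script:

import argparse

def parse_args():
    # Hypothetical flags; the actual interface may differ once implemented
    parser = argparse.ArgumentParser(description="Percy metadata downloader")
    parser.add_argument("--existing", default="",
                        help="path to a previously downloaded metadata CSV")
    parser.add_argument("--output", default="./full-metadata.csv",
                        help="where to write the combined metadata CSV")
    parser.add_argument("--merge", action="store_true",
                        help="merge a fresh patch into the existing file without prompting")
    return parser.parse_args()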

[EDIT]

I currently feel that this code is more spaghetti than organised; I will try to make it clean and readable at the earliest.
