import math
import os
import json
import tempfile
from datetime import datetime as dt

import pandas as pd
import requests
from tqdm import tqdm

STATS_URL = "https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&latest=true"

tempdir = tempfile.gettempdir()
tempfp = os.path.join(tempdir, "percy_metadata_state.json")
existing_file_name = ""

def state_exists(path: str = tempfp):
    """
    Checks whether the percy_metadata_state.json file
    exists in the temporary directory.
    """
    return os.path.isfile(path)

def create_state(path: str = tempfp):
    """
    Creates a new percy_metadata_state.json file
    in the temporary directory.
    """
    template = {
        "last_updated": "",
        "n_scraped": ""
    }
    with open(path, "w") as f:
        f.write(json.dumps(template))

def read_state(path: str = tempfp):
    """
    Reads the percy_metadata_state.json file in the
    temporary directory and returns the stored state,
    i.e. the last update time and the number of pages
    that have already been scraped.
    """
    try:
        with open(path, "r") as f:
            return json.load(f)
    except Exception as e:
        print(f"Exception occurred: {e}")

def update_state(update_dict: dict, path: str = tempfp):
    """
    Takes a dictionary with the new number of scraped
    pages and a timestamp, and overwrites the
    percy_metadata_state.json file with it.
    """
    with open(path, "w") as f:
        f.write(json.dumps(update_dict))

def give_me_time():
    """
    Creates a timestamp from the current system time,
    used for state updates and output filenames.
    """
    return dt.strftime(dt.now(), '%Y-%m-%d-%H_%M_%S')

def _checkfromfile(filepath: str):
    """
    When no state information is available, the script
    asks for the path of a previously downloaded metadata
    CSV file, calculates the number of pages that have
    already been scraped and creates a state file.
    """
    if not os.path.isfile(filepath):
        print("Oopsie, you sure the file path is right?")
        return None
    else:
        df = pd.read_csv(filepath)
        n = math.ceil(len(df) / 50)
        create_state()
        ts = give_me_time()
        update_state({"last_updated": ts,
                      "n_scraped": n})
        return n

def n_pages(where: str = "url"):
    """
    Finds the total number of available pages on
    the NASA/JPL website.
    Calls _checkfromfile in case we need to know the
    number of pages that have already been scraped.
    """
    try:
        if where == "url":
            r = requests.get(STATS_URL)
            stats = r.json()
            n = math.ceil(stats["total"] / 50)
            return n
        elif where == "file":
            global existing_file_name
            existing_file_name = input(
                "path\\filename of previously downloaded metadata: ")
            n = _checkfromfile(existing_file_name)
            if not n:
                return n_pages(where="file")
            return n
    except Exception as e:
        print(f"An error occurred in finding the number of pages: {e}")

def get_image_list(url: str):
    """
    Makes a GET request to the images URL
    and extracts the image list from its response.
    """
    try:
        r = requests.get(url)
        image_list = r.json()["images"]
        return image_list
    except Exception as e:
        print(e)

def download_metadata(n_pages: int, filename: str = None):
    """
    Once we know how many pages are to be downloaded,
    this method visits each of those pages, gets the
    image list via get_image_list, builds a pandas
    dataframe (i.e. a table) and saves it to a CSV file.
    """
    dfs = []
    n = n_pages
    progress_bar = tqdm(range(n))
    for page_num in progress_bar:
        progress_bar.set_description(
            "Downloading metadata from page: %d" % page_num)
        url = f"https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&num=50&page={page_num}&order=sol+desc&&&undefined"
        if page_num > 1:
            url = f"https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&num=50&page={page_num}&order=sol+desc&&&extended="
        il = get_image_list(url)
        dfs.append(pd.json_normalize(il, sep="_"))
    df = pd.concat(dfs)
    fn = filename if filename else f"./{give_me_time()}_metadata.csv"
    df.reset_index(drop=True).to_csv(fn, index=False)
    print(f"Metadata has been downloaded to {fn}")
    return fn

def download_update(n_scraped: int, current: int):
    """
    Technically, an update checker.
    Gets the total number of pages on the website and
    the number of pages already scraped from the state file.
    Compares the two, calculates the number of pages still
    to be downloaded and calls download_metadata.
    """
    if current > n_scraped:
        update_dict = {"last_updated": give_me_time(),
                       "n_scraped": current}
        update_state(update_dict=update_dict)
        print("Seems Percy has some new data for you! Now downloading...")
        to_download = current - n_scraped
        patch_name = download_metadata(n_pages=to_download)
        merge_with_existing(patch_name=patch_name)
    elif current == n_scraped:
        print("There's nothing new to download at this time, come back later!")

def downloader(where: str = "url"):
    """
    This is where everything happens.
    Calls n_pages with "url" to get the total number of
    pages available on the website, then checks for state.
    In case no state exists, asks whether the metadata has
    already been downloaded. If it has, creates state from
    the existing file, downloads only the new patch and asks
    whether to merge it. Otherwise, downloads the entire metadata.
    # TODO:
    - Port this script to the percy-image-downloader package.
    - Make state information and logging more robust and clear.
    """
    total = n_pages(where="url")
    current = n_pages(where=where)
    state = state_exists()
    if state:
        n_scraped = read_state()["n_scraped"]
        download_update(n_scraped=n_scraped, current=total)
    else:
        inp = input("have you previously downloaded the metadata? (y/n): ").lower()
        if inp == "y":
            downloader(where="file")
        elif inp == "n":
            print("Downloading all of the metadata now!")
            download_metadata(n_pages=total, filename="./full-metadata.csv")

def merge_with_existing(patch_name: str):
    """
    Optionally merges a freshly downloaded patch file
    with the existing full metadata file.
    """
    decision = input(
        "Do you wish to merge this file with the existing full metadata file? (y/n): ").lower()
    if decision == "y":
        global existing_file_name
        if not existing_file_name:
            existing_file_name = input(
                "path\\filename of previously downloaded metadata: ")
        dfs = [pd.read_csv(patch_name), pd.read_csv(existing_file_name)]
        pd.concat(dfs).reset_index(drop=True).to_csv(
            existing_file_name, index=False)
        print(
            "The changes in metadata have been merged with the existing full metadata file.")
    else:
        print("Thank you Earthling, Have a nice sol!")


if __name__ == "__main__":
    downloader()

requirements.txt:
datetime
requests
pandas
tqdm
Hi! Thanks so much for this script; I have been using it to retrieve metadata from which I'm building some of my own browse pages at
http://lakdawalla.com/emily/images/m2020/
It works great.
Over time, it will take longer and longer to scrape every page. How could this script be modified to scrape only those pages that are new or have been modified since the previous time the script was run? (Pardon if this is an extremely naïve question; I've learned and forgotten the rudiments of several programming languages in my career and I don't know python at all.) The value in the field "date received" could be used as a guide, but I am not sure how much lag there is between the photos being received on Earth and posted to the raw website. The best thing to do might be to keep track somehow of the number of pages that were scraped in the last update, and to re-scrape the last of those plus any new ones when the script is run again.
Hello Emily! I'm feeling starstruck right now (Fanboy moment)! I'm glad this script is of some help.
I came up with this rudimentary script to start off with. Your hypothesis that scraping gets longer over time is correct.
I actually caught a break and am working on an update to be published over the weekend.
It shall allow downloading metadata in two forms:
- latest only.
- all of it.
I am currently exploring strategies for keeping track of the pages already scraped. I will post an update both here and on Twitter / let you know when I publish the script. I will also look into using the date_received
field and see if I can work something out with it. Thanks for the suggestion.
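For reference, a date-based filter on top of the CSVs could look roughly like the sketch below. It is only an illustration: the date_received column name is assumed, and the exact name produced by pd.json_normalize in the script above may differ.

import pandas as pd

def rows_newer_than_existing(existing_csv: str, patch_csv: str,
                             col: str = "date_received"):
    # Keep only patch rows received after the newest row we already have.
    old = pd.read_csv(existing_csv)
    new = pd.read_csv(patch_csv)
    cutoff = pd.to_datetime(old[col]).max()
    return new[pd.to_datetime(new[col]) > cutoff]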
[EDIT]
I have a question for you: would you like the latest-only data to be in a separate file, or should it be appended to the existing one?
I went back to modifying the script and just realised that, to test this feature thoroughly, there needs to be at least one update on the downlink.
I will keep this thread active.
The script update is now live. Please give it a try and let me know if you have any feedback.
TLDR: I will comment the script and revise this gist. Ultimately this script will become a part of the percy-image-downloader package.
It works! Comments:
- I was initially confused by "path to file you downloaded previously" because I have not been moving the CSVs out of the folder they are downloaded into (probably a messy habit on my part), so all I needed to supply was the filename of the previously downloaded file. Maybe just edit this to "path\filename of previously downloaded metadata".
- Also I think it would be good to offer users the option to go back to the previous question if they decide, on second thought, that they want to do a full scrape. You might offer that by saying "or type 'none' to download all metadata".
- The next thing you will need to deal with is the fact that sometimes they command a second downlink of the same file, if the first one had some issue, like a data gap or something. It would be super if you could compare the imageids from the newly downloaded metadata to those in the previously downloaded metadata to see if there are any duplicates, and write a line like "WARNING! ImageID [imageID] has been replaced with a new file." either to the screen, to a log, or both (see the sketch after this list).
- I personally like the fact that the update only shows me new data; however, users may wish to append the new data to the original. You might offer that as an option or a command-line switch.
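A minimal sketch of the duplicate check from the third point, assuming both CSVs share an imageid column (the real column name in the metadata may differ):

import pandas as pd

def warn_on_replaced_images(existing_csv: str, patch_csv: str,
                            id_col: str = "imageid"):
    # An image id that appears in both files suggests a re-downlinked (replaced) image.
    old_ids = set(pd.read_csv(existing_csv)[id_col])
    new_ids = set(pd.read_csv(patch_csv)[id_col])
    for image_id in sorted(old_ids & new_ids):
        print(f"WARNING! ImageID {image_id} has been replaced with a new file.")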
This is super, thank you for your work. I have various ideas for next steps -- I really really need to learn python!
Thank you 😄 for your kind words and comprehensive feedback!
I was initially confused by "path to file you downloaded previously" because I have not been moving the CSVs out of the folder they are ...
- I have a bad habit of writing print/user-info statements like the one I did. I shall use the line as suggested, or please suggest any other message that you think makes more sense. I liked "path\filename of previously downloaded metadata".
Also I think it would be good to offer the users the option to go back to the previous question if they decide, on second thought, that they
- Sure, I will add an additional option to the script to allow a full scrape in case they want one.
The next thing you will need to deal with is the fact that sometimes they command a second downlink of the same file
- Oh, I did not know this; I need to think about what to do in this case. Currently the script manages state by creating a file called percy_metadata_state.json in your temporary files directory (visit /tmp on your Linux machine, or press Windows+R and enter %temp%, to see this file). If that file is deleted, the script will either ask for the existing metadata file or allow a full scrape. If I were to allow going back to a previous revision, I believe I should log every time the user requests an update (the script currently just replaces what's already there; appending should do the work), and in case they want to redo a particular patch, I could then offer something like a snapshot list and options to repeat one. A rough sketch of that append-only state log is below.
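As a sketch only (the snapshots key and structure are made up for illustration, not what the script currently writes):

import json
import os

def append_state_snapshot(n_scraped: int, timestamp: str,
                          path: str = "percy_metadata_state.json"):
    # Append a snapshot instead of overwriting, so earlier states can be replayed.
    state = {"snapshots": []}
    if os.path.isfile(path):
        with open(path) as f:
            state = json.load(f)
        state.setdefault("snapshots", [])
    state["snapshots"].append({"last_updated": timestamp, "n_scraped": n_scraped})
    with open(path, "w") as f:
        json.dump(state, f, indent=2)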
I personally like the fact that the update only shows me new data;
- Merging files is a linear-time operation and would be simple if the script has knowledge of the existing files, especially their filenames. I will surely implement that switch in the next couple of days.
[EDIT]
Also, I am rewriting parts of this code to publish a Python package that currently allows downloading images by page number.
Your insights were helpful in bringing this script into better shape. If you have some time to check that package out and post your comments/feature requests, it would be great!
If you were to ask me the order in which things would be implemented, I would say 1, 2, 4 and then 3. I will keep you posted.
🌠
There are amazing resources to learn the language; I will share a list soon if you want me to.
Hello again! I have slightly updated the script to allow merging a newly downloaded patch with the existing large file, and added comments to each of the methods. Please let me know in case I need to write more granular comments.
I will be adding command line flags soon that will reduce the user's interaction with the script, something along the lines of the sketch below. (Arghhhhhh, this course work!)
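A first pass at those flags might look like this; the flag names (--full, --merge, --existing) are placeholders rather than a final interface:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(
        description="Perseverance raw-image metadata downloader")
    parser.add_argument("--full", action="store_true",
                        help="download the entire metadata instead of only new pages")
    parser.add_argument("--merge", action="store_true",
                        help="append a newly downloaded patch to the existing metadata file")
    parser.add_argument("--existing", default=None,
                        help="path/filename of previously downloaded metadata")
    return parser.parse_args()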
[EDIT]
I am currently feeling that this code is more spaghetti than it is organised; I will try to make it clean and readable at the earliest.
One can install the Python requirements by downloading the requirements.txt file and typing pip3 install -r requirements.txt from the path where the requirements file was downloaded.