Created
February 17, 2023 16:39
-
-
Save settwi/a5229ac467f2f18d4c0002be7a53918f to your computer and use it in GitHub Desktop.
Python script to scrape all the FITS files from a Hinode/XRT DARTS directory. I'm using it to get XRT data specifically, but it could probably work with other instruments. Pass it the URL(s) to scrape via command-line arguments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import parse | |
import os | |
import sys | |
# Module-wide switch read by vp(): progress messages are printed only while
# this is truthy.
verbose = True
def vp(*args, **kw):
    """Forward all arguments to ``print`` when the module flag ``verbose`` is truthy."""
    if not verbose:
        return
    print(*args, **kw)
def main():
    """Scrape and download the FITS files for every URL given on the command line."""
    # sys.argv[0] is the script name; everything after it is a URL to process.
    targets = sys.argv[1:]
    for url in targets:
        download(url, scrape_hinode(url))
def scrape_hinode(url: str) -> list[str]:
    """Return the names of all FITS files linked from a directory listing page.

    The page at ``url`` is fetched, decoded as UTF-8 and scanned line by line
    for ``href="<name>.fits"`` attributes.

    Args:
        url: Address of the directory listing to scrape.

    Returns:
        One ``'<name>.fits'`` entry per matching ``href``, in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        UnicodeDecodeError: If the page body is not valid UTF-8.
    """
    pg = requests.get(url)
    # Fail loudly on 4xx/5xx instead of scraping an error page and quietly
    # returning an empty list (download() already checks its responses too).
    pg.raise_for_status()
    txt = pg.content.decode('utf-8')

    vp('start scrape urls')
    parse_str = 'href="{da_name}.fits"'
    ret = []
    for line in txt.split('\n'):
        # Take every match on the line, not only the first one, so listings
        # that put several links on one line are not truncated.
        for res in parse.findall(parse_str, line):
            ret.append(res['da_name'] + '.fits')
    vp('done scrape urls')
    return ret
def download(url: str, fns: list[str]) -> None:
    """Download the named files from a Hinode directory URL into ``hinode-dat/``.

    The local directory mirrors whatever follows the first ``hinode/`` in
    ``url`` (following the Hinode/XRT layout convention), e.g.
    ``.../pub/hinode/xrt/level0/`` is stored under ``hinode-dat/xrt/level0/``.

    Args:
        url: Directory URL that contains the files; must include ``hinode/``.
            A missing trailing slash is tolerated.
        fns: File names relative to ``url``.

    Raises:
        ValueError: If ``url`` has no ``hinode/`` component.
        requests.HTTPError: If a file request returns an error status.
    """
    # Without a trailing slash, ``url + fn`` would glue the file name onto the
    # last path segment and request a non-existent resource.
    base = url if url.endswith('/') else url + '/'

    _, marker, rel = base.partition('hinode/')
    if not marker:
        raise ValueError(f"expected 'hinode/' in URL, got {url!r}")

    out_dir = os.path.join('hinode-dat', rel)
    os.makedirs(out_dir, exist_ok=True)

    for fn in fns:
        # Names come from scraped HTML: keep only the final component so a
        # link containing path separators cannot write outside out_dir.
        local = os.path.join(out_dir, os.path.basename(fn))
        vp('start', local)
        _fetch_to_file(base + fn, local)
        vp('done', local)


def _fetch_to_file(src: str, dest: str, chunk_size: int = 2**16) -> None:
    """Stream the response body of ``src`` into the file ``dest`` in chunks."""
    with requests.get(src, stream=True) as r:
        r.raise_for_status()
        with open(dest, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
# Script entry point. Example URL to pass on the command line:
#   https://data.darts.isas.jaxa.jp/pub/hinode/xrt/level0/2022/11/11/H1100/
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment