Skip to content

Instantly share code, notes, and snippets.

@settwi
Created February 17, 2023 16:39
Show Gist options
  • Save settwi/a5229ac467f2f18d4c0002be7a53918f to your computer and use it in GitHub Desktop.
Save settwi/a5229ac467f2f18d4c0002be7a53918f to your computer and use it in GitHub Desktop.
Python script to scrape all the FITS files from a Hinode/XRT DARTS directory. I'm using it to get XRT data specifically, but it could probably work with other instruments. Pass it the URL(s) to scrape via command-line arguments.
import os
import re
import sys

import parse
import requests
# Module-wide switch consulted by vp(); set to False to silence progress output.
verbose = True


def vp(*args, **kw):
    """Forward the arguments to ``print`` only while ``verbose`` is truthy."""
    if not verbose:
        return
    print(*args, **kw)
def main():
    """Scrape and download each directory URL given on the command line.

    Every argument after the program name is treated as a directory URL:
    its listing is scraped for file names, which are then downloaded.
    """
    targets = sys.argv[1:]
    for target in targets:
        download(target, scrape_hinode(target))
# Matches href="<name>.fits" and captures the complete file name.  The name
# may not contain a double quote or a line break, so a match can never run
# past the end of its own attribute value into neighbouring markup.
_FITS_HREF = re.compile(r'href="([^"\r\n]+\.fits)"', re.IGNORECASE)


def scrape_hinode(url: str) -> list[str]:
    """Return the names of the ``.fits`` files linked from the page at *url*.

    Args:
        url: Address of a directory-listing page to fetch.

    Returns:
        The link targets ending in ``.fits`` exactly as they appear in the
        page, in page order, with repeated links collapsed to one entry.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    pg = requests.get(url)
    # Fail loudly on 4xx/5xx instead of scraping an error page and silently
    # returning an empty list.
    pg.raise_for_status()
    # Only the (ASCII) link targets matter, so a stray undecodable byte
    # elsewhere in the page should not abort the scrape.
    txt = pg.content.decode('utf-8', errors='replace')
    vp('start scrape urls')
    ret = _extract_fits_names(txt)
    vp('done scrape urls')
    return ret


def _extract_fits_names(html: str) -> list[str]:
    """Return every distinct ``.fits`` link target in *html*, in page order."""
    # dict.fromkeys drops duplicates while preserving first-seen order, so a
    # file linked twice (e.g. icon and name) is not downloaded twice.
    return list(dict.fromkeys(_FITS_HREF.findall(html)))
def download(url: str, fns: list[str]) -> None:
    """Download every file named in *fns* from the directory at *url*.

    Files are stored under ``hinode-dat/`` followed by the part of the URL
    after ``hinode/``; the directory is created if it does not exist.

    Args:
        url: Directory URL the files live in, with or without a trailing
            slash.
        fns: File names relative to *url*.

    Raises:
        IndexError: If *url* contains no ``hinode/`` segment.
        requests.HTTPError: If a file request returns an error status.
    """
    # Plain concatenation with a file name only yields a valid URL when the
    # directory URL ends in '/'.
    base = url if url.endswith('/') else url + '/'

    # follow Hinode/XRT convention
    out_dir = 'hinode-dat/' + base.split('hinode/')[1]
    os.makedirs(out_dir, exist_ok=True)

    chunk_size = 2**16
    for fn in fns:
        # Names come from a scraped page: keep only the last path component
        # so a link such as '../x.fits' cannot write outside out_dir.
        local = os.path.join(out_dir, os.path.basename(fn))
        vp('start', local)
        _stream_to_file(base + fn, local, chunk_size)
        vp('done', local)


def _stream_to_file(src_url: str, dest: str, chunk_size: int) -> None:
    """Stream *src_url* into *dest*, never leaving a truncated *dest* behind."""
    # Write to a sibling temporary name and rename only after the transfer
    # finished, so an interrupted download is not mistaken for a whole file.
    partial = dest + '.part'
    try:
        with requests.get(src_url, stream=True) as r:
            r.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        os.replace(partial, dest)
    finally:
        # Still present only when something above failed.
        if os.path.exists(partial):
            os.remove(partial)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    # Example of a directory URL to pass as a command-line argument:
    # test_url = 'https://data.darts.isas.jaxa.jp/pub/hinode/xrt/level0/2022/11/11/H1100/'
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment