Skip to content

Instantly share code, notes, and snippets.

@snzisad
Last active October 4, 2020 18:39
Show Gist options
  • Save snzisad/adb8808b70850ab1872d88745994ef0d to your computer and use it in GitHub Desktop.
Save snzisad/adb8808b70850ab1872d88745994ef0d to your computer and use it in GitHub Desktop.
This snippet is used to retrieve data from https://heasarc.gsfc.nasa.gov/FTP/
!pip install validators
from bs4 import BeautifulSoup
import requests
import validators
def get_data(url, *, timeout=30):
    """Collect every hyperlink found on the page at *url*.

    The page is downloaded, parsed as HTML, and each ``<a>`` tag carrying an
    ``href`` attribute is turned into a dict with two keys:

    * ``"label"`` -- the literal string ``"url"`` when the href is already a
      valid absolute URL, otherwise the anchor's visible text (for example a
      sub-directory or file name in a directory listing).
    * ``"url"`` -- the absolute URL of the link; relative hrefs are resolved
      against *url*.

    The collected list is printed (as before) and also returned so callers
    can use it programmatically.

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{"label": ..., "url": ...}`` dicts, in document order.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Imported locally so this function stays self-contained and does not
    # rely on a new file-level import.
    from urllib.parse import urljoin

    # Without a timeout requests may block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping links off an error page.
    response.raise_for_status()

    # Name the parser explicitly: otherwise BeautifulSoup guesses one, warns,
    # and may behave differently depending on which parsers are installed.
    soup = BeautifulSoup(response.text, "html.parser")

    url_data = []
    # href=True restricts the search to anchors that have an href attribute.
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if validators.url(href):
            # Already an absolute, well-formed URL: keep it untouched.
            data = {"label": "url", "url": href}
        else:
            # Relative link (sub-directory, file, "../", "/root/path",
            # "?query"): urljoin resolves all of these correctly, whereas
            # plain string concatenation only works for "name/" style hrefs
            # appended to a base that ends with a slash.
            data = {"label": a.text, "url": urljoin(url, href)}
        url_data.append(data)

    print(url_data)
    return url_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment