Skip to content

Instantly share code, notes, and snippets.

@nirinium
Last active January 12, 2019 16:14
Show Gist options
  • Save nirinium/11ec5bcc299bc0f41b080bb4e3166a67 to your computer and use it in GitHub Desktop.
Save nirinium/11ec5bcc299bc0f41b080bb4e3166a67 to your computer and use it in GitHub Desktop.
import os, sys, lxml, re, ssl, time
from bs4 import BeautifulSoup, SoupStrainer
from urllib.request import urlopen, urlretrieve, urljoin, URLError, HTTPError
def progressDL(count, blockSize, totalSize):
    """Print in-place download progress; usable as a ``urlretrieve`` reporthook.

    Args:
        count: Number of blocks transferred so far.
        blockSize: Size of one block, in bytes.
        totalSize: Total size of the download in bytes. A value of zero or
            less means the size is unknown (``urlretrieve`` passes -1 when
            the server does not report one).
    """
    transferred = count * blockSize
    if totalSize > 0:
        # The last block usually overshoots the real size, so cap at 100%.
        percent = min(100, transferred * 100 // totalSize)
        sys.stdout.write("\r" + "...%d%% " % percent)
    else:
        # No usable total: a percentage is meaningless (and would divide by
        # zero for totalSize == 0), so report the raw byte count instead.
        sys.stdout.write("\r" + "...%d bytes " % transferred)
    sys.stdout.flush()  # progress line has no newline, so force it out
# SECURITY NOTE(review): this swaps the ssl module's default HTTPS context
# factory for the unverified one, so HTTPS requests that rely on that default
# skip certificate verification for the whole process. Presumably done because
# the target site's certificate chain is not trusted by default -- confirm, and
# prefer supplying the proper CA bundle if possible.
# Interpreters whose ssl module lacks the unverified factory are left as-is.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context
# Host that relative download links are resolved against.
base_url = 'http://iasecontent.disa.mil/'
# Index page that is scraped for download links.
URL = 'https://iase.disa.mil/stigs/Pages/a-z.aspx'
# Directory the downloaded files are written into.
OUTPUT_DIR = 'stigs'

# Restrict the first parse to <table> elements only.
only_table_links = SoupStrainer("table")
# BeautifulSoup reads the whole response while constructing the tree, so the
# connection can be closed as soon as parsing finishes (it was leaked before).
with urlopen(URL) as html_page:
    soup = BeautifulSoup(html_page, 'lxml', parse_only=only_table_links)
# Filled later with the hrefs found inside the tables.
linksList = []
def url_p_join(href=None, base=None):
    """Return ``href`` as an absolute URL, resolving it against ``base`` if needed.

    The previous version assigned to ``href`` inside the function, which made
    the name local and caused every call to fail with ``UnboundLocalError``;
    it also never returned the joined URL.

    Args:
        href: Link to resolve. When omitted, the module-level ``href`` is used
            so the historical zero-argument call form keeps working.
        base: URL to resolve relative links against. Defaults to the
            module-level ``base_url``.

    Returns:
        ``href`` unchanged when it already carries an http(s) scheme,
        otherwise ``urljoin(base, href)``.
    """
    if href is None:
        # The parameter shadows the module-level name, so look it up explicitly.
        href = globals()['href']
    if base is None:
        base = base_url
    # Test for a real scheme: a bare 'h' prefix also matches relative paths
    # such as 'help/U_x.zip'.
    if href.lower().startswith(('http://', 'https://')):
        return href
    print('join text')
    return urljoin(base, href)
# Links of interest are those whose href contains "u_" or "U_".
# Compiled once and reused for both passes over the page.
stig_link_pattern = re.compile("(u_|U_)")

# Pass 1: remember every matching href found inside the page's tables.
for link in soup.find_all('a', attrs={'href': stig_link_pattern}):
    linksList.append(link.get('href'))
print('Total STIGs:', len(linksList))

# Pass 2: fetch the page again and parse it in full (not just the tables).
with urlopen(URL) as u:  # PAGE 1 of STIG site
    html = u.read().decode('utf-8')
soup = BeautifulSoup(html, "lxml")

# urlretrieve does not create missing directories; without this every
# download failed with an OSError that was silently swallowed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# str.endswith accepts a tuple, so build it once instead of looping over
# linksList for every link.
known_suffixes = tuple(linksList)
# hrefs that urlretrieve rejected as not being a valid URL.
relativeurls = []

for link in soup.find_all('a', attrs={'href': stig_link_pattern}):
    href = link.get('href')
    if not href.endswith(known_suffixes):  # pulls links from [LIST] linksList
        continue

    # Resolve relative links against the content host; urljoin returns
    # absolute URLs as they are. (The helper used to be referenced here
    # without being called, so relative links were never resolved.)
    href = urljoin(base_url, href)
    filename = os.path.join(OUTPUT_DIR, href.rsplit('/', 1)[-1])
    print("> %s to \\%s..." % (href, OUTPUT_DIR))
    try:
        urlretrieve(href, filename, reporthook=progressDL)
    # HTTPError and URLError are subclasses of OSError, so the specific
    # handlers must come first or they are unreachable.
    except HTTPError as error:
        print(error.filename, 'Error!')
    except URLError as error:
        print(error)
    except ValueError as error:
        relativeurls.append(href)
        print(error)
    except OSError as error:
        # Local file-system problem (e.g. unwritable target); report it
        # instead of printing an empty line, then move on to the next file.
        print(error)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment