@spenkk
Created January 20, 2020 23:22
DoD Scrape: walks the defense.gov A-Z list of military department websites, saving each URL with its description to all.txt and the de-duplicated domains to domains.txt.
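Requires the third-party packages requests, beautifulsoup4, and colorama (pip install requests beautifulsoup4 colorama).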
#!/usr/bin/env python
import os
import string

import colorama
import requests
from bs4 import BeautifulSoup
from colorama import Fore, Style

colorama.init()  # enable ANSI color output on Windows; a no-op elsewhere
def withDescription(items):
    """Print each link with its description and append 'url, "description"' lines to all.txt."""
    for website_name in items:
        web = website_name.contents[-1].strip()
        print(website_name['href'], Fore.GREEN + '"' + web + '"' + Style.RESET_ALL)
        with open('all.txt', 'a') as out:
            out.write(website_name['href'] + ', "' + web + '"\n')
def onlyDomains():
    """Reduce each line of all.txt to its bare domain and write the unique set to domains.txt."""
    with open('all.txt') as lines:
        for i in lines:
            url = i.split(',')[0]
            # Strip the scheme and any path: 'https://host/path' -> 'host'.
            domain = url.split('//')[-1].split('/')[0]
            with open('domains-temp.txt', 'a') as temp:
                temp.write(domain + '\n')
    print(Fore.RED + '[*] Removing duplicate domains.' + Style.RESET_ALL)
    lines_seen = set()
    with open('domains.txt', 'w') as outfile:
        for line in open('domains-temp.txt'):
            # Keep only the first occurrence of each domain.
            if line not in lines_seen:
                outfile.write(line)
                lines_seen.add(line)
    os.remove('domains-temp.txt')
# The A-Z list is paginated by first letter, plus a '0-9' page for names that start with a digit.
alphabet = ['0-9'] + list(string.ascii_uppercase)
for char in alphabet:
    page = requests.get('https://www.defense.gov/Resources/Military-Departments/A-Z-List/?page={}'.format(char))
    soup = BeautifulSoup(page.text, 'html.parser')
    # Each page lists its sites inside an element with the DGOVWebsitesLinks class.
    website_table = soup.find(class_='DGOVWebsitesLinks')
    website_table_items = website_table.find_all('a', href=True)
    withDescription(website_table_items)

print(Fore.RED + '[*] Saving only domains to domains.txt' + Style.RESET_ALL)
onlyDomains()
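The split-based domain extraction in onlyDomains() works for the simple http(s) URLs this page serves, but as a minimal sketch, the standard library's urllib.parse.urlsplit expresses the same step more robustly (extract_domain is a hypothetical helper, not part of the gist):

# Minimal sketch of the same domain extraction using the standard library.
from urllib.parse import urlsplit

def extract_domain(url):
    # netloc is the host (and port, if present) component of the URL;
    # urlsplit also copes with query strings and missing paths.
    return urlsplit(url.strip()).netloc

print(extract_domain('https://www.defense.gov/Resources/'))  # -> www.defense.gov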