Created
January 20, 2020 23:22
-
-
Save spenkk/6563b84be4c0268d65f7d2328910a036 to your computer and use it in GitHub Desktop.
DoD Scrape
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import os
import re
import string

import colorama
import requests
from bs4 import BeautifulSoup
from colorama import Fore, Style
def withDescription(items=None):
    """Print each scraped link with its label and append both to ``all.txt``.

    Args:
        items: Iterable of anchor tags that support ``tag['href']`` and
            ``tag.contents``. When omitted, the module-level
            ``website_table_items`` is used, which is how the script's
            scraping loop calls this function.

    Every link is echoed to stdout (label in green) and appended to
    ``all.txt`` as ``<href>, "<label>"`` on its own line. The label is the
    stripped last child node of the anchor.
    """
    if items is None:
        # Backward-compatible default: read the global set by the caller.
        items = website_table_items

    # Open the output once for the whole batch instead of once per link.
    with open('all.txt', 'a') as out:
        for website_name in items:
            web = website_name.contents[-1].strip()
            href = website_name['href']
            print(href, Fore.GREEN + '"' + web + '"' + Style.RESET_ALL)
            out.write(href + ', "' + web + '"' + '\n')
def onlyDomains():
    """Write the unique host names found in ``all.txt`` to ``domains.txt``.

    Each line of ``all.txt`` is read as ``<url>, ...``: the text before the
    first comma is taken as the URL, and the host is whatever sits between
    the scheme separator (``//``) and the next ``/``.

    Duplicates are dropped and the order of first appearance is kept; no
    sort is applied, even though the status message mentions sorting.
    Entries that yield an empty host (for example relative links) are
    skipped. ``domains.txt`` is overwritten on every call.

    Raises:
        OSError: if ``all.txt`` cannot be read or ``domains.txt`` cannot be
            written.
    """
    seen = set()
    unique_domains = []

    with open('all.txt') as lines:
        for entry in lines:
            url = entry.split(',')[0]
            # strip() guards against a trailing newline when a line has
            # neither a comma nor a path component.
            domain = url.split('//')[-1].split('/')[0].strip()
            if domain and domain not in seen:
                seen.add(domain)
                unique_domains.append(domain)

    print(Fore.RED + '[*] Sorting and removing duplicates.' + Style.RESET_ALL)

    # De-duplication happens in memory, so no temporary file is needed and
    # every handle is closed by its context manager.
    with open('domains.txt', 'w') as outfile:
        outfile.writelines(domain + '\n' for domain in unique_domains)
# Page keys of the A-Z listing: one "0-9" bucket followed by one per letter.
alphabet = ['0-9'] + list(string.ascii_uppercase)

# Run the scrape only when executed as a script. Names assigned here are
# still module-level, which withDescription() relies on.
if __name__ == '__main__':
    for char in alphabet:
        # A timeout keeps one unresponsive page from hanging the whole run.
        page = requests.get(
            'https://www.defense.gov/Resources/Military-Departments/A-Z-List/?page={}'.format(char),
            timeout=30,
        )
        soup = BeautifulSoup(page.text, 'html.parser')
        website_table = soup.find(class_='DGOVWebsitesLinks')
        if website_table is None:
            # find() returns None when the element is absent; skip the page
            # instead of crashing on the attribute access below.
            print(Fore.RED + '[!] No link table found for page ' + char + ', skipping.' + Style.RESET_ALL)
            continue
        # withDescription() reads this module-level name.
        website_table_items = website_table.find_all('a', href=True)
        withDescription()

    print(Fore.RED + '[*] Saving only domains to domains.txt' + Style.RESET_ALL)
    onlyDomains()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment