-
-
Save aviraldg/b277f7147983f848352bb953517ffe10 to your computer and use it in GitHub Desktop.
Web Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
__author__ = 'Angelis Pseftis' | |
from bs4 import BeautifulSoup | |
import urllib | |
import urlparse | |
import argparse | |
from termcolor import colored | |
import subprocess | |
import whois | |
from terminaltables import AsciiTable | |
# --- CLI setup ---------------------------------------------------------------
# Positional arg: the URL of the page to scrape.
# --element / -e (repeatable): restrict scraping to these tag names; when
# omitted, args.element is None and every tag is considered (see the
# `args.element or True` calls below).
parser = argparse.ArgumentParser()
parser.add_argument('url', type=str)
parser.add_argument('--element', '-e', type=str, action='append')
args = parser.parse_args()

# Parsed form of the target URL; link_filter() compares hostnames against it.
parsed_url = urlparse.urlparse(args.url)

# Raw HTML of the target page (Python 2 urllib; no error handling — a bad
# URL or network failure raises here and aborts the script).
r = urllib.urlopen(args.url).read()
def link_filter(link):
    """Return True when *link* points at an external host.

    *link* is a ``(tag_name, url)`` tuple as collected below; only the URL
    part is inspected.  A link is kept when it is absolute (has a netloc)
    and that netloc differs from the host of the page being scraped.
    """
    url = link[1]
    parsed_link = urlparse.urlparse(url)
    # Bug fix: the original compared the whole ParseResult object to a
    # string (`parsed_link != parsed_url.netloc`), which is always True,
    # so same-host absolute links were never filtered out.  Compare the
    # netloc to the netloc instead.
    return bool(parsed_link.netloc) and parsed_link.netloc != parsed_url.netloc
# Parse the fetched page and harvest every href/src attribute value,
# optionally restricted to the tag names given via --element.
soup = BeautifulSoup(r, "html.parser")

links = set()
for attr in ('href', 'src'):
    for elem in soup.find_all(args.element or True, **{attr: True}):
        links.add((elem.name, elem.attrs.get(attr)))

# Keep only links to other hosts, order them by tag name then URL, and
# prepend a heading row for the table renderer.
links = sorted(filter(link_filter, links), key=lambda e: [e[0], e[1]])
links.insert(0, ['Type', 'Link'])

table = AsciiTable(links, 'External Links')
print(table.table)
print('\n \n ')
print(colored('-----------Above this line are all external links.----------', 'green'))
print('\n')
# --- Dig section -------------------------------------------------------------
# Distinct external hostnames from the link table.  The heading row
# ('Type', 'Link') and any relative URL parse to an empty netloc, so
# filter those out before querying DNS (the original let them through,
# producing dig/whois queries for '').
hosts = set()
for link in links:
    netloc = urlparse.urlparse(link[1]).netloc
    if netloc:
        hosts.add(netloc)
print(hosts)

dig_data = [['Host', 'Answer']]
for host in hosts:
    try:
        answer = subprocess.check_output(
            ['dig', '+multiline', '+noall', '+answer', 'ANY', host])
        dig_data.append([host, answer.strip()])
    # Narrowed from a bare `except:`: CalledProcessError when the query
    # fails, OSError when the dig binary is missing.  Best-effort: skip.
    except (subprocess.CalledProcessError, OSError):
        pass

# Bug fix: the original built `AsciiTable(links, 'Dig')` — rendering the
# links table a second time — and never used dig_data at all.  Render the
# collected dig answers instead.
table = AsciiTable(dig_data, 'Dig')
print(table.table)
print('\n \n ')
print(colored('-----------Above is the information returned from Dig.----------', 'green'))
print('\n')
# --- Whois section -----------------------------------------------------------
# Query whois for every host.  When a lookup fails (e.g. the netloc is a
# subdomain with no registration of its own), retry with the parent domain
# by stripping the leftmost label, until no dot remains.
hosts = list(hosts)
whois_data = [['Host', 'Expiration Date', 'Status']]
while hosts:
    host = hosts.pop(0)
    try:
        result = whois.whois(host)

        # Domain name: the library may return a single string or a list.
        # Bug fix: the original called colored() on the raw value first,
        # which stringifies a list via %-formatting, so its isinstance
        # check always saw a string and the join branch was dead code.
        # Colorize each name, then join.
        d = result.domain_name or host
        if isinstance(d, basestring):
            d = colored(d, 'red')
        else:
            d = '\n'.join(colored(name, 'red') for name in d)

        # Expiration date(s): normalise None/scalar to a list, then render
        # strings as-is and datetimes as dd-mm-yy.
        e = result.expiration_date or []
        if not isinstance(e, list):
            e = [e]
        e = '\n'.join(
            colored(i, 'green') if isinstance(i, basestring)
            else colored(i.strftime('%d-%m-%y'), 'green')
            for i in e)

        # Status flags: keep only the first token of each entry.
        # Bug fix: `x.split()[0] or 'unknown'` raised IndexError on an
        # empty status string and could never actually yield 'unknown';
        # (x.split() or ['unknown'])[0] handles both.  The original's
        # follow-up `isinstance(s, basestring)` re-check after the join
        # was always False (dead code) and is removed.
        s = result.status or []
        if not isinstance(s, list):
            s = [s]
        s = '\n'.join(colored((x.split() or ['unknown'])[0], 'blue') for x in s)

        whois_data.append([d, e, s])
    except whois.parser.PywhoisError:
        print('skipping ' + host)
        dot = host.find('.')
        # Bug fix: the original `break` on a dot-less host aborted the
        # WHOLE loop, silently dropping every remaining host; skip just
        # this host instead.
        if dot == -1:
            continue
        parent = host[dot + 1:]
        if parent not in hosts:
            hosts.append(parent)

table = AsciiTable(whois_data, 'Whois')
print(table.table)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment