Web Scraper
#!/usr/bin/python
__author__ = 'Angelis Pseftis'
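# Scrapes a single page, tabulates its external links, then runs dig and
# whois against each external host. Written for Python 2 (urlparse,
# urllib.urlopen, basestring).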
from bs4 import BeautifulSoup
import urllib
import urlparse
import argparse
from termcolor import colored
import subprocess
import whois
from terminaltables import AsciiTable
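# CLI: a positional target URL, plus a repeatable --element/-e flag to
# restrict scraping to specific tag names, e.g.
#   ./crawler.py http://example.com -e a -e img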
parser = argparse.ArgumentParser()
parser.add_argument('url', type=str)
parser.add_argument('--element', '-e', type=str, action='append')
args = parser.parse_args()
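# Remember the target's host (to filter out same-site links) and fetch the page.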
parsed_url = urlparse.urlparse(args.url)
r = urllib.urlopen(args.url).read()
def link_filter(link):
    # Keep only links whose host differs from the target page's host.
    link = link[1]
    parsed_link = urlparse.urlparse(link)
    return parsed_link.netloc and parsed_link.netloc != parsed_url.netloc
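# Gather (tag name, URL) pairs from every href and src attribute;
# find_all(True) matches all tags when no --element filter is given.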
soup = BeautifulSoup(r, "html.parser")
links = set()
links.update([(elem.name, elem.attrs.get('href')) for elem in soup.find_all(args.element or True, href=True)])
links.update([(elem.name, elem.attrs.get('src')) for elem in soup.find_all(args.element or True, src=True)])
links = sorted(filter(link_filter, links), key=lambda e: [e[0], e[1]])
links.insert(0, ['Type', 'Link'])
# print(colored('\n'.join([' '.join(link) for link in links]), 'blue'))
table = AsciiTable(links, 'External Links')
print(table.table)
# print('\n'.join([' '.join(link) for link in links]))
print('\n \n ')
print(colored('-----------Above this line are all external links.----------', 'green'))
print('\n')
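# Resolve DNS records for each external host via dig.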
hosts = set(urlparse.urlparse(link[1]).netloc for link in links[1:])  # links[0] is the header row
# print(hosts)
dig_data = [['Host', 'Answer']]
for host in hosts:
    try:
        answer = subprocess.check_output(
            ['dig', '+multiline', '+noall', '+answer', 'ANY', host])
        dig_data.append([host, answer.strip()])
    except (subprocess.CalledProcessError, OSError):
        pass
table = AsciiTable(dig_data, 'Dig')
print(table.table)
print('\n \n ')
print(colored('-----------Above is the information returned from Dig.----------', 'green'))
print('\n')
hosts = list(hosts)
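# whois each host; if the lookup fails, retry with the parent domain by
# stripping the leftmost label (e.g. sub.example.com -> example.com).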
whois_data = [['Host', 'Expiration Date', 'Status']]
while hosts:
    host, hosts = hosts[0], hosts[1:]
    try:
        result = whois.whois(host)
        # domain_name may be a single string or a list of strings.
        d = result.domain_name or host
        if isinstance(d, basestring):
            d = colored(d, 'red')
        else:
            d = '\n'.join(colored(x, 'red') for x in d)
        # expiration_date may be missing, a single value, or a list of
        # strings/datetimes; normalise to a list before formatting.
        e = result.expiration_date
        if not e:
            e = []
        if not isinstance(e, list):
            e = [e]
        e = '\n'.join(colored(i, 'green') if isinstance(i, basestring)
                      else colored(i.strftime('%d-%m-%y'), 'green')
                      for i in e)
        s = result.status
        if not s:
            s = []
        if not isinstance(s, list):
            s = [s]
        s = '\n'.join(colored((x.split() or ['unknown'])[0], 'blue') for x in s)
        whois_data.append([d, e, s])
    except whois.parser.PywhoisError as e:
        # print(e)
        print('skipping ' + host)
        i = host.find('.')
        if i == -1:
            continue  # no parent domain to fall back to
        host = host[i + 1:]
        if host not in hosts:
            hosts.append(host)
table = AsciiTable(whois_data, 'Whois')
# table.column_max_width(120)
print(table.table)