Created
June 2, 2012 19:08
-
-
Save andre-krueger/2859613 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3.2 | |
# -*- coding: utf-8 -*- | |
from sys import exit | |
from datetime import date | |
from bs4 import BeautifulSoup | |
from urllib.request import urlopen | |
from urllib.error import URLError | |
import argparse | |
class HTMLParser(object):
    """Command-line helper that lists the links and/or images of a web page.

    The intended call order is ``get_input()``, ``parse_input()``,
    ``print_result()`` and ``process_result()``.
    """

    def __init__(self, argv=None):
        """Build the argument parser and parse the command line.

        argv -- optional list of argument strings; when None (the default)
                argparse falls back to ``sys.argv[1:]``.
        """
        # The first positional parameter of ArgumentParser is ``prog``; the
        # text must be passed as ``description`` or it replaces the program
        # name in the usage line.
        self.parser = argparse.ArgumentParser(description='Parses a website')
        self.parser.add_argument('url', help='the website url')
        self.parser.add_argument('-l', action='store_true',
                                 help='search for links')
        self.parser.add_argument('-i', action='store_true',
                                 help='search for images')
        # nargs='?' with const='': a bare "-w" yields '' (default file
        # name), "-w NAME" yields NAME, and omitting -w yields None.
        self.parser.add_argument('-w', const='',
                                 help='writes the output to html files',
                                 nargs='?')
        self.parser.add_argument('-show', action='store_true',
                                 help='print the results')
        self.args = self.parser.parse_args(argv)

    def get_input(self):
        """Download the page at ``self.args.url`` into ``self.html_doc``.

        The response is read completely and closed, so ``self.html_doc``
        holds the raw page bytes. Prints a message and exits with status 1
        when the URL is malformed or cannot be opened.
        """
        try:
            with urlopen(self.args.url) as response:
                self.html_doc = response.read()
        except ValueError:
            # urlopen raises ValueError for URLs without a usable scheme.
            print('unknown url type')
            exit(1)
        except URLError as error:
            print('could not open url: {0}'.format(error))
            exit(1)

    def parse_input(self):
        """Parse ``self.html_doc`` and collect the requested elements.

        Fills ``self.links_container`` with the ``href`` value of every
        ``<a>`` tag when ``-l`` is set and ``self.images_container`` with
        every ``<img>`` tag when ``-i`` is set. Both flags may be combined.
        """
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        self.soup = BeautifulSoup(self.html_doc, 'html.parser')
        self.links_container = []
        self.images_container = []
        if self.args.l:
            self.links_container = [anchor.get('href')
                                    for anchor in self.soup.find_all('a')]
        # Independent "if" (not "elif"): -l and -i are not mutually
        # exclusive, and the other methods handle both at once.
        if self.args.i:
            self.images_container = list(self.soup.find_all('img'))

    def print_result(self):
        """Print the collected links and/or images when ``-show`` is set."""
        if not self.args.show:
            return
        if self.args.l:
            print(self.links_container)
        if self.args.i:
            print(self.images_container)

    def process_result(self):
        """Write the collected results to HTML files when ``-w`` was given.

        File names:
          * bare ``-w``: ``images<date>.html`` / ``links<date>.html``
          * ``-w NAME`` with one of -i/-l: ``NAME<date>.html``
          * ``-w NAME`` with both -i and -l: ``NAMEimages<date>.html`` and
            ``NAMElinks<date>.html`` so neither output overwrites the other

        ``self.filename`` is set to the name computed for the last requested
        kind (also when -w is absent and nothing is written); it stays ''
        when neither -i nor -l was given.
        """
        self.timestamp = str(date.today())
        self.filename = ''

        requested = []
        if self.args.i:
            requested.append(('images', self.images_container))
        if self.args.l:
            requested.append(('links', self.links_container))

        prefix = self.args.w or ''
        for kind, items in requested:
            if not prefix:
                name = '{0}{1}.html'.format(kind, self.timestamp)
            elif len(requested) > 1:
                name = '{0}{1}{2}.html'.format(prefix, kind, self.timestamp)
            else:
                name = '{0}{1}.html'.format(prefix, self.timestamp)
            self.filename = name
            # None means -w was not passed at all; '' (bare -w) still writes.
            if self.args.w is not None:
                # Explicit encoding: page content is arbitrary Unicode and
                # the locale default may not be able to represent it.
                with open(name, 'w', encoding='utf-8') as output:
                    output.write(str(items))
def main():
    """Run the tool end to end: fetch the page, parse it, print, then save."""
    parser = HTMLParser()
    pipeline = (
        parser.get_input,
        parser.parse_input,
        parser.print_result,
        parser.process_result,
    )
    # Each stage relies on state left on the instance by the previous one,
    # so the order above matters.
    for step in pipeline:
        step()


# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment