Skip to content

Instantly share code, notes, and snippets.

@andre-krueger
Created June 2, 2012 19:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andre-krueger/2859613 to your computer and use it in GitHub Desktop.
Save andre-krueger/2859613 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3.2
# -*- coding: utf-8 -*-
from sys import exit
from datetime import date
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.error import URLError
import argparse
class HTMLParser(object):
    """Command-line scraper that collects the links and/or images of a page.

    The methods are meant to be called in this order: construct the object
    (which parses the command line), then ``get_input``, ``parse_input``,
    ``print_result`` and ``process_result``.
    """

    def __init__(self, argv=None):
        """Build the argument parser and parse the command line.

        argv -- optional list of argument strings; ``None`` (the default)
                lets argparse read ``sys.argv[1:]``.
        """
        # The first positional parameter of ArgumentParser is ``prog``; the
        # text must be passed as ``description`` so it does not replace the
        # program name in the usage line.
        self.parser = argparse.ArgumentParser(description='Parses a website')
        self.parser.add_argument('url', help='the website url')
        self.parser.add_argument('-l', action='store_true',
                                 help='search for links')
        self.parser.add_argument('-i', action='store_true',
                                 help='search for images')
        # ``-w`` alone yields '' (use a default file name); ``-w PREFIX``
        # yields the prefix; omitting it yields None (do not write).
        self.parser.add_argument('-w', const='',
                                 help='writes the output to html files',
                                 nargs='?')
        self.parser.add_argument('-show', action='store_true',
                                 help='print the results')
        self.args = self.parser.parse_args(argv)

    def get_input(self):
        """Fetch ``self.args.url`` and store the response body in ``self.html_doc``.

        The body is read completely (as bytes) so the connection can be
        closed right away. On a malformed url or a failed request the
        program exits with a message on stderr and a non-zero status.
        """
        try:
            with urlopen(self.args.url) as response:
                self.html_doc = response.read()
        except ValueError:
            # urlopen raises ValueError for strings it cannot interpret as a url.
            exit('unknown url type')
        except URLError as err:
            exit('could not open url: {0}'.format(err.reason))

    def parse_input(self):
        """Parse ``self.html_doc`` and fill the requested result containers.

        ``self.links_container`` receives the ``href`` value of every ``<a>``
        tag when ``-l`` was given (``None`` for anchors without ``href``);
        ``self.images_container`` receives every ``<img>`` tag when ``-i``
        was given. Both options may be combined.
        """
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        self.soup = BeautifulSoup(self.html_doc, 'html.parser')
        self.links_container = []
        self.images_container = []
        if self.args.l:
            self.links_container = [anchor.get('href')
                                    for anchor in self.soup.find_all('a')]
        if self.args.i:
            self.images_container = list(self.soup.find_all('img'))

    def print_result(self):
        """Print the collected containers to stdout when ``-show`` was given."""
        if not self.args.show:
            return
        if self.args.l:
            print(self.links_container)
        if self.args.i:
            print(self.images_container)

    def _output_filename(self, kind, disambiguate):
        """Return the output file name for ``kind`` ('images' or 'links').

        Without a user prefix the kind itself is the stem. With a prefix the
        kind is appended only when ``disambiguate`` is true, i.e. when two
        files are written in the same run and would otherwise collide.
        """
        prefix = self.args.w
        if not prefix:
            stem = kind
        elif disambiguate:
            stem = '{0}{1}'.format(prefix, kind)
        else:
            stem = prefix
        return '{0}{1}.html'.format(stem, self.timestamp)

    def process_result(self):
        """Write each requested container to its own dated file when ``-w`` was given.

        Sets ``self.timestamp`` to today's date and ``self.filename`` to the
        name of the last file written ('' when nothing was written). The
        file content is the ``str()`` of the container, as before.
        """
        self.timestamp = str(date.today())
        self.filename = ''
        if self.args.w is None:
            return
        outputs = []
        if self.args.i:
            outputs.append(('images', self.images_container))
        if self.args.l:
            outputs.append(('links', self.links_container))
        several = len(outputs) > 1
        for kind, container in outputs:
            self.filename = self._output_filename(kind, several)
            with open(self.filename, 'w', encoding='utf-8') as f:
                f.write(str(container))
def main():
    """Run the scraper pipeline: fetch, parse, print, then write results."""
    scraper = HTMLParser()
    # The stages depend on state set by the previous one, so order matters.
    pipeline = (
        scraper.get_input,
        scraper.parse_input,
        scraper.print_result,
        scraper.process_result,
    )
    for stage in pipeline:
        stage()
# Run the command-line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment