andre-krueger/htmlparser.py

## htmlparser.py
#!/usr/bin/env python3.2
# -*- coding: utf-8 -*-

from sys import exit
from datetime import date
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.error import URLError
import argparse

class HTMLParser(object):
    """Command-line scraper that collects the links and/or images of a page.

    The intended call order is ``get_input()``, ``parse_input()``,
    ``print_result()`` and ``process_result()``; every step stores its
    result on the instance for the following one.
    """

    def __init__(self, argv=None):
        """Build the argument parser and parse the command line.

        argv -- optional list of argument strings.  When it is None (the
                default) argparse falls back to ``sys.argv[1:]``.
        """
        # The first positional parameter of ArgumentParser is ``prog``; the
        # text must be passed as ``description`` to appear as help text
        # instead of replacing the program name in the usage line.
        self.parser = argparse.ArgumentParser(description='Parses a website')
        self.parser.add_argument('url', help='the website url')
        self.parser.add_argument('-l', action='store_true',
                                 help='search for links')
        self.parser.add_argument('-i', action='store_true',
                                 help='search for images')
        # ``-w`` is None when absent, '' when given without a value and the
        # file name prefix when given with one.
        self.parser.add_argument('-w', const='',
                                 help='writes the output to html files',
                                 nargs='?')
        self.parser.add_argument('-show', action='store_true',
                                 help='print the results to stdout')

        self.args = self.parser.parse_args(argv)

    def get_input(self):
        """Download ``args.url`` and store the raw document in ``html_doc``.

        Prints a message and exits with status 1 when the URL is malformed
        or cannot be opened.
        """
        try:
            response = urlopen(self.args.url)
        except ValueError:
            print('unknown url type')
            exit(1)
        except URLError as error:
            # Network and HTTP failures are not "unknown url type" errors.
            print('could not open url: {0}'.format(error))
            exit(1)

        # Read everything up front so the connection is always released.
        try:
            self.html_doc = response.read()
        finally:
            response.close()

    def parse_input(self):
        """Parse ``html_doc`` and fill the link and image containers.

        ``links_container`` receives the ``href`` value of every ``<a>`` tag
        when ``-l`` is set; ``images_container`` receives every ``<img>`` tag
        when ``-i`` is set.  Both flags may be combined.
        """
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        self.soup = BeautifulSoup(self.html_doc, 'html.parser')
        self.links_container = []
        self.images_container = []
        if self.args.l:
            self.links_container = [link.get('href')
                                    for link in self.soup.find_all('a')]
        # Independent ``if`` (not ``elif``): ``-l -i`` must collect both.
        if self.args.i:
            self.images_container = list(self.soup.find_all('img'))

    def print_result(self):
        """Print the collected containers to stdout when ``-show`` is set."""
        if not self.args.show:
            return
        if self.args.l:
            print(self.links_container)
        if self.args.i:
            print(self.images_container)

    def process_result(self):
        """Write the collected containers to dated ``.html`` files.

        Nothing is written unless ``-w`` was given.  File names are:

        * ``<prefix><date>.html`` for a single result with a prefix,
        * ``images<date>.html`` / ``links<date>.html`` without a prefix,
        * ``<prefix>images<date>.html`` and ``<prefix>links<date>.html``
          when both ``-i`` and ``-l`` are set, so that the second file no
          longer overwrites the first one.

        ``filename`` holds the last file written, or '' if none was.
        """
        self.timestamp = str(date.today())
        self.filename = ''

        if self.args.w is None:
            return

        outputs = []
        if self.args.i:
            outputs.append(('images', self.images_container))
        if self.args.l:
            outputs.append(('links', self.links_container))

        # The kind is part of the name whenever it is needed to tell the
        # files apart: no user prefix, or more than one file to write.
        name_by_kind = not self.args.w or len(outputs) > 1
        for kind, items in outputs:
            self.filename = '{0}{1}{2}.html'.format(
                self.args.w, kind if name_by_kind else '', self.timestamp)
            with open(self.filename, 'w', encoding='utf-8') as f:
                f.write(str(items))


def main():
    """Run the scraper: fetch the page, parse it, print and save results."""
    scraper = HTMLParser()
    pipeline = (
        scraper.get_input,
        scraper.parse_input,
        scraper.print_result,
        scraper.process_result,
    )
    # The steps depend on each other's stored state, so order matters.
    for step in pipeline:
        step()

# Run the scraper only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python3.2
	# -*- coding: utf-8 -*-

	from sys import exit
	from datetime import date
	from bs4 import BeautifulSoup
	from urllib.request import urlopen
	from urllib.error import URLError
	import argparse

	class HTMLParser(object):
	    """Command-line scraper that collects the links and/or images of a page.

	    The intended call order is ``get_input()``, ``parse_input()``,
	    ``print_result()`` and ``process_result()``; every step stores its
	    result on the instance for the following one.
	    """

	    def __init__(self, argv=None):
	        """Build the argument parser and parse the command line.

	        argv -- optional list of argument strings.  When it is None (the
	                default) argparse falls back to ``sys.argv[1:]``.
	        """
	        # The first positional parameter of ArgumentParser is ``prog``; the
	        # text must be passed as ``description`` to appear as help text
	        # instead of replacing the program name in the usage line.
	        self.parser = argparse.ArgumentParser(description='Parses a website')
	        self.parser.add_argument('url', help='the website url')
	        self.parser.add_argument('-l', action='store_true',
	                                 help='search for links')
	        self.parser.add_argument('-i', action='store_true',
	                                 help='search for images')
	        # ``-w`` is None when absent, '' when given without a value and the
	        # file name prefix when given with one.
	        self.parser.add_argument('-w', const='',
	                                 help='writes the output to html files',
	                                 nargs='?')
	        self.parser.add_argument('-show', action='store_true',
	                                 help='print the results to stdout')

	        self.args = self.parser.parse_args(argv)

	    def get_input(self):
	        """Download ``args.url`` and store the raw document in ``html_doc``.

	        Prints a message and exits with status 1 when the URL is malformed
	        or cannot be opened.
	        """
	        try:
	            response = urlopen(self.args.url)
	        except ValueError:
	            print('unknown url type')
	            exit(1)
	        except URLError as error:
	            # Network and HTTP failures are not "unknown url type" errors.
	            print('could not open url: {0}'.format(error))
	            exit(1)

	        # Read everything up front so the connection is always released.
	        try:
	            self.html_doc = response.read()
	        finally:
	            response.close()

	    def parse_input(self):
	        """Parse ``html_doc`` and fill the link and image containers.

	        ``links_container`` receives the ``href`` value of every ``<a>`` tag
	        when ``-l`` is set; ``images_container`` receives every ``<img>`` tag
	        when ``-i`` is set.  Both flags may be combined.
	        """
	        # Name the parser explicitly so the result does not depend on which
	        # optional parser libraries happen to be installed.
	        self.soup = BeautifulSoup(self.html_doc, 'html.parser')
	        self.links_container = []
	        self.images_container = []
	        if self.args.l:
	            self.links_container = [link.get('href')
	                                    for link in self.soup.find_all('a')]
	        # Independent ``if`` (not ``elif``): ``-l -i`` must collect both.
	        if self.args.i:
	            self.images_container = list(self.soup.find_all('img'))

	    def print_result(self):
	        """Print the collected containers to stdout when ``-show`` is set."""
	        if not self.args.show:
	            return
	        if self.args.l:
	            print(self.links_container)
	        if self.args.i:
	            print(self.images_container)

	    def process_result(self):
	        """Write the collected containers to dated ``.html`` files.

	        Nothing is written unless ``-w`` was given.  File names are:

	        * ``<prefix><date>.html`` for a single result with a prefix,
	        * ``images<date>.html`` / ``links<date>.html`` without a prefix,
	        * ``<prefix>images<date>.html`` and ``<prefix>links<date>.html``
	          when both ``-i`` and ``-l`` are set, so that the second file no
	          longer overwrites the first one.

	        ``filename`` holds the last file written, or '' if none was.
	        """
	        self.timestamp = str(date.today())
	        self.filename = ''

	        if self.args.w is None:
	            return

	        outputs = []
	        if self.args.i:
	            outputs.append(('images', self.images_container))
	        if self.args.l:
	            outputs.append(('links', self.links_container))

	        # The kind is part of the name whenever it is needed to tell the
	        # files apart: no user prefix, or more than one file to write.
	        name_by_kind = not self.args.w or len(outputs) > 1
	        for kind, items in outputs:
	            self.filename = '{0}{1}{2}.html'.format(
	                self.args.w, kind if name_by_kind else '', self.timestamp)
	            with open(self.filename, 'w', encoding='utf-8') as f:
	                f.write(str(items))


	def main():
	    """Run the scraper: fetch the page, parse it, print and save results."""
	    scraper = HTMLParser()
	    pipeline = (
	        scraper.get_input,
	        scraper.parse_input,
	        scraper.print_result,
	        scraper.process_result,
	    )
	    # The steps depend on each other's stored state, so order matters.
	    for step in pipeline:
	        step()

	# Run the scraper only when this file is executed as a script, so that
	# importing the module has no side effects.
	if __name__ == '__main__':
	    main()