# ruddra/python_data_parsing.py

## python_data_parsing.py
import sys
from urllib.parse import urlparse
import urllib.request
import re
import json
import datetime


class ContentParser(object):
    """Download a web page and extract e-mails, phone numbers and URLs.

    Typical use is ``ContentParser(url).do_all()``: the page is fetched,
    every extractor is run and a timestamped text report is written to the
    current working directory.  Each extractor stores its findings in
    ``self.data`` as one comma-separated string under the keys ``'emails'``,
    ``'phone_numbers'``, ``'urls'`` and ``'hyperlinks'``.
    """

    # Absolute URL as it appears inside HTML: stop at whitespace, quotes and
    # angle brackets so trailing markup such as '">text</a>' is not captured.
    _URL_IN_HTML_RE = re.compile(r'https?://[^\s"\'<>]+')
    # Value of an href attribute, with or without surrounding quotes.
    _HREF_RE = re.compile(r'href=[\'"]?([^\'" >]+)')
    _EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+')
    # NNN-NNN-NNNN anywhere in the text.  No capture groups, so findall()
    # yields plain strings; the lookarounds reject longer digit runs.
    _PHONE_RE = re.compile(r'(?<!\d)\d{3}-\d{3}-\d{4}(?!\d)')

    _url = None
    _html = None

    def __init__(self, url):
        """Remember *url*; nothing is downloaded until ``get_html`` runs."""
        self._url = url
        # Created per instance: a class-level dict would be shared by every
        # parser, letting one URL's results leak into another's report.
        self.data = {}

    def validate_url(self):
        """Return a match object if the URL starts with http(s)://, else None."""
        if not isinstance(self._url, str):
            return None
        return re.match(r'(https?://[^\s]+)', self._url)

    def get_html(self):
        """Fetch the page and store its decoded text in ``self._html``.

        The charset announced in the Content-Type header is used when there
        is one, UTF-8 otherwise.  Undecodable bytes are replaced rather than
        aborting the whole run.

        Raises:
            urllib.error.URLError: if the page cannot be retrieved.
        """
        with urllib.request.urlopen(self._url) as response:
            raw = response.read()
            charset = response.headers.get_content_charset() or 'utf-8'
        try:
            self._html = raw.decode(charset, errors='replace')
        except LookupError:
            # The server announced a codec Python does not know.
            self._html = raw.decode('utf-8', errors='replace')

    def parse_urls_from_html(self):
        """Store absolute URLs and href targets found in the page."""
        html = self._html or ''
        self.data['urls'] = ', '.join(self._URL_IN_HTML_RE.findall(html))
        self.data['hyperlinks'] = ', '.join(self._HREF_RE.findall(html))

    def parse_emails(self):
        """Store every e-mail-like token found in the page."""
        html = self._html or ''
        self.data['emails'] = ', '.join(self._EMAIL_RE.findall(html))

    def parse_phone_numbers(self):
        """Store every NNN-NNN-NNNN phone number found in the page."""
        html = self._html or ''
        self.data['phone_numbers'] = ', '.join(self._PHONE_RE.findall(html))

    def do_all(self):
        """Validate, fetch, parse and write the report.

        Failures are printed instead of raised so that one bad URL does not
        stop a multi-URL run.

        Returns:
            True if the report was written, False otherwise.
        """
        try:
            if not self.validate_url():
                print('Invalid URL')
                return False
            self.get_html()
            self.parse_emails()
            self.parse_phone_numbers()
            self.parse_urls_from_html()
            self.write_output()
            return True
        except Exception as e:
            print(e)
            return False

    def write_output(self):
        """Write the collected data to ``<dd-mm-YYYY:HH:MM:SS>.txt``.

        The file is created in the current working directory.
        """
        # NOTE(review): ':' is not allowed in Windows file names, and two
        # reports written within the same second share a name.  The format
        # is kept as-is so existing report names do not change.
        stamp = datetime.datetime.now().strftime('%d-%m-%Y:%H:%M:%S')
        parts = ['Base Url is: {0} \r\n\r\n'.format(self._url)]
        parts.extend(
            '{0} : {1} \r\n'.format(key, value)
            for key, value in self.data.items()
        )
        # newline='' keeps the explicit '\r\n' from becoming '\r\r\n' on
        # Windows; utf-8 avoids encode errors under non-UTF-8 locales.
        with open('{0}.txt'.format(stamp), 'w',
                  encoding='utf-8', newline='') as outfile:
            outfile.write(''.join(parts))

    def show_data(self):
        """Return the collected data as a JSON string."""
        return json.dumps(self.data)

# Command-line driver: build one report for each URL passed as an argument.
for target_url in sys.argv[1:]:
    print('Base URL is %s' % target_url)
    page_parser = ContentParser(target_url)
    page_parser.do_all()
    print('Report has been generated')

# How to use
# Save this code in a Python file, e.g. python_data_parsing.py
# Then run it with: python3 python_data_parsing.py http://example.com
# If no error is reported, a text file is generated in the current working directory (the folder the command is run from)
# Multiple URLs are supported, separated by spaces
# For example:
# python3 python_data_parsing.py http://example.com http://ruddra.com
# Cheers
	import sys
	from urllib.parse import urlparse
	import urllib.request
	import re
	import json
	import datetime


	class ContentParser(object):
	    """Download a web page and extract e-mails, phone numbers and URLs.

	    Typical use is ``ContentParser(url).do_all()``: the page is fetched,
	    every extractor is run and a timestamped text report is written to the
	    current working directory.  Each extractor stores its findings in
	    ``self.data`` as one comma-separated string under the keys ``'emails'``,
	    ``'phone_numbers'``, ``'urls'`` and ``'hyperlinks'``.
	    """

	    # Absolute URL as it appears inside HTML: stop at whitespace, quotes and
	    # angle brackets so trailing markup such as '">text</a>' is not captured.
	    _URL_IN_HTML_RE = re.compile(r'https?://[^\s"\'<>]+')
	    # Value of an href attribute, with or without surrounding quotes.
	    _HREF_RE = re.compile(r'href=[\'"]?([^\'" >]+)')
	    _EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+')
	    # NNN-NNN-NNNN anywhere in the text.  No capture groups, so findall()
	    # yields plain strings; the lookarounds reject longer digit runs.
	    _PHONE_RE = re.compile(r'(?<!\d)\d{3}-\d{3}-\d{4}(?!\d)')

	    _url = None
	    _html = None

	    def __init__(self, url):
	        """Remember *url*; nothing is downloaded until ``get_html`` runs."""
	        self._url = url
	        # Created per instance: a class-level dict would be shared by every
	        # parser, letting one URL's results leak into another's report.
	        self.data = {}

	    def validate_url(self):
	        """Return a match object if the URL starts with http(s)://, else None."""
	        if not isinstance(self._url, str):
	            return None
	        return re.match(r'(https?://[^\s]+)', self._url)

	    def get_html(self):
	        """Fetch the page and store its decoded text in ``self._html``.

	        The charset announced in the Content-Type header is used when there
	        is one, UTF-8 otherwise.  Undecodable bytes are replaced rather than
	        aborting the whole run.

	        Raises:
	            urllib.error.URLError: if the page cannot be retrieved.
	        """
	        with urllib.request.urlopen(self._url) as response:
	            raw = response.read()
	            charset = response.headers.get_content_charset() or 'utf-8'
	        try:
	            self._html = raw.decode(charset, errors='replace')
	        except LookupError:
	            # The server announced a codec Python does not know.
	            self._html = raw.decode('utf-8', errors='replace')

	    def parse_urls_from_html(self):
	        """Store absolute URLs and href targets found in the page."""
	        html = self._html or ''
	        self.data['urls'] = ', '.join(self._URL_IN_HTML_RE.findall(html))
	        self.data['hyperlinks'] = ', '.join(self._HREF_RE.findall(html))

	    def parse_emails(self):
	        """Store every e-mail-like token found in the page."""
	        html = self._html or ''
	        self.data['emails'] = ', '.join(self._EMAIL_RE.findall(html))

	    def parse_phone_numbers(self):
	        """Store every NNN-NNN-NNNN phone number found in the page."""
	        html = self._html or ''
	        self.data['phone_numbers'] = ', '.join(self._PHONE_RE.findall(html))

	    def do_all(self):
	        """Validate, fetch, parse and write the report.

	        Failures are printed instead of raised so that one bad URL does not
	        stop a multi-URL run.

	        Returns:
	            True if the report was written, False otherwise.
	        """
	        try:
	            if not self.validate_url():
	                print('Invalid URL')
	                return False
	            self.get_html()
	            self.parse_emails()
	            self.parse_phone_numbers()
	            self.parse_urls_from_html()
	            self.write_output()
	            return True
	        except Exception as e:
	            print(e)
	            return False

	    def write_output(self):
	        """Write the collected data to ``<dd-mm-YYYY:HH:MM:SS>.txt``.

	        The file is created in the current working directory.
	        """
	        # NOTE(review): ':' is not allowed in Windows file names, and two
	        # reports written within the same second share a name.  The format
	        # is kept as-is so existing report names do not change.
	        stamp = datetime.datetime.now().strftime('%d-%m-%Y:%H:%M:%S')
	        parts = ['Base Url is: {0} \r\n\r\n'.format(self._url)]
	        parts.extend(
	            '{0} : {1} \r\n'.format(key, value)
	            for key, value in self.data.items()
	        )
	        # newline='' keeps the explicit '\r\n' from becoming '\r\r\n' on
	        # Windows; utf-8 avoids encode errors under non-UTF-8 locales.
	        with open('{0}.txt'.format(stamp), 'w',
	                  encoding='utf-8', newline='') as outfile:
	            outfile.write(''.join(parts))

	    def show_data(self):
	        """Return the collected data as a JSON string."""
	        return json.dumps(self.data)

	# Command-line driver: build one report for each URL passed as an argument.
	# The loop body is indented again; the flattened original did not parse.
	for use_url in sys.argv[1:]:
	    print('Base URL is %s' % use_url)
	    con_par = ContentParser(use_url)
	    con_par.do_all()
	    print('Report has been generated')

	# How to use
	# Save this code in a Python file, e.g. python_data_parsing.py
	# Then run it with: python3 python_data_parsing.py http://example.com
	# If no error is reported, a text file is generated in the current working directory (the folder the command is run from)
	# Multiple URLs are supported, separated by spaces
	# For example:
	# python3 python_data_parsing.py http://example.com http://ruddra.com
	# Cheers