Skip to content

Instantly share code, notes, and snippets.

@ruddra
Last active October 3, 2018 14:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ruddra/658b44dd04956a9d8cdd to your computer and use it in GitHub Desktop.
Save ruddra/658b44dd04956a9d8cdd to your computer and use it in GitHub Desktop.
import sys
from urllib.parse import urlparse
import urllib.request
import re
import json
import datetime
class ContentParser(object):
    """Download a web page and extract URLs, hyperlinks, e-mails and phone numbers.

    Typical use is ``ContentParser(url).do_all()``: the URL is validated, the
    page is fetched, every extractor is run and a timestamped text report is
    written relative to the current working directory.

    Extracted values are stored in ``self.data`` as comma-separated strings
    under the keys ``'emails'``, ``'phone_numbers'``, ``'urls'`` and
    ``'hyperlinks'``.
    """

    # Patterns are compiled once at class creation rather than on every call.
    _URL_RE = re.compile(r'(https?://[^\s]+)')
    _HREF_RE = re.compile(r'href=[\'"]?([^\'" >]+)')
    _EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+')
    # Deliberately has no capture groups (so findall() yields whole numbers as
    # plain strings that can be joined) and no ^/$ anchors (so numbers are
    # found anywhere inside the page, not only when the page IS a number).
    _PHONE_RE = re.compile(r'\b\d{3}-\d{3}-\d{4}\b')

    _url = None
    _html = None

    def __init__(self, url):
        """Remember *url* and start with an empty result store."""
        self._url = url
        # Per-instance dict: a class-level dict would be shared by every
        # parser, leaking one URL's results into another's report.
        self.data = {}

    def validate_url(self):
        """Return a match object if the URL begins with http:// or https://, else None."""
        return self._URL_RE.match(self._url)

    def get_html(self) -> None:
        """Fetch the page at the stored URL and keep its body, decoded as UTF-8.

        Raises whatever ``urlopen`` raises on network failure, and
        ``UnicodeDecodeError`` if the body is not valid UTF-8.
        """
        with urllib.request.urlopen(self._url) as response:
            raw_body = response.read()
        self._html = raw_body.decode('utf-8')

    def parse_urls_from_html(self) -> None:
        """Store absolute http(s) URLs and all ``href`` targets found in the page."""
        self.data['urls'] = ', '.join(self._URL_RE.findall(self._html))
        self.data['hyperlinks'] = ', '.join(self._HREF_RE.findall(self._html))

    def parse_emails(self) -> None:
        """Store every e-mail-like token (``something@something``) found in the page."""
        self.data['emails'] = ', '.join(self._EMAIL_RE.findall(self._html))

    def parse_phone_numbers(self) -> None:
        """Store every ``NNN-NNN-NNNN`` phone number found in the page."""
        self.data['phone_numbers'] = ', '.join(self._PHONE_RE.findall(self._html))

    def do_all(self) -> None:
        """Validate, fetch, parse and write the report; print any error instead of raising."""
        # Top-level boundary for the command-line tool: a failure on one URL
        # is reported on stdout and must not abort the remaining URLs.
        try:
            if not self.validate_url():
                print('Invalid URL')
                return
            self.get_html()
            self.parse_emails()
            self.parse_phone_numbers()
            self.parse_urls_from_html()
            self.write_output()
        except Exception as e:
            print(e)

    def write_output(self) -> None:
        """Write the base URL and every collected entry to ``<timestamp>.txt``.

        The file name is the current local time formatted as
        ``dd-mm-YYYY:HH:MM:SS`` and is opened relative to the working directory.
        """
        # NOTE(review): the ':' characters in the name are not accepted by
        # Windows file systems, and two reports written within the same second
        # share a name, so the later one overwrites the earlier. The format is
        # kept unchanged here because existing users may rely on it.
        timestamp = datetime.datetime.now().strftime('%d-%m-%Y:%H:%M:%S')
        pieces = ['Base Url is: {0} {1}{2}'.format(self._url, '\r\n', '\r\n')]
        pieces.extend(
            '{0} : {1} {2}'.format(key, value, '\r\n')
            for key, value in self.data.items()
        )
        # The with-statement closes the file; no explicit close() is needed.
        with open('{0}.txt'.format(timestamp), 'w') as outfile:
            outfile.write(''.join(pieces))

    def show_data(self) -> str:
        """Return the collected data serialized as a JSON object string."""
        return json.dumps(self.data)
# Command-line driver: every argument after the script name is treated as a
# URL, and one report is produced per URL.
for target_url in sys.argv[1:]:
    print('Base URL is %s' % target_url)
    ContentParser(target_url).do_all()
    # do_all() reports its own errors on stdout and does not raise, so this
    # line is reached for every URL.
    print('Report has been generated')
# How to Use
# Save this code in a Python file, e.g. python_data_parsing.py
# now run the file with this command: python3 python_data_parsing.py http://example.com
# If it doesn't throw any error, a text file will be generated within the folder which contains python_data_parsing.py file
# Multiple URLs are supported, separated by a space ' '
# For Example:
# python3 python_data_parsing.py http://example.com http://ruddra.com
# Cheers
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment