Last active
October 3, 2018 14:24
-
-
Save ruddra/658b44dd04956a9d8cdd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from urllib.parse import urlparse | |
import urllib.request | |
import re | |
import json | |
import datetime | |
class ContentParser(object):
    """Download a web page and extract URLs, hyperlinks, e-mails and phone numbers.

    Typical use is ``ContentParser(url).do_all()``, which validates the URL,
    fetches the page, runs every extractor and writes a timestamped text
    report. Each extractor stores its matches in ``self.data`` as a single
    comma-separated string under its own key (``'urls'``, ``'hyperlinks'``,
    ``'emails'``, ``'phone_numbers'``).
    """

    _url = None
    _html = None

    # Regular expressions used by the extractors.
    _URL_PATTERN = r'(https?://[^\s]+)'
    _HREF_PATTERN = r'href=[\'"]?([^\'" >]+)'
    _EMAIL_PATTERN = r'[\w\.-]+@[\w\.-]+'
    # NNN-NNN-NNNN anywhere in the text. The look-arounds stop the pattern
    # from matching inside a longer run of digits, and there are no capture
    # groups so ``re.findall`` yields plain strings that can be joined.
    _PHONE_PATTERN = r'(?<!\d)\d{3}-\d{3}-\d{4}(?!\d)'

    def __init__(self, url):
        """Remember *url* and start with an empty, per-instance result dict."""
        self._url = url
        # Must be created per instance: a class-level dict would be shared,
        # leaking results from one parsed URL into the next one's report.
        self.data = {}

    def validate_url(self):
        """Return a match object if the URL starts with http:// or https://, else None."""
        return re.match(self._URL_PATTERN, self._url)

    def get_html(self):
        """Fetch the page at ``self._url`` and store its decoded text in ``self._html``.

        The charset announced in the response headers is used when present,
        falling back to UTF-8. Undecodable bytes are replaced instead of
        aborting the whole run.

        Raises:
            urllib.error.URLError: if the page cannot be retrieved.
        """
        with urllib.request.urlopen(self._url) as response:
            raw = response.read()
            charset = response.headers.get_content_charset() or 'utf-8'
        try:
            self._html = raw.decode(charset, errors='replace')
        except LookupError:
            # The server announced a charset Python does not know.
            self._html = raw.decode('utf-8', errors='replace')

    def parse_urls_from_html(self):
        """Store absolute http(s) URLs and ``href`` targets found in the page."""
        self.data['urls'] = ', '.join(re.findall(self._URL_PATTERN, self._html))
        self.data['hyperlinks'] = ', '.join(re.findall(self._HREF_PATTERN, self._html))

    def parse_emails(self):
        """Store every e-mail-like token found in the page."""
        self.data['emails'] = ', '.join(re.findall(self._EMAIL_PATTERN, self._html))

    def parse_phone_numbers(self):
        """Store every NNN-NNN-NNNN phone number found in the page."""
        self.data['phone_numbers'] = ', '.join(re.findall(self._PHONE_PATTERN, self._html))

    def do_all(self):
        """Validate, fetch, parse and write the report.

        Prints ``'Invalid URL'`` when validation fails. Any exception raised
        along the way is printed rather than propagated, so one bad URL does
        not stop a batch run.
        """
        try:
            if self.validate_url():
                self.get_html()
                self.parse_emails()
                self.parse_phone_numbers()
                self.parse_urls_from_html()
                self.write_output()
            else:
                print('Invalid URL')
        except Exception as e:
            print(e)

    def write_output(self):
        """Write the collected data to ``<dd-mm-YYYY:HH:MM:SS>.txt``.

        The path is relative, so the file is created in the current working
        directory.
        """
        # NOTE(review): the colons in the file name are likely a problem on
        # Windows file systems; the name is kept for backward compatibility.
        file_name = '{0}.txt'.format(datetime.datetime.now().strftime('%d-%m-%Y:%H:%M:%S'))
        parts = ['Base Url is: {0} {1}{2}'.format(self._url, '\r\n', '\r\n')]
        parts.extend(
            '{0} : {1} {2}'.format(key, value, '\r\n')
            for key, value in self.data.items()
        )
        # Explicit UTF-8 so non-ASCII page content cannot fail on the locale
        # default encoding; the ``with`` block closes the file on exit.
        with open(file_name, 'w', encoding='utf-8') as outfile:
            outfile.write(''.join(parts))

    def show_data(self):
        """Return the collected data serialized as a JSON string."""
        return json.dumps(self.data)
# Command-line driver: every argument after the script name is treated as a
# URL and gets its own ContentParser run.
for use_url in sys.argv[1:]:
    print('Base URL is %s' % use_url)
    con_par = ContentParser(use_url)
    con_par.do_all()
    print('Report has been generated')
# How to use
# Save this code in a Python file, e.g. python_data_parsing.py
# Then run the file with this command: python3 python_data_parsing.py http://example.com
# If it doesn't report any error, a text file will be generated in the directory the command is run from
# Multiple URLs are supported, separated by a space ' '
# For example:
# python3 python_data_parsing.py http://example.com http://ruddra.com
# Cheers
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment