Premraj Narkhede premrajnarkhede

## getHTML.py
import requests
def getHTML(url):
    """
    This function takes url as input
    and gives raw html and final url as output
    """
    error = ""
    try:
        response = requests.get(url)
        r.raise_for_status()

## extract_meta.py
from bs4 import BeautifulSoup

def extract_meta(data):
    """Parse raw HTML and begin collecting the page's metadata.

    Args:
        data: Raw HTML markup; passed straight to BeautifulSoup with the
            built-in 'html.parser' backend.

    Intended output, per the original description: title, description and
    keywords.
    NOTE(review): only the title lookup is visible in this excerpt and no
    return statement is shown -- confirm the return shape against the full
    source.
    """
    soup = BeautifulSoup(data, 'html.parser')
    # Starts empty; nothing visible in this excerpt writes to it.
    meta = ""
    # NOTE(review): per the BeautifulSoup docs, soup.title is None when the
    # markup has no <title> tag, so this attribute access would raise
    # AttributeError on such pages -- confirm whether a guard is needed.
    title = soup.title.string

## extract_body_information.py
import lxml.html.clean
from collections import defaultdict
import re
import tldextract
def extract_body_information(data,url):
    """Extract body-level information from a fetched page.

    Args:
        data: Raw HTML of the page.
        url: Final URL of the response.

    Intended output, per the original description: plain text, headings,
    social media accounts and internal links found on the page.
    NOTE(review): no implementation is visible in this excerpt -- confirm
    the return shape against the full source.
    """
    # Intended first step: use the built-in cleaning function in lxml to
    # strip some JavaScript code and attributes.
	import requests
	def getHTML(url):
	"""
	This function takes url as input
	and gives raw html and final url as output
	"""
	error = ""
	try:
	response = requests.get(url)
	r.raise_for_status()
	from bs4 import BeautifulSoup

	def extract_meta(data):
		"""Parse raw HTML and begin collecting the page's metadata.

		Args:
			data: Raw HTML markup; passed straight to BeautifulSoup with the
				built-in 'html.parser' backend.

		Intended output, per the original description: title, description
		and keywords.
		NOTE(review): only the title lookup is visible in this excerpt and
		no return statement is shown -- confirm the return shape against the
		full source.
		"""
		soup = BeautifulSoup(data, 'html.parser')
		# Starts empty; nothing visible in this excerpt writes to it.
		meta = ""
		# NOTE(review): per the BeautifulSoup docs, soup.title is None when
		# the markup has no <title> tag, so this attribute access would raise
		# AttributeError on such pages -- confirm whether a guard is needed.
		title = soup.title.string
	import lxml.html.clean
	from collections import defaultdict
	import re
	import tldextract
	def extract_body_information(data, url):
		"""Extract body-level information from a fetched page.

		Args:
			data: Raw HTML of the page.
			url: Final URL of the response.

		Intended output, per the original description: plain text, headings,
		social media accounts and internal links found on the page.
		NOTE(review): no implementation is visible in this excerpt -- confirm
		the return shape against the full source.
		"""
		# Intended first step: use the built-in cleaning function in lxml to
		# strip some JavaScript code and attributes.