@premrajnarkhede
Created January 22, 2020 07:00
import re
from collections import defaultdict

import lxml.html.clean
import tldextract
from bs4 import BeautifulSoup

def extract_body_information(data, url):
    """
    Take raw HTML data and the final URL of the response as input;
    return the plain text, headings, social media accounts and
    internal links found on the page.
    """
    # Use lxml's built-in cleaner to strip javascript code, attributes
    # and HTML irregularities
    clean_html = lxml.html.clean.clean_html(data)
    soup = BeautifulSoup(clean_html, 'html.parser')
    # Extract headings from the cleaned HTML
    headings = defaultdict(list)
    for tag in ["h1", "h2", "h3", "h4"]:
        for match in soup.find_all(tag):
            headings[tag].append(match.text)
    # Strip any remaining tags to get plain text
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', clean_html)
    # Extract the registered domain from the URL to identify
    # inward-pointing links
    td = tldextract.extract(url)
    domain = td.registered_domain
    sm_sites = ['twitter.com', 'facebook.com', 'linkedin.com']
    # Build a soup of the raw data for link extraction
    soup = BeautifulSoup(data, 'html.parser')
    all_links = soup.find_all('a', href=True)
    social_media_accounts = defaultdict(list)
    links_to_follow = []
    for link in all_links:
        href = link.attrs['href']
        # Collect links pointing to social media accounts
        for sm_site in sm_sites:
            if sm_site in href:
                social_media_accounts[sm_site].append(href)
        # Collect links within the page's own domain for further crawling
        if domain in href:
            links_to_follow.append(href)
    return cleantext, headings, social_media_accounts, links_to_follow
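
A minimal usage sketch (assumes beautifulsoup4, lxml, tldextract and requests are installed; the target URL and the requests-based fetch are illustrative, not part of the gist):

import requests

response = requests.get("https://example.com")  # hypothetical target URL
text, headings, accounts, links = extract_body_information(response.text, response.url)
print(headings["h1"])            # texts of all <h1> headings on the page
print(accounts["twitter.com"])   # any Twitter profile links found
print(len(links))                # internal links available for further crawling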