@premrajnarkhede
Created January 22, 2020 07:00
import re
from collections import defaultdict

import lxml.html.clean
import tldextract
from bs4 import BeautifulSoup

def extract_body_information(data, url):
    """
    Take raw HTML data and the final URL of the response as input;
    return the plain text, headings, social media accounts and
    internal links found on the page.
    """
    # Use lxml's built-in cleaner to strip javascript code, attributes
    # and HTML irregularities
    clean_html = lxml.html.clean.clean_html(data)
    soup = BeautifulSoup(clean_html, 'html.parser')
    # Extract headings from the cleaned HTML
    headings = defaultdict(list)
    for tag in ["h1", "h2", "h3", "h4"]:
        for match in soup.find_all(tag):
            headings[tag].append(match.text)
    # Strip any remaining tags to get plain text
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', clean_html)
    # Extract the registered domain from the URL to identify
    # inward-pointing links
    td = tldextract.extract(url)
    domain = td.registered_domain
    sm_sites = ['twitter.com', 'facebook.com', 'linkedin.com']
    # Build a soup of the raw data for link extraction
    soup = BeautifulSoup(data, 'html.parser')
    all_links = soup.find_all('a', href=True)
    social_media_accounts = defaultdict(list)
    links_to_follow = []
    for link in all_links:
        href = link.attrs['href']
        # Collect links pointing to social media accounts
        for sm_site in sm_sites:
            if sm_site in href:
                social_media_accounts[sm_site].append(href)
        # Collect links within the page's own domain for further crawling
        if domain in href:
            links_to_follow.append(href)
    return cleantext, headings, social_media_accounts, links_to_follow
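
A minimal usage sketch (assumes beautifulsoup4, lxml, tldextract and requests are installed; the target URL and the requests-based fetch are illustrative, not part of the gist):

import requests

response = requests.get("https://example.com")  # hypothetical target URL
text, headings, accounts, links = extract_body_information(response.text, response.url)
print(headings["h1"])            # texts of all <h1> headings on the page
print(accounts["twitter.com"])   # any Twitter profile links found
print(len(links))                # internal links available for further crawling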