Skip to content

Instantly share code, notes, and snippets.

View premrajnarkhede's full-sized avatar

Premraj Narkhede premrajnarkhede

View GitHub Profile
import requests
def getHTML(url):
"""
This function takes url as input
and gives raw html and final url as output
"""
error = ""
try:
response = requests.get(url)
r.raise_for_status()
from bs4 import BeautifulSoup
def extract_meta(data):
"""
Parse raw HTML and pull page metadata out of it.

This function takes raw html data as input
and gives title, description, keywords as output.

NOTE(review): only the title lookup is visible in this view; the
description/keywords extraction and the return statement are not shown.
"""
# Build the parse tree using the 'html.parser' backend.
soup = BeautifulSoup(data, 'html.parser')
meta = ""
# NOTE(review): if the document has no <title> tag, `soup.title` is
# presumably None and this attribute access would raise AttributeError;
# confirm whether the unseen remainder guards against that.
title = soup.title.string
import lxml.html.clean
from collections import defaultdict
import re
import tldextract
def extract_body_information(data,url):
"""
This function takes raw html data and final url of response as input
and gives plain text, headings, social media accounts and internal links on the page as output
"""
#Using built in function in lxml to clean some javascript code, attributes