Skip to content

Instantly share code, notes, and snippets.

@JoJoseph25
Last active June 28, 2021 04:15
Show Gist options
  • Save JoJoseph25/de470a2b9092c487229dc85ef07e410b to your computer and use it in GitHub Desktop.
Save JoJoseph25/de470a2b9092c487229dc85ef07e410b to your computer and use it in GitHub Desktop.
Fetch YouTube video details from a URL using BeautifulSoup
import requests
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import time
## modified from https://github.com/sachinrai27/Youtube_video_detail_extraction ##
def parse_video_info(url):
    """Scrape a video page and return its details as a flat dict.

    The page at *url* is downloaded (retrying on connection errors) and
    parsed with BeautifulSoup. Values are read from ``<meta>``/``<link>``
    tags and from text patterns embedded in the page source.

    Args:
        url: Address of the video page to fetch.

    Returns:
        An empty dict when the page has no ``og:title`` meta tag; otherwise
        a dict with the keys ``url``, ``video_title``, ``hashtag``,
        ``views``, ``upload_date``, ``category``, ``uploader_name``,
        ``subscribers``, ``description``, ``likes``, ``dislike`` and
        ``duration`` (inserted in that order). ``likes``/``dislike`` are
        ``-1`` when the counts cannot be extracted.

    Raises:
        ConnectionError: If all 10 download attempts fail.
        AttributeError, KeyError: If the views, upload date, genre, channel
            name or duration tag is missing or has no ``content``.
        ValueError: If the view count or the minutes part of the duration
            is not an integer.
    """
    video_details = {}

    # Download the page, retrying transient connection failures with a
    # one-second pause; the last failure is re-raised to the caller.
    nb_tries = 10
    while True:
        nb_tries -= 1
        try:
            response = requests.get(url)
            break
        except ConnectionError:
            if nb_tries == 0:
                raise
            time.sleep(1)

    soup = bs(response.content, features="html.parser")

    # ****** video title ******
    # No og:title tag means there is nothing to extract from this page.
    try:
        title = soup.find('meta', property="og:title").attrs['content']
    except (AttributeError, KeyError):
        return video_details
    video_details["url"] = str(url)
    video_details["video_title"] = str(title)

    # ****** tags ******
    tags = soup.find_all('meta', property="og:video:tag")
    if not tags:
        video_details["hashtag"] = "No tags present"
    else:
        video_details["hashtag"] = ','.join(
            [meta.attrs.get("content") for meta in tags]
        )

    # ****** number of views ******
    view = soup.find("meta", itemprop="interactionCount").attrs['content']
    video_details["views"] = int(view)

    # ****** upload date ******
    date = soup.find("meta", itemprop='uploadDate').attrs['content']
    video_details["upload_date"] = str(date)

    # ****** category ******
    genre = soup.find("meta", itemprop='genre').attrs['content']
    video_details["category"] = str(genre)

    # ****** channel title ******
    channel_title = soup.find("link", itemprop='name').attrs['content']
    video_details["uploader_name"] = str(channel_title)

    # The remaining fields are pulled out of the raw page text; serialise
    # the document once and reuse it for every pattern search.
    page_text = str(soup)

    # ****** number of subscribers ******
    subs_match = re.search(r'subscriberCountText.*subscribers', page_text)
    if subs_match is None:
        video_details["subscribers"] = "Nil Subscribers"
    else:
        subs = subs_match.group()
        try:
            # Keep only the text after the last double quote in the match.
            video_details["subscribers"] = subs[subs.rindex('"') + 1:].strip()
        except ValueError:
            video_details["subscribers"] = "Nil Subscribers"

    # ****** full description ******
    desc_match = re.search(r'shortDescription.*isCrawlable', page_text)
    if desc_match is None:
        desc = ""
    else:
        desc = desc_match.group()
        # Start just after the second double quote (the opening quote of
        # the value) and stop three characters before the last 'i', i.e.
        # before the '","' that precedes "isCrawlable".
        start = desc.find('"', desc.find('"') + 1) + 1
        desc = desc[start:desc.rindex('i') - 3]
    if not desc:
        video_details["description"] = "No description available"
    else:
        video_details["description"] = (
            desc.replace('\\n', ' ').replace('\\u0026', '&')
        )

    # ****** number of likes / dislikes ******
    likes = -1
    dislike = -1
    sentiment_match = re.search(r'sentimentBarRenderer.*dateText', page_text)
    if sentiment_match is not None:
        sentiment = sentiment_match.group()
        # The counts sit between 'tooltip":"' (10 characters) and the
        # first '"}' and look like "<likes> / <dislikes>".
        interact_list = sentiment[
            sentiment.find('tooltip') + 10:sentiment.find('"}')
        ]
        try:
            likes, dislike = map(
                int, interact_list.replace(',', '').split('/')
            )
        except ValueError:
            likes = -1
            dislike = -1
    video_details["likes"] = likes
    video_details["dislike"] = dislike

    # ****** duration of video ******
    # Named `duration` so the local does not shadow the `time` module
    # needed by the retry loop above.
    duration = str(soup.find('meta', itemprop='duration').attrs['content'])
    minutes = duration[duration.find('T') + 1:duration.find('M')]
    seconds = duration[duration.find('M') + 1:duration.find('S')]
    if int(minutes) == 0:
        video_details["duration"] = seconds + ' seconds'
    else:
        video_details["duration"] = (
            minutes + ' minutes ' + seconds + ' seconds'
        )
    return video_details
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment