# JoJoseph25/parse_video_info.py

## parse_video_info.py
import requests
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import time

##  modified from https://github.com/sachinrai27/Youtube_video_detail_extraction ##
def parse_video_info(url):
    """Scrape the metadata of one YouTube video page into a dictionary.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup.
    Fields are read from ``<meta>``/``<link>`` tags and, for values that are
    only present in the embedded page data, from regular-expression matches
    on the serialized page.

    Args:
        url: Address of the video page to download.

    Returns:
        dict: An empty dict when the page could not be downloaded or has no
        ``og:title`` meta tag. Otherwise a dict with the keys ``url``,
        ``video_title``, ``hashtag``, ``views``, ``upload_date``,
        ``category``, ``uploader_name``, ``subscribers``, ``description``,
        ``likes``, ``dislike`` and ``duration``. Optional values that cannot
        be found fall back to ``"No tags present"``, ``"Nil Subscribers"``,
        ``"No description available"`` and ``-1`` (likes/dislikes).

    Raises:
        AttributeError, KeyError: If the view-count, upload-date, genre,
            channel-name or duration tag is missing from the page.
        ValueError: If the view count is not an integer or the duration is
            not of the form ``PT[nH][nM][nS]``.
    """
    video_details = {}

    response = _fetch_page(url)
    if response is None:
        # Every download attempt failed: report "no data" to the caller.
        return video_details

    soup = bs(response.content, features="html.parser")

    # ******extracting video title from webpage******
    # A page without an og:title tag is treated as "video data unavailable".
    try:
        title = soup.find('meta', property="og:title").attrs['content']
    except (AttributeError, KeyError):
        return video_details
    video_details["url"] = str(url)
    video_details["video_title"] = str(title)

    # ******extracting tags******
    tag_values = [
        meta.attrs.get("content")
        for meta in soup.find_all('meta', property="og:video:tag")
    ]
    # Skip tags that carry no content attribute so the join cannot fail.
    tag_values = [value for value in tag_values if value is not None]
    if tag_values:
        video_details["hashtag"] = ','.join(tag_values)
    else:
        video_details["hashtag"] = "No tags present"

    # ******extracting number of views******
    views = soup.find("meta", itemprop="interactionCount").attrs['content']
    video_details["views"] = int(views)

    # ******extracting upload date******
    upload_date = soup.find("meta", itemprop='uploadDate').attrs['content']
    video_details["upload_date"] = str(upload_date)

    # ******extracting category******
    genre = soup.find("meta", itemprop='genre').attrs['content']
    video_details["category"] = str(genre)

    # ******extracting channel title******
    channel_title = soup.find("link", itemprop='name').attrs['content']
    video_details["uploader_name"] = str(channel_title)

    # The remaining fields are searched for in the page text; serialize the
    # document once instead of once per field.
    page_source = str(soup)

    # ******using regular expression to extract number of subscribers******
    try:
        subs = re.search(r'subscriberCountText.*subscribers', page_source).group()
        # Keep only the text after the last double quote of the match.
        video_details["subscribers"] = subs[subs.rindex('"') + 1:].strip()
    except (AttributeError, ValueError):
        # AttributeError: pattern not found; ValueError: no quote in match.
        video_details["subscribers"] = "Nil Subscribers"

    # ******using regular expression to extract entire description******
    desc = ""
    desc_match = re.search(r'shortDescription.*isCrawlable', page_source)
    if desc_match is not None:
        desc = desc_match.group()
        # Start after the second double quote (the opening quote of the
        # value) and stop three characters before the last 'i' of the match,
        # i.e. before the separator that precedes "isCrawlable".
        desc = desc[desc.find('"', desc.find('"') + 1) + 1:desc.rindex('i') - 3]
    if not desc:
        video_details["description"] = "No description available"
    else:
        # Turn the escaped newline and ampersand sequences into plain text.
        video_details["description"] = (
            desc.replace('\\n', ' ').replace('\\u0026', '&')
        )

    # ******using regular expression to extract number of likes/dislikes******
    try:
        sentiment = re.search(r'sentimentBarRenderer.*dateText', page_source).group()
        # Skip 10 characters from the start of 'tooltip' (the key plus its
        # '":"' separator) and read up to the first '"}' in the match; the
        # result is expected to look like "<likes> / <dislikes>".
        counts = sentiment[sentiment.find('tooltip') + 10:sentiment.find('"}')]
        likes, dislike = map(int, counts.replace(',', '').split('/'))
    except (AttributeError, ValueError):
        # AttributeError: pattern not found; ValueError: not two integers.
        likes = -1
        dislike = -1
    video_details["likes"] = likes
    video_details["dislike"] = dislike

    # ******extracting duration of video******
    iso_duration = str(soup.find('meta', itemprop='duration').attrs['content'])
    video_details["duration"] = _format_duration(iso_duration)

    return video_details


def _fetch_page(url, attempts=10, delay=1):
    """Download *url*, retrying on connection errors; None if all attempts fail."""
    for attempt in range(attempts):
        try:
            return requests.get(url)
        except ConnectionError:
            # Wait before the next attempt, but not after the last one.
            if attempt < attempts - 1:
                time.sleep(delay)
    return None


def _format_duration(iso_duration):
    """Render a ``PT[nH][nM][nS]`` duration as 'M minutes S seconds' text."""
    match = re.fullmatch(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', iso_duration)
    if match is None:
        raise ValueError("unsupported duration value: %r" % (iso_duration,))
    # Components that are absent from the value count as zero.
    hours, minutes, seconds = (int(part or 0) for part in match.groups())
    # The output has no hours unit, so hours are folded into minutes.
    minutes += hours * 60
    if minutes == 0:
        return str(seconds) + ' seconds'
    return str(minutes) + ' minutes ' + str(seconds) + ' seconds'