Skip to content

Instantly share code, notes, and snippets.

@NotSoSuper
Last active February 19, 2018 04:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save NotSoSuper/129af3c397cff142fad1ea9564b3533f to your computer and use it in GitHub Desktop.
from lxml import etree
#http://stackoverflow.com/a/34084933
def get_deep_text(element):
    """Recursively gather all text contained in *element* and its subtree.

    Concatenates ``element.text``, the deep text of every child element,
    and ``element.tail``, treating a missing (``None``) text/tail as ''.
    Adapted from http://stackoverflow.com/a/34084933

    :param element: an lxml / ElementTree element
    :return: the concatenated text, or '' if extraction fails
    """
    try:
        text = element.text or ''
        for subelement in element:
            text += get_deep_text(subelement)
        text += element.tail or ''
        return text
    except Exception:
        # BUG FIX: the original bare ``except:`` also swallowed
        # KeyboardInterrupt/SystemExit; keep the best-effort '' fallback
        # but only for ordinary exceptions.
        return ''
async def youtube_scrap(self, search: str, safety=False):
    """Scrape the YouTube search page for *search*.

    :param search: raw search query (URL-quoted internally)
    :param safety: if True, send a PREF cookie — presumably enabling
        YouTube's restricted mode (TODO confirm against YouTube docs)
    :return: ``[title, url]`` for the first parseable result,
        otherwise ``False``
    :raises AssertionError: when the HTTP status is not 200
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:43.0) Gecko/20100101 Firefox/43.0'}
    api = 'https://www.youtube.com/results?search_query={0}'.format(quote(search))
    cookies = {'PREF': 'cvdm=grid&al=en&f4=4000000&f5=30&f1=50000000&f2=8000000'} if safety else None
    # BUG FIX: ClientSession must be entered with ``async with``; the
    # original plain ``with`` is unsupported by aiohttp's session API.
    async with aiohttp.ClientSession(cookies=cookies) as session:
        with aiohttp.Timeout(5):
            async with session.get(api, headers=headers) as r:
                # NOTE(review): ``assert`` is stripped under -O; kept so
                # callers catching AssertionError still work, but an
                # explicit raise would be safer.
                assert r.status == 200
                txt = await r.text()
    root = etree.fromstring(txt, etree.HTMLParser())
    search_nodes = root.findall(".//ol[@class='section-list']/li/ol[@class='item-section']/li")
    # BUG FIX: guard the pop — an empty result page used to raise
    # IndexError here instead of returning False.
    if search_nodes:
        # Skip the first <li>, which the original scraper discarded
        # (presumably a non-result entry — TODO confirm against markup).
        search_nodes.pop(0)
    for node in search_nodes:
        url_node = node.find('div/div/div/h3/a')
        if url_node is None:
            continue
        try:
            title = get_deep_text(url_node)
            # BUG FIX: hrefs already begin with '/'; the original
            # 'https://www.youtube.com/{0}' produced a '//watch...' URL.
            url = 'https://www.youtube.com{0}'.format(url_node.attrib['href'])
        except (KeyError, AttributeError):
            # Narrowed from a bare ``except:`` — only attribute/key
            # lookups can fail here; skip malformed nodes.
            continue
        return [title, url]
    return False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment