Skip to content

Instantly share code, notes, and snippets.

@NotSoSuper
Last active February 19, 2018 04:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save NotSoSuper/129af3c397cff142fad1ea9564b3533f to your computer and use it in GitHub Desktop.
from lxml import etree
#http://stackoverflow.com/a/34084933
def get_deep_text(element):
    """Recursively gather all text contained in *element* and its subtree.

    Concatenates ``element.text``, the deep text of every child element,
    and ``element.tail``, treating a missing (``None``) text/tail as ''.
    Adapted from http://stackoverflow.com/a/34084933

    :param element: an lxml / ElementTree element
    :return: the concatenated text, or '' if extraction fails
    """
    try:
        text = element.text or ''
        for subelement in element:
            text += get_deep_text(subelement)
        text += element.tail or ''
        return text
    except Exception:
        # BUG FIX: the original bare ``except:`` also swallowed
        # KeyboardInterrupt/SystemExit; keep the best-effort '' fallback
        # but only for ordinary exceptions.
        return ''
async def youtube_scrap(self, search: str, safety=False):
    """Scrape the YouTube search page for *search*.

    :param search: raw search query (URL-quoted internally)
    :param safety: if True, send a PREF cookie — presumably enabling
        YouTube's restricted mode (TODO confirm against YouTube docs)
    :return: ``[title, url]`` for the first parseable result,
        otherwise ``False``
    :raises AssertionError: when the HTTP status is not 200
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:43.0) Gecko/20100101 Firefox/43.0'}
    api = 'https://www.youtube.com/results?search_query={0}'.format(quote(search))
    cookies = {'PREF': 'cvdm=grid&al=en&f4=4000000&f5=30&f1=50000000&f2=8000000'} if safety else None
    # BUG FIX: ClientSession must be entered with ``async with``; the
    # original plain ``with`` is unsupported by aiohttp's session API.
    async with aiohttp.ClientSession(cookies=cookies) as session:
        with aiohttp.Timeout(5):
            async with session.get(api, headers=headers) as r:
                # NOTE(review): ``assert`` is stripped under -O; kept so
                # callers catching AssertionError still work, but an
                # explicit raise would be safer.
                assert r.status == 200
                txt = await r.text()
    root = etree.fromstring(txt, etree.HTMLParser())
    search_nodes = root.findall(".//ol[@class='section-list']/li/ol[@class='item-section']/li")
    # BUG FIX: guard the pop — an empty result page used to raise
    # IndexError here instead of returning False.
    if search_nodes:
        # Skip the first <li>, which the original scraper discarded
        # (presumably a non-result entry — TODO confirm against markup).
        search_nodes.pop(0)
    for node in search_nodes:
        url_node = node.find('div/div/div/h3/a')
        if url_node is None:
            continue
        try:
            title = get_deep_text(url_node)
            # BUG FIX: hrefs already begin with '/'; the original
            # 'https://www.youtube.com/{0}' produced a '//watch...' URL.
            url = 'https://www.youtube.com{0}'.format(url_node.attrib['href'])
        except (KeyError, AttributeError):
            # Narrowed from a bare ``except:`` — only attribute/key
            # lookups can fail here; skip malformed nodes.
            continue
        return [title, url]
    return False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment