Skip to content

Instantly share code, notes, and snippets.

@psyaro
Created November 26, 2019 09:00
Show Gist options
  • Save psyaro/09e7a467e0a57b76f2d3067b1669a094 to your computer and use it in GitHub Desktop.
Save psyaro/09e7a467e0a57b76f2d3067b1669a094 to your computer and use it in GitHub Desktop.
youtube_subtitles_collecter
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from datetime import timedelta
import pandas as pd
def get(url='https://www.youtube.com/watch?v=2rfVAGzihrg',
        driver_path="xxxxxxxxxxxxxxxxx"):
    """Load a YouTube watch page in headless Chrome and return its HTML.

    After the page loads, the function clicks the button whose aria-label is
    'その他の操作' (Japanese for "More actions") and then the second
    ``ytd-menu-service-item-renderer`` entry of the menu that opens, before
    capturing the page source.

    NOTE(review): the second menu entry is presumably "open transcript" --
    confirm against the live page. The aria-label lookup only matches when
    YouTube serves its UI in Japanese.

    Args:
        url: Address of the watch page to open. Defaults to the video the
            script was originally written for.
        driver_path: Filesystem path of the chromedriver executable. The
            default is a placeholder and must be replaced with a real path.

    Returns:
        The page source (HTML string) as reported by the driver after the
        two clicks.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--window-size=1024,768')
    with webdriver.Chrome(executable_path=driver_path, chrome_options=options) as d:
        d.get(url)
        # The implicit wait is a driver-wide setting: every later
        # find_element call polls for up to 2 seconds before failing.
        d.implicitly_wait(2)
        menu_button = d.find_element_by_xpath("//button[@aria-label='その他の操作']")
        # Diagnostic output: shows which button was actually matched.
        print(menu_button.get_attribute('outerHTML'))
        menu_button.click()
        # Short pause so the popup menu can render before it is queried.
        time.sleep(0.5)
        d.find_element_by_css_selector('ytd-menu-service-item-renderer:nth-child(2)').click()
        # Short pause so the panel opened by the click can populate.
        time.sleep(0.5)
        return d.page_source
def main():
    """Scrape the transcript cues from the page and save them to temp.csv.

    The HTML returned by ``get()`` is parsed with BeautifulSoup; every element
    matching ``.cue.style-scope.ytd-transcript-body-renderer`` yields one row
    of ``[seconds, timedelta, text]``. Each row is echoed to stdout and the
    collected rows are written to ``temp.csv`` without an index column.
    """
    soup = BeautifulSoup(get(), 'lxml')
    rows = []
    for cue in soup.select('.cue.style-scope.ytd-transcript-body-renderer'):
        # start-offset is divided by 1000 and then treated as seconds, so it
        # is presumably expressed in milliseconds -- TODO confirm.
        seconds = int(cue.get('start-offset')) / 1000
        offset = timedelta(seconds=seconds)
        # Remove the newline-plus-space runs, then drop the first two
        # characters of what remains (presumably leading padding).
        text = cue.string.replace('\n ', '')[2:]
        print(seconds, text)
        rows.append([seconds, offset, text])
    frame = pd.DataFrame(rows)
    frame.to_csv('temp.csv', index=None)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment