/YouTube-Playlists-Subtitles.ipynb
Forked from pshapiro/YouTube-Playlists-Subtitles.ipynb
Created Oct 24, 2019
Download auto-generated subtitles from a YouTube playlist and do a term frequency analysis
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Word Frequencies from YouTube Playlists" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Library imports and function to extract videos from playlist" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from youtube_transcript_api import YouTubeTranscriptApi\n", | |
"from nltk.corpus import stopwords \n", | |
"from nltk.tokenize import word_tokenize \n", | |
"from bs4 import BeautifulSoup\n", | |
"import pandas as pd\n", | |
"import requests\n", | |
"import re\n", | |
"\n", | |
"video_list = []\n", | |
"def playlistUrls(list, url):\n", | |
" sourceCode = requests.get(url).text\n", | |
" soup = BeautifulSoup(sourceCode, 'html.parser')\n", | |
" domain = 'https://www.youtube.com'\n", | |
" for link in soup.find_all(\"a\", {\"dir\": \"ltr\"}):\n", | |
" href = link.get('href')\n", | |
" if href.startswith('/watch?'):\n", | |
" list.append(domain + href)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"User inputs playlist URL" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"playlist_url = \"https://www.youtube.com/playlist?list=PLbKcy9p3mh_HGUdL2u8mrNi_ZNaHgNoja\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Execute function to extract videos from playlist. Iterate over list and download auto-generated subtitles." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"nOJhzSEMf4I\n", | |
"4kp687KxgI4\n", | |
"X-npNde0c1I\n", | |
"PrMh2hSYxw4\n", | |
"vG_llu1Fzeg\n", | |
"ZOFAopeC4SM\n", | |
"okb-xGMvuCU\n", | |
"cGVqD9IUq80\n", | |
"2xdhqCs6TjE\n", | |
"5EBjscrTU7c\n", | |
"O4gnrx2Rh5U\n", | |
"ZOefXB0jflc\n", | |
"2Yywh-z2tk4\n", | |
"RdevkvzqlMM\n", | |
"Vw_e63HfsaY\n", | |
"_71WrcOFq_A\n", | |
"Y3v5gcV5Hu0\n", | |
"o2BPboxtJnw\n", | |
"yynPyQ_PDoM\n", | |
"BAupV8QH-Zg\n", | |
"qRY0i_SShBc\n", | |
"rK-jJAJfdfg\n", | |
"UppcaSJPPMo\n", | |
"QwjKWXxNFYE\n", | |
"14A4_YQJGG4\n", | |
"Uqsf8EQEoGc\n" | |
] | |
} | |
], | |
"source": [ | |
"subtitle_list = []\n", | |
"\n", | |
"playlistUrls(video_list, playlist_url)\n", | |
"\n", | |
"for x in video_list:\n", | |
" video_id = re.search('watch\\?v=([a-zA-Z0-9\\-\\_]*)', x).group(1)\n", | |
" print(video_id)\n", | |
" get_subs = YouTubeTranscriptApi.get_transcript(video_id)\n", | |
" subtitle_list.append(\" \".join([x[\"text\"] for x in get_subs]))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Combine all subtitle text, remove stop words with NLTK, join the remaining words back together and determine word frequencies" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"all_text = \" \".join([x for x in subtitle_list])\n", | |
"stop_words = set(stopwords.words('english')) \n", | |
"word_tokens = word_tokenize(all_text) \n", | |
"filtered_sentence = [w for w in word_tokens if not w in stop_words] \n", | |
" \n", | |
"for w in word_tokens: \n", | |
" if w not in stop_words: \n", | |
" filtered_sentence.append(w) \n", | |
" \n", | |
"stop_words_removed = \" \".join([x for x in filtered_sentence])\n", | |
"\n", | |
"frequency = {}\n", | |
"text_string = stop_words_removed.lower()\n", | |
"match_pattern = re.findall(r'\\b[a-z]{3,15}\\b', text_string)\n", | |
" \n", | |
"for word in match_pattern:\n", | |
" count = frequency.get(word,0)\n", | |
" frequency[word] = count + 1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Word frequency to dataframe and sort descending" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame(list(frequency.items()), columns=['Word', 'Frequency'])\n", | |
"df.sort_values(\"Frequency\", axis = 0, ascending = False, inplace = True, na_position ='last')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Print dataframe" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Word</th>\n", | |
" <th>Frequency</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>like</td>\n", | |
" <td>2200</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>55</th>\n", | |
" <td>people</td>\n", | |
" <td>1504</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>49</th>\n", | |
" <td>know</td>\n", | |
" <td>1428</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>172</th>\n", | |
" <td>really</td>\n", | |
" <td>1356</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>259</th>\n", | |
" <td>get</td>\n", | |
" <td>1182</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>112</th>\n", | |
" <td>right</td>\n", | |
" <td>1144</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>one</td>\n", | |
" <td>1052</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>843</th>\n", | |
" <td>want</td>\n", | |
" <td>1016</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>gon</td>\n", | |
" <td>918</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>43</th>\n", | |
" <td>things</td>\n", | |
" <td>804</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>86</th>\n", | |
" <td>google</td>\n", | |
" <td>776</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>168</th>\n", | |
" <td>content</td>\n", | |
" <td>730</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>32</th>\n", | |
" <td>think</td>\n", | |
" <td>716</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>33</th>\n", | |
" <td>actually</td>\n", | |
" <td>704</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>59</th>\n", | |
" <td>see</td>\n", | |
" <td>698</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>349</th>\n", | |
" <td>going</td>\n", | |
" <td>598</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>165</th>\n", | |
" <td>time</td>\n", | |
" <td>592</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38</th>\n", | |
" <td>lot</td>\n", | |
" <td>588</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>369</th>\n", | |
" <td>something</td>\n", | |
" <td>570</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>145</th>\n", | |
" <td>well</td>\n", | |
" <td>566</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>611</th>\n", | |
" <td>make</td>\n", | |
" <td>532</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>646</th>\n", | |
" <td>use</td>\n", | |
" <td>528</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>281</th>\n", | |
" <td>kind</td>\n", | |
" <td>506</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>158</th>\n", | |
" <td>site</td>\n", | |
" <td>500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>388</th>\n", | |
" <td>say</td>\n", | |
" <td>498</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>77</th>\n", | |
" <td>look</td>\n", | |
" <td>496</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>397</th>\n", | |
" <td>way</td>\n", | |
" <td>486</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>534</th>\n", | |
" <td>need</td>\n", | |
" <td>480</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>got</td>\n", | |
" <td>472</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>231</th>\n", | |
" <td>good</td>\n", | |
" <td>452</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2331</th>\n", | |
" <td>colored</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5039</th>\n", | |
" <td>wrapped</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2328</th>\n", | |
" <td>captured</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5041</th>\n", | |
" <td>duplication</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5042</th>\n", | |
" <td>exceptions</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5019</th>\n", | |
" <td>detract</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5018</th>\n", | |
" <td>amole</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5017</th>\n", | |
" <td>archived</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4996</th>\n", | |
" <td>alternatively</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2404</th>\n", | |
" <td>contextualize</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2403</th>\n", | |
" <td>lifecycle</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4986</th>\n", | |
" <td>amongst</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2396</th>\n", | |
" <td>demos</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4989</th>\n", | |
" <td>surefire</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4991</th>\n", | |
" <td>stein</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2394</th>\n", | |
" <td>entertain</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2391</th>\n", | |
" <td>reconfirms</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2390</th>\n", | |
" <td>shiny</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4997</th>\n", | |
" <td>outweigh</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2351</th>\n", | |
" <td>giller</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4999</th>\n", | |
" <td>variant</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2380</th>\n", | |
" <td>enlarge</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2370</th>\n", | |
" <td>invent</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5008</th>\n", | |
" <td>scrutiny</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5009</th>\n", | |
" <td>pvc</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2357</th>\n", | |
" <td>intermix</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5012</th>\n", | |
" <td>tabbed</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2355</th>\n", | |
" <td>mail</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2352</th>\n", | |
" <td>quacks</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7221</th>\n", | |
" <td>triskin</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>7222 rows × 2 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Word Frequency\n", | |
"11 like 2200\n", | |
"55 people 1504\n", | |
"49 know 1428\n", | |
"172 really 1356\n", | |
"259 get 1182\n", | |
"112 right 1144\n", | |
"28 one 1052\n", | |
"843 want 1016\n", | |
"22 gon 918\n", | |
"43 things 804\n", | |
"86 google 776\n", | |
"168 content 730\n", | |
"32 think 716\n", | |
"33 actually 704\n", | |
"59 see 698\n", | |
"349 going 598\n", | |
"165 time 592\n", | |
"38 lot 588\n", | |
"369 something 570\n", | |
"145 well 566\n", | |
"611 make 532\n", | |
"646 use 528\n", | |
"281 kind 506\n", | |
"158 site 500\n", | |
"388 say 498\n", | |
"77 look 496\n", | |
"397 way 486\n", | |
"534 need 480\n", | |
"7 got 472\n", | |
"231 good 452\n", | |
"... ... ...\n", | |
"2331 colored 2\n", | |
"5039 wrapped 2\n", | |
"2328 captured 2\n", | |
"5041 duplication 2\n", | |
"5042 exceptions 2\n", | |
"5019 detract 2\n", | |
"5018 amole 2\n", | |
"5017 archived 2\n", | |
"4996 alternatively 2\n", | |
"2404 contextualize 2\n", | |
"2403 lifecycle 2\n", | |
"4986 amongst 2\n", | |
"2396 demos 2\n", | |
"4989 surefire 2\n", | |
"4991 stein 2\n", | |
"2394 entertain 2\n", | |
"2391 reconfirms 2\n", | |
"2390 shiny 2\n", | |
"4997 outweigh 2\n", | |
"2351 giller 2\n", | |
"4999 variant 2\n", | |
"2380 enlarge 2\n", | |
"2370 invent 2\n", | |
"5008 scrutiny 2\n", | |
"5009 pvc 2\n", | |
"2357 intermix 2\n", | |
"5012 tabbed 2\n", | |
"2355 mail 2\n", | |
"2352 quacks 2\n", | |
"7221 triskin 2\n", | |
"\n", | |
"[7222 rows x 2 columns]" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment