Skip to content

Instantly share code, notes, and snippets.

@pshapiro
Last active February 27, 2023 23:06
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 5 You must be signed in to fork a gist
  • Save pshapiro/e9a2388a2d527c00fb66b57d429b9ed0 to your computer and use it in GitHub Desktop.
Save pshapiro/e9a2388a2d527c00fb66b57d429b9ed0 to your computer and use it in GitHub Desktop.
Download auto-generated subtitles from a YouTube playlist and do a term frequency analysis
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Word Frequencies from YouTube Playlists"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Library imports and function to extract videos from playlist"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from youtube_transcript_api import YouTubeTranscriptApi\n",
"from nltk.corpus import stopwords \n",
"from nltk.tokenize import word_tokenize \n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"import requests\n",
"import re\n",
"\n",
"video_list = []\n",
"\n",
"\n",
"def playlistUrls(list, url, timeout=30):\n",
"    \"\"\"Append the URL of every video linked from a playlist page to `list`.\n",
"\n",
"    The page at `url` is downloaded and parsed, and every anchor with\n",
"    dir=ltr whose href starts with /watch? is appended to `list` as an\n",
"    absolute https://www.youtube.com URL.\n",
"\n",
"    Args:\n",
"        list: List the video URLs are appended to (mutated in place). The\n",
"            name shadows the builtin inside this function; it is kept so\n",
"            that existing callers are unaffected.\n",
"        url: Address of the playlist page to download.\n",
"        timeout: Seconds to wait for the server before requests raises an\n",
"            exception instead of blocking indefinitely.\n",
"\n",
"    Returns:\n",
"        None. Results are delivered by appending to `list`.\n",
"    \"\"\"\n",
"    sourceCode = requests.get(url, timeout=timeout).text\n",
"    soup = BeautifulSoup(sourceCode, 'html.parser')\n",
"    domain = 'https://www.youtube.com'\n",
"    for link in soup.find_all(\"a\", {\"dir\": \"ltr\"}):\n",
"        href = link.get('href')\n",
"        # An anchor without an href attribute yields None, which has no\n",
"        # startswith(); skip it instead of raising AttributeError.\n",
"        if href and href.startswith('/watch?'):\n",
"            list.append(domain + href)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"User inputs playlist URL"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"playlist_url = \"https://www.youtube.com/playlist?list=PLbKcy9p3mh_HGUdL2u8mrNi_ZNaHgNoja\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Execute the function to extract the videos from the playlist, then iterate over the list and download the auto-generated subtitles of each video."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nOJhzSEMf4I\n",
"4kp687KxgI4\n",
"X-npNde0c1I\n",
"PrMh2hSYxw4\n",
"vG_llu1Fzeg\n",
"ZOFAopeC4SM\n",
"okb-xGMvuCU\n",
"cGVqD9IUq80\n",
"2xdhqCs6TjE\n",
"5EBjscrTU7c\n",
"O4gnrx2Rh5U\n",
"ZOefXB0jflc\n",
"2Yywh-z2tk4\n",
"RdevkvzqlMM\n",
"Vw_e63HfsaY\n",
"_71WrcOFq_A\n",
"Y3v5gcV5Hu0\n",
"o2BPboxtJnw\n",
"yynPyQ_PDoM\n",
"BAupV8QH-Zg\n",
"qRY0i_SShBc\n",
"rK-jJAJfdfg\n",
"UppcaSJPPMo\n",
"QwjKWXxNFYE\n",
"14A4_YQJGG4\n",
"Uqsf8EQEoGc\n"
]
}
],
"source": [
"# video_list is created in the first cell; empty it here so that running\n",
"# this cell again does not append the same playlist a second time.\n",
"video_list.clear()\n",
"subtitle_list = []\n",
"\n",
"playlistUrls(video_list, playlist_url)\n",
"\n",
"# The video id is the value of the \"v\" query parameter, wherever it sits in\n",
"# the query string. A raw string avoids invalid-escape warnings.\n",
"video_id_pattern = re.compile(r'[?&]v=([A-Za-z0-9_-]+)')\n",
"\n",
"for video_url in video_list:\n",
"    id_match = video_id_pattern.search(video_url)\n",
"    if id_match is None:\n",
"        # search() returns None when the URL carries no video id; skip the\n",
"        # link rather than fail on .group().\n",
"        print('No video id found in ' + video_url)\n",
"        continue\n",
"    video_id = id_match.group(1)\n",
"    print(video_id)\n",
"    get_subs = YouTubeTranscriptApi.get_transcript(video_id)\n",
"    # Each transcript entry is a mapping; only its \"text\" field is kept.\n",
"    subtitle_list.append(\" \".join(entry[\"text\"] for entry in get_subs))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Combine all subtitle text, remove stop words with NLTK, join the remaining tokens back together and determine the word frequencies"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"all_text = \" \".join(subtitle_list)\n",
"stop_words = set(stopwords.words('english'))\n",
"\n",
"# The stop word list is lower-case, so lower-case the text before\n",
"# tokenizing; otherwise capitalised stop words would not be filtered out.\n",
"word_tokens = word_tokenize(all_text.lower())\n",
"\n",
"# Filter exactly once. Appending the kept tokens to this list a second time\n",
"# would double every count in the frequency table.\n",
"filtered_sentence = [w for w in word_tokens if w not in stop_words]\n",
"\n",
"stop_words_removed = \" \".join(filtered_sentence)\n",
"\n",
"# Count runs of 3 to 15 letters (a-z) delimited by word boundaries.\n",
"frequency = {}\n",
"text_string = stop_words_removed.lower()\n",
"match_pattern = re.findall(r'\\b[a-z]{3,15}\\b', text_string)\n",
"\n",
"for word in match_pattern:\n",
"    frequency[word] = frequency.get(word, 0) + 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Word frequency to dataframe and sort descending"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Two-column table (Word, Frequency) ordered from most to least frequent.\n",
"df = (\n",
"    pd.DataFrame(list(frequency.items()), columns=['Word', 'Frequency'])\n",
"    .sort_values(by=\"Frequency\", ascending=False)\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Print dataframe"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Word</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>like</td>\n",
" <td>2200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>people</td>\n",
" <td>1504</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>know</td>\n",
" <td>1428</td>\n",
" </tr>\n",
" <tr>\n",
" <th>172</th>\n",
" <td>really</td>\n",
" <td>1356</td>\n",
" </tr>\n",
" <tr>\n",
" <th>259</th>\n",
" <td>get</td>\n",
" <td>1182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>112</th>\n",
" <td>right</td>\n",
" <td>1144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>one</td>\n",
" <td>1052</td>\n",
" </tr>\n",
" <tr>\n",
" <th>843</th>\n",
" <td>want</td>\n",
" <td>1016</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>gon</td>\n",
" <td>918</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>things</td>\n",
" <td>804</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>google</td>\n",
" <td>776</td>\n",
" </tr>\n",
" <tr>\n",
" <th>168</th>\n",
" <td>content</td>\n",
" <td>730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>think</td>\n",
" <td>716</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>actually</td>\n",
" <td>704</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>see</td>\n",
" <td>698</td>\n",
" </tr>\n",
" <tr>\n",
" <th>349</th>\n",
" <td>going</td>\n",
" <td>598</td>\n",
" </tr>\n",
" <tr>\n",
" <th>165</th>\n",
" <td>time</td>\n",
" <td>592</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>lot</td>\n",
" <td>588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>369</th>\n",
" <td>something</td>\n",
" <td>570</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>well</td>\n",
" <td>566</td>\n",
" </tr>\n",
" <tr>\n",
" <th>611</th>\n",
" <td>make</td>\n",
" <td>532</td>\n",
" </tr>\n",
" <tr>\n",
" <th>646</th>\n",
" <td>use</td>\n",
" <td>528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>281</th>\n",
" <td>kind</td>\n",
" <td>506</td>\n",
" </tr>\n",
" <tr>\n",
" <th>158</th>\n",
" <td>site</td>\n",
" <td>500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>388</th>\n",
" <td>say</td>\n",
" <td>498</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77</th>\n",
" <td>look</td>\n",
" <td>496</td>\n",
" </tr>\n",
" <tr>\n",
" <th>397</th>\n",
" <td>way</td>\n",
" <td>486</td>\n",
" </tr>\n",
" <tr>\n",
" <th>534</th>\n",
" <td>need</td>\n",
" <td>480</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>got</td>\n",
" <td>472</td>\n",
" </tr>\n",
" <tr>\n",
" <th>231</th>\n",
" <td>good</td>\n",
" <td>452</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2331</th>\n",
" <td>colored</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5039</th>\n",
" <td>wrapped</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2328</th>\n",
" <td>captured</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5041</th>\n",
" <td>duplication</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5042</th>\n",
" <td>exceptions</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5019</th>\n",
" <td>detract</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5018</th>\n",
" <td>amole</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5017</th>\n",
" <td>archived</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4996</th>\n",
" <td>alternatively</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2404</th>\n",
" <td>contextualize</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2403</th>\n",
" <td>lifecycle</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4986</th>\n",
" <td>amongst</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2396</th>\n",
" <td>demos</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4989</th>\n",
" <td>surefire</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4991</th>\n",
" <td>stein</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2394</th>\n",
" <td>entertain</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2391</th>\n",
" <td>reconfirms</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2390</th>\n",
" <td>shiny</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4997</th>\n",
" <td>outweigh</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2351</th>\n",
" <td>giller</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4999</th>\n",
" <td>variant</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2380</th>\n",
" <td>enlarge</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2370</th>\n",
" <td>invent</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5008</th>\n",
" <td>scrutiny</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5009</th>\n",
" <td>pvc</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2357</th>\n",
" <td>intermix</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5012</th>\n",
" <td>tabbed</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2355</th>\n",
" <td>mail</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2352</th>\n",
" <td>quacks</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7221</th>\n",
" <td>triskin</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>7222 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" Word Frequency\n",
"11 like 2200\n",
"55 people 1504\n",
"49 know 1428\n",
"172 really 1356\n",
"259 get 1182\n",
"112 right 1144\n",
"28 one 1052\n",
"843 want 1016\n",
"22 gon 918\n",
"43 things 804\n",
"86 google 776\n",
"168 content 730\n",
"32 think 716\n",
"33 actually 704\n",
"59 see 698\n",
"349 going 598\n",
"165 time 592\n",
"38 lot 588\n",
"369 something 570\n",
"145 well 566\n",
"611 make 532\n",
"646 use 528\n",
"281 kind 506\n",
"158 site 500\n",
"388 say 498\n",
"77 look 496\n",
"397 way 486\n",
"534 need 480\n",
"7 got 472\n",
"231 good 452\n",
"... ... ...\n",
"2331 colored 2\n",
"5039 wrapped 2\n",
"2328 captured 2\n",
"5041 duplication 2\n",
"5042 exceptions 2\n",
"5019 detract 2\n",
"5018 amole 2\n",
"5017 archived 2\n",
"4996 alternatively 2\n",
"2404 contextualize 2\n",
"2403 lifecycle 2\n",
"4986 amongst 2\n",
"2396 demos 2\n",
"4989 surefire 2\n",
"4991 stein 2\n",
"2394 entertain 2\n",
"2391 reconfirms 2\n",
"2390 shiny 2\n",
"4997 outweigh 2\n",
"2351 giller 2\n",
"4999 variant 2\n",
"2380 enlarge 2\n",
"2370 invent 2\n",
"5008 scrutiny 2\n",
"5009 pvc 2\n",
"2357 intermix 2\n",
"5012 tabbed 2\n",
"2355 mail 2\n",
"2352 quacks 2\n",
"7221 triskin 2\n",
"\n",
"[7222 rows x 2 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment