Created
October 14, 2022 07:02
-
-
Save jamescalam/17df40133d11c3c25aa9f4045c9d1145 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "05f2876d-9e4a-4271-aeb7-4ac8afad7c7e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 9072/9072 [00:07<00:00, 1156.11it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"from tqdm.auto import tqdm\n", | |
"\n", | |
"new_data = []\n", | |
"\n", | |
"window = 6 # number of sentences to combine\n", | |
"stride = 3 # number of sentences to 'stride' over, used to create overlap\n", | |
"\n", | |
"for i in tqdm(range(0, len(data), stride)):\n", | |
" i_end = min(len(data)-1, i+window)\n", | |
" if data[i]['title'] != data[i_end]['title']:\n", | |
" # in this case we skip this entry as we have start/end of two videos\n", | |
" continue\n", | |
" text = ' '.join(data[i:i_end]['text'])\n", | |
" new_data.append({\n", | |
" 'start': data[i]['start'],\n", | |
" 'end': data[i_end]['end'],\n", | |
" 'title': data[i]['title'],\n", | |
" 'text': text,\n", | |
" 'id': data[i]['id'],\n", | |
" 'url': data[i]['url'],\n", | |
" 'published': data[i]['published']\n", | |
" })" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "88c798ca-6aef-41a2-9979-056561ca693e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'start': 0.0,\n", | |
" 'end': 25.76,\n", | |
" 'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',\n", | |
" 'text': \"Hi, welcome to the video. So this is the fourth video in a Transformers from Scratch mini series. So if you haven't been following along, we've essentially covered what you can see on the screen. So we got some data.\",\n", | |
" 'id': '35Pdoyi6ZoQ-t0.0',\n", | |
" 'url': 'https://youtu.be/35Pdoyi6ZoQ',\n", | |
" 'published': '2021-07-06 13:00:03 UTC'}" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"new_data[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "b65d16ed-a84e-43a6-9798-9544248b6732", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'start': 981.4,\n", | |
" 'end': 1009.52,\n", | |
" 'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',\n", | |
" 'text': \"Now, it has taken a long time. It's a few days later. And I made a few changes during training as well. So this definitely wasn't the cleanest training process, because I was kind of updating parameters as it was going along. So initially, well, first, we've trained\",\n", | |
" 'id': '35Pdoyi6ZoQ-t981.4',\n", | |
" 'url': 'https://youtu.be/35Pdoyi6ZoQ',\n", | |
" 'published': '2021-07-06 13:00:03 UTC'}" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"new_data[100]" | |
] | |
} | |
], | |
"metadata": { | |
"environment": { | |
"kernel": "python3", | |
"name": "common-cu110.m95", | |
"type": "gcloud", | |
"uri": "gcr.io/deeplearning-platform-release/base-cu110:m95" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3.9.12 ('ml')", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.12" | |
}, | |
"vscode": { | |
"interpreter": { | |
"hash": "b8e7999f96e1b425e2d542f21b571f5a4be3e97158b0b46ea1b2500df63956ce" | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment