Skip to content

Instantly share code, notes, and snippets.

@jamescalam
Created October 14, 2022 07:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jamescalam/17df40133d11c3c25aa9f4045c9d1145 to your computer and use it in GitHub Desktop.
Save jamescalam/17df40133d11c3c25aa9f4045c9d1145 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"id": "05f2876d-9e4a-4271-aeb7-4ac8afad7c7e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 9072/9072 [00:07<00:00, 1156.11it/s]\n"
]
}
],
"source": [
"from tqdm.auto import tqdm\n",
"\n",
"new_data = []\n",
"\n",
"window = 6  # number of sentences to combine\n",
"stride = 3  # number of sentences to 'stride' over, used to create overlap\n",
"\n",
"# Merge consecutive rows of `data` into overlapping chunks: each chunk joins\n",
"# up to `window` rows and the start position advances by `stride` rows.\n",
"# NOTE(review): `data` is defined outside this cell; `data[i:i_end]['text']`\n",
"# assumes that slicing returns a column-oriented mapping - confirm.\n",
"for i in tqdm(range(0, len(data), stride)):\n",
"    # exclusive end of the window, clamped to len(data) so the final rows are\n",
"    # included; range() guarantees i < len(data), so the window is never empty\n",
"    i_end = min(len(data), i + window)\n",
"    # fetch the boundary rows once instead of re-indexing `data` per field\n",
"    first, last = data[i], data[i_end - 1]\n",
"    if first['title'] != last['title']:\n",
"        # in this case we skip this entry as we have start/end of two videos\n",
"        continue\n",
"    text = ' '.join(data[i:i_end]['text'])\n",
"    new_data.append({\n",
"        'start': first['start'],\n",
"        # 'end' comes from the last row actually included in `text`\n",
"        'end': last['end'],\n",
"        'title': first['title'],\n",
"        'text': text,\n",
"        'id': first['id'],\n",
"        'url': first['url'],\n",
"        'published': first['published']\n",
"    })"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "88c798ca-6aef-41a2-9979-056561ca693e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'start': 0.0,\n",
" 'end': 25.76,\n",
" 'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',\n",
" 'text': \"Hi, welcome to the video. So this is the fourth video in a Transformers from Scratch mini series. So if you haven't been following along, we've essentially covered what you can see on the screen. So we got some data.\",\n",
" 'id': '35Pdoyi6ZoQ-t0.0',\n",
" 'url': 'https://youtu.be/35Pdoyi6ZoQ',\n",
" 'published': '2021-07-06 13:00:03 UTC'}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_data[0]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b65d16ed-a84e-43a6-9798-9544248b6732",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'start': 981.4,\n",
" 'end': 1009.52,\n",
" 'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',\n",
" 'text': \"Now, it has taken a long time. It's a few days later. And I made a few changes during training as well. So this definitely wasn't the cleanest training process, because I was kind of updating parameters as it was going along. So initially, well, first, we've trained\",\n",
" 'id': '35Pdoyi6ZoQ-t981.4',\n",
" 'url': 'https://youtu.be/35Pdoyi6ZoQ',\n",
" 'published': '2021-07-06 13:00:03 UTC'}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_data[100]"
]
}
],
"metadata": {
"environment": {
"kernel": "python3",
"name": "common-cu110.m95",
"type": "gcloud",
"uri": "gcr.io/deeplearning-platform-release/base-cu110:m95"
},
"kernelspec": {
"display_name": "Python 3.9.12 ('ml')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"vscode": {
"interpreter": {
"hash": "b8e7999f96e1b425e2d542f21b571f5a4be3e97158b0b46ea1b2500df63956ce"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment