Skip to content

Instantly share code, notes, and snippets.

@seungwonpark
Last active November 29, 2021 15:33
Show Gist options
  • Save seungwonpark/78dd69730ecee631e16018228c83af89 to your computer and use it in GitHub Desktop.
Save seungwonpark/78dd69730ecee631e16018228c83af89 to your computer and use it in GitHub Desktop.
Split CSD (Children's Song Dataset)
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "683d01fc",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import glob\n",
"import tqdm\n",
"import random\n",
"import hashlib\n",
"import soundfile as sf\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9cb040c8",
"metadata": {},
"outputs": [],
"source": [
"with open('english/csd_english_meta_rm_abc.txt', 'r', encoding='utf-8') as f:\n",
"    lines = [line.strip().split('|') for line in f.readlines()]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0aae2596",
"metadata": {},
"outputs": [],
"source": [
"lengths = list()\n",
"\n",
"for line in lines:\n",
"    path, text, _ = line\n",
"    wavpath = os.path.join('english', path)\n",
"    wav, sr = sf.read(wavpath)\n",
"    lengths.append(len(wav) / sr)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "deabeda6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x7f07696da550>]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.title('csd_english_meta_rm_abc.txt')\n",
"plt.xlabel('Accumulated number of data')\n",
"plt.ylabel('Length (s)')\n",
"plt.plot(sorted(lengths))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3ec9fb91",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1684/1684 [00:00<00:00, 2689.83it/s]\n"
]
}
],
"source": [
"train_list = list()\n",
"val_list = list()\n",
"test_list = list()\n",
"\n",
"for line in tqdm.tqdm(lines):\n",
"    path, text, _ = line\n",
"    path = path.replace('.wav', '-22k.wav')\n",
"    wavpath = os.path.join('english', path)\n",
"    wav, sr = sf.read(wavpath)\n",
"    assert sr == 22050\n",
"    length = len(wav) / sr\n",
"    \n",
"    entry = '%s|%s|%s' % (os.path.join('CSD', 'english', path), text, \"CSD\")\n",
"    h = hashlib.md5(entry.encode()).hexdigest()\n",
"    h = int(h, 16)\n",
"    \n",
"    if length > 10.0:\n",
"        if h % 2 == 0:\n",
"            val_list.append(entry)\n",
"        else:\n",
"            test_list.append(entry)\n",
"    else:\n",
"        if h % 16 == 0:\n",
"            val_list.append(entry)\n",
"        elif h % 16 == 1:\n",
"            test_list.append(entry)\n",
"        else:\n",
"            train_list.append(entry)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1dd56461",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1435 131 118\n"
]
}
],
"source": [
"print(len(train_list), len(val_list), len(test_list))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c41e8b9a",
"metadata": {},
"outputs": [],
"source": [
"with open(os.path.join('english', 'csd_en_train_22k.txt'), 'w', encoding='utf-8') as f:\n",
"    for line in train_list:\n",
"        f.write('%s\\n' % line)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "24bce6fe",
"metadata": {},
"outputs": [],
"source": [
"with open(os.path.join('english', 'csd_en_val_22k.txt'), 'w', encoding='utf-8') as f:\n",
"    for line in val_list:\n",
"        f.write('%s\\n' % line)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "9994423e",
"metadata": {},
"outputs": [],
"source": [
"with open(os.path.join('english', 'csd_en_test_22k.txt'), 'w', encoding='utf-8') as f:\n",
"    for line in test_list:\n",
"        f.write('%s\\n' % line)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d34493c9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment