Skip to content

Instantly share code, notes, and snippets.

@erogol
Last active February 5, 2019 14:35
Show Gist options
  • Save erogol/b06864d0612e00eb6c93badb195973d2 to your computer and use it in GitHub Desktop.
Save erogol/b06864d0612e00eb6c93badb195973d2 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook picks sentences from a given corpus so that the selected subset has an optimized phoneme coverage."
]
},
{
"cell_type": "code",
"execution_count": 228,
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import phonemizer\n",
"import re\n",
"import numpy as np\n",
"from tqdm import tqdm_notebook as tqdm\n",
"from phonemizer.phonemize import phonemize\n",
"from collections import Counter\n",
"\n",
"# Regular expression matching runs of punctuation. The trailing space of\n",
"# `_punctuations` is sliced off so that whitespace is never matched.\n",
"_punctuations = '!\\'(),-.:;? '\n",
"# re.escape keeps '-' a literal instead of relying on the accidental ',-.' range.\n",
"pat = r'[' + re.escape(_punctuations[:-1]) + r']+'\n",
"\n",
"def text2phoneme(text, language):\n",
"    '''\n",
"    Convert graphemes to phonemes with phonemizer's espeak backend.\n",
"\n",
"    Args:\n",
"        text (str): text to phonemize.\n",
"        language (str): language code handed to the backend, e.g. 'en-us'.\n",
"\n",
"    Returns:\n",
"        str: the phonemizer output built with the ' |' and '|' separators\n",
"            (split as words and phonemes by `ngrams`), with all line\n",
"            breaks removed.\n",
"    '''\n",
"    separator = phonemizer.separator.Separator(' |', '', '|')\n",
"    ph = phonemize(text, separator=separator, strip=False, njobs=1,\n",
"                   backend='espeak', language=language)\n",
"    # Drop line breaks unconditionally. They used to be removed only when the\n",
"    # text contained punctuation, so a punctuation-free sentence could keep a\n",
"    # '\\n' in its output and have it counted as a phoneme downstream.\n",
"    return ph.replace('\\n', '')\n",
"\n",
"\n",
"def load_csv(file_path):\n",
"    \"\"\"Load the text column of a '|'-separated metadata file as in LJSpeech.\n",
"\n",
"    Args:\n",
"        file_path (str): path to the metadata file. Every non-blank line\n",
"            must hold at least two '|'-separated columns.\n",
"\n",
"    Returns:\n",
"        list of str: second column of every non-blank line, in file order.\n",
"\n",
"    Raises:\n",
"        IndexError: if a non-blank line has fewer than two columns.\n",
"    \"\"\"\n",
"    items = []\n",
"    # Decode explicitly as UTF-8 (assumed encoding of the metadata file)\n",
"    # instead of depending on the platform default.\n",
"    with open(file_path, 'r', encoding='utf-8') as ttf:\n",
"        for line in ttf:\n",
"            # Remove the line break so it cannot leak into the last column.\n",
"            line = line.rstrip('\\n')\n",
"            if not line.strip():\n",
"                continue  # tolerate blank lines, e.g. at the end of the file\n",
"            items.append(line.split('|')[1])\n",
"    return items\n",
"\n",
"\n",
"def ngrams(texts, n, language='en-us'):\n",
"    \"\"\"\n",
"    Compute phoneme n-gram frequencies and an n-gram to text index.\n",
"\n",
"    N-grams are built inside each phonemized word; they never span two words.\n",
"\n",
"    Args:\n",
"        texts (list of str): sentences to analyse.\n",
"        n (int): number of consecutive phonemes per n-gram.\n",
"        language (str): language code forwarded to `text2phoneme`.\n",
"\n",
"    Returns:\n",
"        tuple: `(freqs, inverted_index)`. `freqs` is a Counter mapping each\n",
"            n-gram (its phonemes joined by one space) to its number of\n",
"            occurrences. `inverted_index` maps each n-gram to the sorted\n",
"            list of indices of the texts containing it.\n",
"    \"\"\"\n",
"    freqs = Counter()\n",
"    text_ids = {}\n",
"    for idx, text in enumerate(tqdm(texts)):\n",
"        for ph_word in text2phoneme(text, language).split(' |'):\n",
"            # Splitting leaves empty strings around the separators; drop them.\n",
"            phonemes = [p for p in ph_word.split('|') if p]\n",
"            for i in range(len(phonemes) - n + 1):\n",
"                g = ' '.join(phonemes[i:i + n])\n",
"                freqs[g] += 1\n",
"                # A set de-duplicates text ids without rebuilding a list on\n",
"                # every single occurrence.\n",
"                text_ids.setdefault(g, set()).add(idx)\n",
"    inverted_index = {g: sorted(ids) for g, ids in text_ids.items()}\n",
"    return freqs, inverted_index\n",
"\n",
"\n",
"def create_index(inverted_index):\n",
"    \"\"\"\n",
"    Turn the n-gram -> text ids mapping into a text id -> n-grams mapping.\n",
"    \"\"\"\n",
"    text_to_keys = {}\n",
"    for key, text_ids in inverted_index.items():\n",
"        for text_idx in text_ids:\n",
"            text_to_keys.setdefault(text_idx, []).append(key)\n",
"    return text_to_keys\n",
"\n",
"\n",
"def score_texts(texts, index, inverted_index, freqs):\n",
"    \"\"\"\n",
"    Score each text with the sum of member phoneme scores normed by the text length.\n",
"    Each n-gram contributes 1 / frequency, so rare n-grams weigh more, and the\n",
"    division by the character length favors shorter texts.\n",
"\n",
"    Args:\n",
"        texts (list of str): the corpus; one score is produced per entry.\n",
"        index (dict): text id -> list of n-grams found in that text.\n",
"        inverted_index (dict): unused, kept so existing calls keep working.\n",
"        freqs (dict): n-gram -> number of occurrences in the corpus.\n",
"\n",
"    Returns:\n",
"        list of float: scores aligned with `texts`. A text that is empty or\n",
"            has no entry in `index` scores 0.0.\n",
"    \"\"\"\n",
"    scores = []\n",
"    # Walk over `texts`, not `index`: a text without any n-gram has no entry in\n",
"    # `index`, which would otherwise shift or truncate the result.\n",
"    for idx, text in enumerate(texts):\n",
"        if not text:\n",
"            scores.append(0.0)  # avoid dividing by a zero length\n",
"            continue\n",
"        rarity = sum(1 / freqs[phoneme] for phoneme in index.get(idx, ()))\n",
"        scores.append(rarity / len(text))\n",
"    return scores\n",
"\n",
"\n",
"def compute_coverage(subset_idxs, index, freqs):\n",
"    \"\"\"\n",
"    Compute the phoneme coverage ratio of the given samples in the corpus.\n",
"\n",
"    Args:\n",
"        subset_idxs (iterable of int): ids of the selected texts.\n",
"        index (dict): text id -> list of n-grams found in that text.\n",
"        freqs (dict): n-gram -> frequency; its keys are the full n-gram set.\n",
"\n",
"    Returns:\n",
"        float: number of distinct n-grams in the selected texts divided by\n",
"            the number of n-grams in `freqs`; 0.0 when `freqs` is empty.\n",
"    \"\"\"\n",
"    if not freqs:\n",
"        return 0.0  # nothing to cover; avoids a division by zero\n",
"    covered = set()\n",
"    for idx in subset_idxs:\n",
"        # Texts without any n-gram have no entry in `index`.\n",
"        covered.update(index.get(idx, ()))\n",
"    return len(covered) / len(freqs)"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6dd9bed1d58c4282b231027f94cb1eda",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "99a3789e78764d0b9ad7208564c07e46",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e87184ad4afd47ffb74327bb302042ba",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Corpus location and the largest n-gram order to collect.\n",
"text_file = \"../Data/LJSpeech-1.1/metadata.csv\"\n",
"N = 3\n",
"\n",
"texts = load_csv(text_file)\n",
"\n",
"# Merge the tables of every n-gram order from 1 to N into shared dicts.\n",
"freqs, inverted_index = {}, {}\n",
"for n in range(1, N + 1):\n",
"    f, ii = ngrams(texts, n)\n",
"    freqs.update(f)\n",
"    inverted_index.update(ii)\n"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"# Build the text id -> n-grams lookup from the n-gram -> text ids mapping.\n",
"index = create_index(inverted_index)"
]
},
{
"cell_type": "code",
"execution_count": 219,
"metadata": {},
"outputs": [],
"source": [
"# Score every text, then check that exactly one score per text was produced.\n",
"scores = score_texts(texts, index, inverted_index, freqs)\n",
"assert len(scores) == len(texts)"
]
},
{
"cell_type": "code",
"execution_count": 220,
"metadata": {},
"outputs": [],
"source": [
"# Rank the sentence ids from the highest to the lowest score.\n",
"order_idxs = list(np.argsort(scores)[::-1])"
]
},
{
"cell_type": "code",
"execution_count": 245,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9323151683443249\n",
"0.6404026379729261\n"
]
}
],
"source": [
"K = 2000\n",
"# coverage with K selected sentences\n",
"print(compute_coverage(order_idxs[:K], index, freqs))\n",
"# coverage with K randomly selected sentences. np.random.choice draws with\n",
"# replacement by default, which would leave the baseline with fewer than K\n",
"# distinct sentences, so sample without replacement. The fixed seed makes the\n",
"# printed baseline reproducible.\n",
"rng = np.random.RandomState(0)\n",
"random_idxs = rng.choice(len(texts), size=min(K, len(texts)), replace=False)\n",
"print(compute_coverage(random_idxs, index, freqs))"
]
},
{
"cell_type": "code",
"execution_count": 267,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"95.6055\n",
"7\n",
"170\n"
]
}
],
"source": [
"# Character-length statistics of the K best ranked sentences.\n",
"selected_text = [texts[idx] for idx in order_idxs[:K]]\n",
"text_lenghts = list(map(len, selected_text))\n",
"\n",
"avg_text_len = np.mean(text_lenghts)\n",
"min_text_len = np.min(text_lenghts)\n",
"max_text_len = np.max(text_lenghts)\n",
"\n",
"# One value per line: mean, minimum, maximum.\n",
"print(avg_text_len, min_text_len, max_text_len, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 271,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x7f8bbe5f5080>]"
]
},
"execution_count": 271,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f8bbe894898>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# pyplot replaces the discouraged pylab interface; the alias `plt` is\n",
"# unchanged, so later cells that call `plt` keep working.\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"plt.title(\"Sentence Scores\")\n",
"plt.xlabel(\"Sentence rank\")\n",
"plt.ylabel(\"Score\")\n",
"# Scores in descending order; the trailing ';' hides the Line2D repr.\n",
"plt.plot(sorted(scores, reverse=True));"
]
},
{
"cell_type": "code",
"execution_count": 272,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x7f8bbe58a390>]"
]
},
"execution_count": 272,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f8bbe603ba8>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Lengths of the selected sentences, longest first.\n",
"plt.title(\"Sentence Lengths\")\n",
"plt.plot(sorted(text_lenghts, reverse=True))"
]
},
{
"cell_type": "code",
"execution_count": 262,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Its impression shows the face of a beardless young man, intelligent and refined.\n",
"He went out next morning with the booty when the doors were re-opened, and attracted no attention.\n",
"There is some indication that he may relate to men more easily than to women in view of the more mature conceptualisation.\n",
"laughing and uproarious, utterly unmindful of the companionship of men upon whom lay the shadow of an impending shameful death.\n",
"or for detention pending court appearance or commitment to a child-caring or custodial institution such as a training school.\n",
"rudiments of eyes in cave fishes, hind limbs beneath the skin of whales, the vermiform appendix in man,\n",
"it is not a real gain, for the modern printer throws the gain away by putting inordinately wide spaces between his lines, which, probably,\n",
"Two days later, a local Republican leader called for a \"civilized nonpartisan\" welcome\n",
"and denied admission to the \"charity wards,\" which partook of all the benefits of bequests and donations to poor debtors.\n",
"He met his death with unshaken firmness, only entreating that a certain blue handkerchief,\n",
"In 1812, James Claudius Rich, the British Resident at Baghdad, made the first complete examination of the ruins.\n",
"and to pay overseers or instructors out of the county rates.\n",
"No judge ever condemned a man to be half-perished with cold by day, or half-suffocated with heat by night.\n",
"This punctuate missile wound, about two-fifths inch in diameter (1 centimeter) and located approximately 5 inches above the left knee,\n",
"leaving between them room for a four-horse chariot to turn.\n",
"Prison chaplains of experience and high repute, such as Messrs. Field, Clay, Kingsmill, Burt, and Osborne, also advocated it.\n",
"but I, the devout petitioner, the worshipper of the gods, built the moat, and made its wall of burned brick and bitumen mountain high.\n",
"Calcraft's salary was more than the proverbial \"thirteenpence halfpenny -- hangman's wages.\"\n",
"when a petition was presented against the return of Messrs. Adam Dundas and Fitzroy Kelly. Various witnesses, including Messrs. J. B. Dasent,\n",
"that of a prize-fighter named Donovan, tried the same day, and convicted of manslaughter.\n",
"Two cart-loads of faggots were piled about her, and after she had hung for half-an-hour the fire was kindled.\n",
"One of his sumpter-mules gave birth to a foal.\n",
"Then the doorkeepers, when they heard this, carried him at once before the Magistrates.\n",
"Under \"Pathological Diagnosis\" the cause of death was set forth as \"Gunshot wound, head.\"\n",
"Brown took several sheets, and then was detected by Brewer, a fellow-workman of superior grade,\n",
"McWatters was sure that he left the checkpoint on time\n",
"followed him over to Sweden, and arrested him at Helsingfors.\n",
"Noble exploits in Persia are ever highly honored and bring their authors to greatness.\n",
"and the beginner in biological study is surprised to find them described in textbooks of both botany and zoology.\n",
"I was startled by the sharp report or explosion,\n",
"They were made to wash and swab the ward, or they were shut out from the ward fireplace, and forbidden to pass a chalked line drawn on the floor,\n",
"The Fleet, which stood in Farringdon Street,\n",
"and later experience has fully proved the advantage of a judicious system of gratuities for labor;\n",
"But the weights of the carbon, hydrogen, oxygen, nitrogen, phosphorus, sulfur and other elementary bodies contained in the bean-plant\n",
"Weedon and Lecasser to twelve and six months respectively in Coldbath Fields.\n",
"He said that his Marine service in Okinawa and elsewhere had given him, quote, a chance to observe American imperialism, end quote.\n",
"In Dallas the rain had stopped, and by midmorning a gloomy overcast sky had given way to the bright sunshine that greeted the Presidential party\n",
"It was the intersection near Lamar Street, it was near Poydras and Lamar Street.\n",
"Mrs. Tarpey was almost immediately captured and put on her trial, but she was acquitted on the plea that she had acted under the coercion of her husband.\n",
"running at a dog-trot into London, and others swore that they plainly recognized him as the man seen soon afterwards in the lane.\n",
"This assignment was given to Agent James P. Hosty, Jr. of the Dallas office upon Fain's retirement.\n",
"After this second exploit, his praise was in all mouths.\n",
"The last execution at which he acted was that of Godwin, on the 25th May, 1874.\n",
"that of Phoebe Harris, who in 1788 was \"barbariously\" executed and burnt before Newgate for coining.\n",
"He played high, and spent his nights at the club, or in joyous and dissolute company.\n",
"He was so much in favor of short drops that his immediate successor, Marwood, stigmatized him as \"short-drop\" man.\n",
"or like Cannon the chimney-sweeper, who savagely killed the policeman.\n",
"but he came as a lad to London, and took service as a pot-boy to a publican.\n",
"He saw Mr. Briggs' watch-chain, and followed him instantly into the carriage, determined to have it at all costs.\n",
"Romanes's \"Darwin and After Darwin\", and Le Conte's \"Evolution.\"\n"
]
}
],
"source": [
"# Show the sentences ranked 500 to 549 by score.\n",
"for rank in range(500, 550):\n",
"    sentence_id = order_idxs[rank]\n",
"    print(texts[sentence_id])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment