Skip to content

Instantly share code, notes, and snippets.

@CorentinJ
Last active March 10, 2024 11:20
Show Gist options
  • Save CorentinJ/0bc27814d93510ae8b6fe4516dc6981d to your computer and use it in GitHub Desktop.
Save CorentinJ/0bc27814d93510ae8b6fe4516dc6981d to your computer and use it in GitHub Desktop.
Word-level mappings for phonemizer
## Get the large corpus here if desired: https://puu.sh/InEMm.txt
## Save as "large_corpus.txt"
import sys
from functools import lru_cache
from itertools import groupby
from pathlib import Path
from typing import List
from phonemizer.backend import EspeakBackend
from tqdm import tqdm
from w2w_mapper import convert_with_word_level
# IPA stress and length marks. demo_word_mappings() strips these characters before comparing phonemized words.
_suprasegmentals = 'ˈˌːˑ'
# Punctuation marks. _text_to_phonemes() removes a space that directly precedes any of these characters.
_punctuation = '.!;:,?'
@lru_cache()
def _get_backend(language: str):
    """
    Builds the espeak phonemizer backend for a language. The result is memoized, so each language gets a single
    backend instance that is reused by every later call.

    Other backend and parameter combinations have not been tested, but I assume most would work.

    :param language: the language code handed to the backend (e.g. "en-us")
    :return: the EspeakBackend instance configured for that language
    """
    # Options shared by all languages
    backend_options = {
        "preserve_punctuation": True,
        "with_stress": True,
        "language_switch": "remove-flags",
        "words_mismatch": "ignore",
    }
    return EspeakBackend(language=language, **backend_options)
@lru_cache(maxsize=1000)
def _text_to_phonemes(text: str, language="en-us"):
    """
    Phonemizes a text with the cached backend and normalizes the result so that its spacing follows the input as
    closely as possible: the leading and trailing space of the output mirror those of the text, and a space that
    directly precedes a punctuation mark is dropped.

    :param text: the text to phonemize
    :param language: the language code used to select the backend
    :return: the phonemized text
    """
    # Phonemize; an empty result list is treated as an empty string
    results = _get_backend(language).phonemize([text])
    phonemes = results[0] if len(results) > 0 else ""

    # Mirror the presence or absence of a leading space
    if text.startswith(" "):
        if not phonemes.startswith(" "):
            phonemes = " " + phonemes
    elif phonemes.startswith(" "):
        phonemes = phonemes[1:]

    # Mirror the presence or absence of a trailing space
    if text.endswith(" "):
        if not phonemes.endswith(" "):
            phonemes = phonemes + " "
    elif phonemes.endswith(" "):
        phonemes = phonemes[:-1]

    # Phonemizer may introduce spaces before punctuation: drop every space immediately followed by a punctuation
    # mark. Only the space adjacent to the mark is removed, in a single left-to-right pass.
    last_index = len(phonemes) - 1
    kept_chars = []
    for index, char in enumerate(phonemes):
        if char == " " and index < last_index and phonemes[index + 1] in _punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)
def demo_word_mappings(texts: List[str], language: str, show_ouput=True):
    """
    Computes the word-level text-to-phoneme mapping of each text and optionally prints it as two aligned rows
    (text groups above, phoneme groups below).

    :param texts: the sentences to process
    :param language: the language code used for phonemization
    :param show_ouput: when True, the mappings are printed. When False, nothing is printed and a progress bar is
    shown on stdout instead.
    """
    def strip_suprasegmentals(phonemes):
        return "".join(char for char in phonemes if char not in _suprasegmentals)

    # Equality function for the word-level mapper: stress and length marks are ignored
    def phonemes_match(x, y):
        return strip_suprasegmentals(x) == strip_suprasegmentals(y)

    # Conversion function for the word-level mapper
    def to_phonemes(t):
        return _text_to_phonemes(t, language)

    if not show_ouput:
        texts = tqdm(texts, "Computing mappings...", len(texts), file=sys.stdout)

    for text in texts:
        # Get the word-level mapping
        text_groups, _, phon_groups = convert_with_word_level(text, to_phonemes, phonemes_match)
        if not show_ouput:
            continue

        # Display both rows, each column padded to the width of its longest cell
        phon_groups = [group if group else "" for group in phon_groups]  # None entries become empty strings
        widths = [max(len(t_group), len(p_group)) for t_group, p_group in zip(text_groups, phon_groups)]
        for row in (text_groups, phon_groups):
            cells = [cell.ljust(width) for cell, width in zip(row, widths)]
            print("| " + " | ".join(cells) + " |")
        print()
# English edge cases encountered while developing the word-level mapper. They are run through
# demo_word_mappings() when this file is executed as a script.
edge_cases = [
    # "Youtubers" gets split in two words, "on the" gets fused into one and "internet" is influenced by "the"
    "YouTubers no longer belong on the internet.",
    # "content" is either treated as the noun or as the adjective depending on the previous context
    "He is quite content to die;",
    # "I am going" gets phonemized differently depending on context, plus the sentence is long.
    "However if I am going to run it over all that code, which is going to be computationally expensive I may as well "
    "try to get some use out of it.",
    "this is a to be test",
    "wear and tear",
    "it on the",
    # Gets fused into a single word
    "wouldn't have to",
    # Large number -> few characters in raw text that become many in symbols
    # N.B.: with limited values for the word mapper search parameters, numbers large enough may fail the function.
    # Proper text cleaning operations can alleviate this issue.
    "He ate 14560 million cats",
    "Almost 1,100 miles in one day.",
    "lunchroom",
    "for a while",
    "too many",
    "too much",
    "each of",
    "far more",
    "few more",
    "here and there",
    "most of",
    "such as",
    "too few",
    "something something for a while something something",
    "something something for a while",
    "for a while, something something",
    # Some unicode chars
    "✔",
    " 🏠",
    "😰 🤗 😁😁 😘 ",
    # Sentences with words that will get entirely removed by phonemizer
    # N.B.: although the algorithm supports these, such edge cases can be total headaches. A simple way of ensuring
    # you don't have to deal with them is to clean the sentence of any word that contains no alphabetic characters
    # (str.isalpha() includes alphabets from all languages).
    "Alright ) ",
    " ( Alright",
    "\" Alright! \"",
    "-- right",
    "right --",
    "that's -- right",
    "that's -- -- right",
    "that's . • _. •. . 4 right.",
    "Hello ( world",
    # Leading and trailing spaces
    " Sure",
    "Sure ",
]
# 190 normal English sentences, one per line. The encoding is given explicitly so that loading the corpus does not
# depend on the platform's default locale encoding.
small_corpus = Path("small_corpus.txt").read_text(encoding="utf-8").splitlines()
## Large multilingual corpus of 33,500 sentences (English, Dutch, French, German)
# large_corpus = [l.split("~") for l in Path("large_corpus.txt").read_text("utf-8").splitlines()]
# large_corpus = sorted(large_corpus, key=lambda t: t[0])
if __name__ == "__main__":
    # Print the word-level mappings of the edge cases first, then of the small corpus
    for corpus in (edge_cases, small_corpus):
        demo_word_mappings(corpus, "en-us")

    # Optional run over the large corpus, grouped by language (requires large_corpus to be loaded above):
    # for language, pairs in groupby(large_corpus, key=lambda t: t[0]):
    #     texts = list(zip(*pairs))[1]
    #     print(f"\nProcessing {len(texts)} sentences with language \"{language}\"")
    #     demo_word_mappings(texts, language, show_ouput=False)
    #
It amused him to think that they were probably talking about him at that very moment.
I asked the woman behind the counter if they had any postcards.
Everything was covered with a thick layer of dust.
"My boi" is a great friend, but you want to sound cool or you don't want to say his name.
So you just yell that my boi, it also has become popular and many YouTubers have shirts that have that's my boi printed on it.
Catstep is a recent sub-genre branch off of filthstep, with a more aggressive style and typically softer notes.
People group some Monstercat songs into this genre, as there is "cat" in their name.
Corporations are shelling out huge bribes to politicians in the form of "campaign financing" however, calling a bribe by any other name is still a bribe.
On the funeral of the dead Soldier, his CO stood before the coffin and said: "Rest Easy, soldier" as a last salute.
Ah, jeez, 5 AM already? I'm outta here.
To make the challenge even more difficult, the writer could try to use the words in the exact order they were generated.
On October 19 my life changed forever... Nicole really did have a crush on me.
Why are people not more concerned about this in the west? It's becoming increasingly clear we need a self sustainable independant internet, free from government and corporate control...
And in today's day where most nations are industrialized but simply cannot stand against the might of those who came before them, they are in substandard conditions economically but industrialized.
But PlayStation, a console by that company who made your VCR and Discman, managed to get companies like Konami, Squaresoft and Namco to make exclusive games for them within the first year.
Something a lot of people don't consider is the controller was different and awesome. I think people liked how it felt compared to what else they could choose.
Wiz has always been all about his son.
Nobody for that matter.
Do alpacas always look like they're smiling? I had no idea!
He's so happy! I love the little "boop".
In Mumbai, people will soon be offered canned air.
There's some properly directed anger. Yes, focus it on the government for the mismanagement, which they're amazing at.
In the movie "The Big Short", the guy who predicted the 2008 housing market crash closed his hedge fund and began investing in water securities soon afterwards.
This is classic rent-seeking.
It's amazing what you can learn from these kinds of shenanigans.
Those ancient paintings on silk panels were not exaggerating the way things look there.
Oh it's just a word I made up whenever I'm in pain.
That claims process was difficult difficult lemon difficult.
Ah, are you two dating?
You're really thinking about leaving?
I didn't know we had two hour lunches!
The Elderly Japanese Man picks up the cone between thumb and forefinger.
A man possessed of some radical notions...
How could you get the carpet wrong?
Ariadne leans against the parapet, overlooking Paris.
She hands Cobb the pad, a touch pleased. Cobb solves the puzzle instantly, as before. Her smile falls.
A ramshackle balcony overlooking a bust street.
Browning puts his hand on Fischer's shoulder.
Ariadne is beside him. They stand on a cliff, dressed in white snowsuits, carrying white-painted weapons like World War 2 commandos.
He reaches into his pocket, takes out his pewter spinning top, lowers it to the table and spins it.
Up above? Listen to yourself. You judged me for believing the very same thing.
How long do we have?
I don't have enough time to listen to all of them.
Are you thick, or what?
Thousands of factory workers are facing redundancy.
When I was about halfway between twenty and thirty, I lived in a large, run-down house that other people thought was romantic.
Our silences really, not our voices, engaged in conversation. Though I hear you singing. Softly. Clearly.
We're not the types to go in and take something from old folks, not at all. We're upstanding end-of-the-era hippies who just need some help, that's all.
The corpus contains more than 560 million words of text and it is equally divided among spoken, fiction, popular magazines, newspapers, and academic texts.
ACT and SAT scores may be a useful indicator of how well they are doing so among the self-selected group of students that plan to attend college.
The Court was reluctant to insist that states rely on a particular metric when doing so would force a state to bake artificial distortions into its political structure.
There are a number of reasons for this relatively high level of LTV.
Another interesting observation is that on average club deals place management fees, while the performance fee is somewhat higher.
How do i adjust the converting options so it doesn't end up that big?
Jeremy Irons must have known he was in a dud, and decided to overact just to make it bearable.
Such a good character, great dad, husband, and investor.
I just watched that, again, that scene literally gave me a bit of a shocked thrill.
I guess I tend to think all these techniques used today are from today, it is pretty amazing to see them in the 50s.
Is it worth to watch the Twilight Zone?
I've messaged small presses via email and social media, still not hearing back from anyone.
Free myself from familiar circles where I no longer belong.
Escaping from the sirens' wail, avoiding crash upon the rocks.
I'm a published writer. Here is my advice: you need to go to book fares and meet editors face to face
Nowhere, because no one else has ever used it before.
I just meant that there are plenty of bad books that sell well to really well, so my stuff that is, if not great, better than theirs and has the potential to sell really, really well.
I don't see why we can't have both technical mastery and innovative expressions of unique experiences.
You probably think contemporary poets haven't already mastered and then dispensed with formalism in their unpublished volumes.
You lack the self-awareness to be good at this. Poetry requires introspection, you have none.
Having numb feeling in your feet is a sign of diabetes. Like how else could you wear two left footed shoes?
Where do mathematics come from?
He would surely have a sense of the sacred, as that appears to be an innate thing present in all cultures across time.
Yes, you and I may well disagree with his conclusions
What's your election night plan lads?
Despite Christopher Hitchens trying his whole life to get attention through shock value, he never said anything interesting.
After tonight's votes there will most probably be a hung parliament and you will keep on doing what you do best: arguing about Brexit.
It's like an endless cycle at this point.
A truly moral person is he who is able to act in such a way that his life path will be most preferable of all, and that, if he could have chosen all other paths, he would still prefer the life he lived.
Sorry, I don't own any non-stick cookware, and I'm not gonna start now.
You cannot make a McMuffin at home unless you're buying your buns from McDonald's. It's just an English muffin.
Why aren't you eating biscuits and gravy?
How do we global citizens convince more people of this infinitely diverse and colourful global villages to eat more bugs to fight the climate crisis that is set to destroy the world in less than 12 years?
Can I do anything useful with finely milled beans?
Unprecedented queues were seen at a number of polling stations this morning as wind and rain failed to deter voters from casting their ballot in the general election.
At the first December election since 1923, it was unclear whether the crowds pointed to a high turnout overall, or voters keen to avoid a post-work trip to the polling station in the dark.
Sanna Marin, the new Social Democrat prime minister, is leading a coalition dominated by women.
While many hail it as a historic day, the party's main opposition claims her leadership is only a fad, Charles Bremner writes.
Sir Andy Murray became so addicted to video games that it affected his performance on court.
Murray, the former Wimbledon men's singles tennis champion and winner of two other Grand Slam titles, admitted that he had been obsessed with gaming and would stay up into the early hours playing before big matches.
Thunberg, who was named Time magazine's Person of the Year earlier this week over Trump, has sternly castigated world leaders for not doing enough to combat the climate crisis, and memorably stared down Trump at the UN General Assembly in September.
Former Vice President Joe Biden, the Democratic presidential frontrunner who has often taken Trump to task for his behavior, chastised the President for his tweet.
Brindisi highlighted the various wins that moderates are pointing to as the impeachment push ramps up.
The key point is that we're in a world of binary outcomes.
If you ask me, compared to maybe two months ago, I'm a little more certain there will be some sort of deal.
The fundamentals can change with 140 characters.
It now wants to raise up to $260 million, nearly half of the original $504 million it had hoped to reap.
The declines led SoftBank to report an operating loss of nearly $9 billion for the July-September quarter.
We describe it as the person who influenced the years' events most, for better or for worse. But I really think of it as Time is about the people and ideas that shape the world and Person of the Year is about the people who shaped the year.
Having Smith there, though, was critically important to the network's journalists.
As a journalist, l am extremely grateful for this opportunity.
I don't have any pets. I used to have a few bugs, but they kept getting squashed.
I've never met him, but I do know his sister, Madame Macaroon. She's a little bit nutty, but sweet.
I would, if I could, but I can't, so I'll chant: 1, 2, 3, 4, 5.
I just compared all moms in history, and my conclusion is that yours is the best.
I'm pretty fond of my hometown, Seattle Washington.
Certain soliton solutions, as in the case of sine-Gordon kinks, have the interesting feature of keeping their shape unaltered after scattering with other solitons.
Nowadays, the properties of nonlinear configurations are well understood in a wide class of models with or without spontaneous symmetry breaking.
If this optimization is performed online from experiential data, then we may call this a learning strategy.
Thus, partial observation near the upright position introduces a large delay between observations and the region where actuation is most effective, making this a suitable problem to explore our timing-based feedforward learning strategy.
Another cost of trading is known as the price impact cost or the market impact cost.
Even before trades occur, market participants are mindful that their market orders may potentially impact prices; traders will take into account the market impact of their trades.
People arrived in twos and threes.
You can't put all the blame on him. It takes two to make a marriage.
The girl put her food on the floor, and lifted her mother.
She was a skin bag with chalk in it, far too light to be human.
In short, if we do not act boldly and swiftly, a bad situation could become dramatically worse.
To lower health care cost, cut medical errors, and improve care, we'll computerize the nation's health record in five years, saving billions of dollars in health care costs and countless lives.
No one policy or program will solve the challenges we face right now, nor will this crisis recede in a short period of time.
Every day my Iphone does something magical for me that just 10 years ago would have been impossible.
This is Shockwave. He is logical, a perfectionist, a schemer, a scientist, a big space gun, a spaceship, a tank and a guardian of Cybertron!
Happy Friday everyone!
I didn't even play the single player campaign but multiplayer was great!
Awesome!
Perfect!
Alright.
For sure.
Sure.
Well...
Absolutely.
Unfortunately, no.
I don't think so.
Yes.
No.
Really?
No way!
For brilliant image quality it has one high-precision aspheric lens to reduce chromatic aberration throughout the zoom range.
No one ever talks about them, but they are an important part of learning photography to anyone that is buying their first camera.
Let me guess, you bought a new Rebel camera and this is the lens it came with it (kind of like Mario games with Nintendo).
Free up your devices with the largest capacity and enhanced transfer speeds; great for 4K UHD video.
The genuine Samsung Evo Select line of Micro SD cards are great. They are reliable and they live up to the speed and capacity classifications they claim to.
They will work but they won't sound as proficient as it would with an apple product.
Right off the bat, this is where these headphone excel, and why I considered and bought them in the first place.
When I recieved this product I immediately started using it since after my last pair broke I had no replacement, so I had to use my iPhone without earphones for 2 weeks.
The ability to find your keys, or wallet, or purse or whatever, is a tremendous help.
So fast forward, I get this new removable battery version, and it sucks.
Every stitch is a story of time, every thread a piece of culture.
Thousands of years of customs that have ever existed in the land of Shu are sewn within Shu embroidery.
Is this woman from earth? From which planet does she come?
Who would have thought, trebuchets were the solution all along.
I really hope I'm alive when stuff like this starts happening.
The process of deducing the most efficient wing dimensions was fascinating.
This one is especially interesting to me as I am currently studying in my final year of aerospace engineering in the UK.
But like you, I have experimented with aerodynamics and control system before learning the theory behind it. Keep up the excellent videos!
You sound like a male version of Tina from Bob's Burgers.
Watching him silently and passionately mouthing the words with all the intensity that it was sung in the original song is something to behold.
Everyone here thinks the cellos are cool.
Seeing this live must've been a life changing experience.
Let's put mattresses over there so it sounds less weird.
I don't mean to exaggerate, but this game is perhaps the most beautiful game I have ever played.
The Kenora Thistles, officially the Thistles Hockey Club, were a Canadian ice hockey team based in Kenora, Ontario.
Donated in 1892 by Lord Stanley of Preston, the Governor General of Canada, the Stanley Cup was originally awarded to the top amateur team in Canada, who would then accept challenges from the winners of other leagues.
Before the attack, Amrani transferred money from his account to that of his girlfriend. On the morning of the attack, Amrani killed a 45-year-old woman in his apartment.
After the murder, he hid her body in his shed, then left his flat for the city centre, equipped with a backpack containing the weapons.
Situated on a terrace above the Connecticut River, Dartmouth's 269-acre main campus is in the rural Upper Valley region of New England.
Dartmouth taught its first African-American students in 1775 and 1808.
Dartmouth guarantees to meet 100% of the demonstrated need of every admitted student who applies for financial aid at the time of admission.
Koecher's absence from his home, work, and church activities in St. George, Utah, was not noted for several days; eventually, the homeowners' association of Anthem, where he had parked, got in touch with his employer and then his parents about the abandoned car and he was reported missing.
Further investigation found credit card and cell phone receipts and witness statements showing that in the week prior to his disappearance, Koecher had been driving great distances around Utah and Nevada, including almost 1,100 miles in one day.
Galactose sometimes abbreviated Gal, is a monosaccharide sugar that is about as sweet as glucose, and about 65% as sweet as sucrose.
Glucose is the primary metabolic fuel for humans. It is more stable than galactose and is less susceptible to the formation of nonspecific glycoconjugates, molecules with at least one sugar attached to a protein or lipid.
Hexanes are significant constituents of gasoline. They are all colorless liquids, odorless when pure, with boiling points between 50 and 70 degrees.
With partner Luca Lanotte, she is the 2014 World champion, the 2014 European champion, the 2015 Cup of China champion and a thirteen-time medalist on the Grand Prix series, and a seven-time Italian national champion.
Put simply, the Fourier transform is a way of splitting something up into a bunch of sine waves.
As usual, the name comes from some person who lived a long time ago called Fourier.
We need a lot of them this time, technically an infinite amount to perfectly represent it.
A secondary guitar can play these chords underneath it.
Both minor chords can be played as 7 chords.
The EM algorithm is used to find maximum likelihood parameters of a statistical model in cases where the equations cannot be solved directly.
To get started with Fancy Zones, you need to enable the utility in Power Toys settings and then invoke the Fancy Zones setup UI.
If this game infringes the copyright, please let me know.
Don't change anything in your Docker container image and minify it by up to 30 times and making it secure too!
However if I am going to run it over all that code, which is going to be computationally expensive I may as well try to get some use out of it.
Going much larger then 100 million repositories however is going to require some level of sharding.
If you're a maintainer of an open source repository on GitHub, you often want to make a small change to a pull request but don't want to wait for the original author to make changes or open a brand new pull request.
Velveting is a Chinese method of marinating which keeps delicate meat and seafood moist and tender during cooking.
Chef Ken Hom is a fan of velveting so why not give it a go?
Most recipes which call for strips or cubes of meat can be adapted to use this technique before cooking.
Guess who's just managed a 4K run on the new ankle.
Did you know that other than losing weight, experiencing snow can make you happy too?
On a dull December day, I thought we could do with a splash of colour to lighten the mood!
When friends visit and they bring you a cold hot chocolate...
import logging
def convert_with_word_level(text: str, conv_fn, eq_fn=None):
    """
    Converts a text with a text-conversion function (e.g. phonemize) and aligns the converted output with the
    input text at the word level.

    :param text: a single sentence to convert
    :param conv_fn: a text to text conversion function. It takes a text as input and returns the converted text as
    output. It must hold that:
        - No words are created from nothing in conversion
        - The order of converted words corresponds to the order of the words in the text
    :param eq_fn: an equality function for comparing words in the converted domain. Defaults to string compare.
    :return:
        - text_groups: the list of text groups. It holds that " ".join(text_groups) == text
        - conv: the result of the conversion function on the entire text, i.e. conv_fn(text).
        - conv_groups: the list of groups for the converted text. It holds that text_groups[i] maps to
        conv_groups[i], and that " ".join(g for g in conv_groups if g is not None) == conv. A group with value None
        implies that the corresponding text group maps to nothing in the converted output.
    """
    eq_fn = eq_fn if eq_fn else (lambda x, y: x == y)

    # Convert the complete text, then split both the text and its conversion on spaces
    conv = conv_fn(text)
    text_words = text.split(" ")
    conv_words = conv.split(" ")

    # The mapping is a list of (text position, conv position) boundaries. It is complete once its last boundary
    # sits at the end of both word lists.
    end_boundary = (len(text_words), len(conv_words))
    mapping = [(0, 0)]
    while mapping[-1] != end_boundary:
        # Retrieve the next group
        text_range, conv_range = _wl_sweep_search(mapping, text_words, conv_words, conv_fn, eq_fn)
        assert text_range, f"Internal error for text \"{text}\""

        # Refine the group until it is optimal or cannot be split any further
        refining = True
        while refining:
            text_pos, conv_pos = mapping[-1]
            if text_range == 1 or conv_range <= 1:
                # 1-x, x-1 or x-0 groups: optimal group, move on
                mapping.append((text_pos + text_range, conv_pos + conv_range))
                refining = False
            elif text_range == 2 and conv_range == 2:
                # 2-2 groups: a trivial case of pigeonhole principle: such a group is always separable.
                mapping.extend([(text_pos + 1, conv_pos + 1), (text_pos + 2, conv_pos + 2)])
                refining = False
            else:
                # The group is suboptimal: find a break point inside the group with an exhaustive search. On
                # success the first half is added to the mapping and the second half is examined next.
                mapping, text_range, conv_range = _wl_backtracking_search(
                    mapping, text_words, text_range, conv_words, conv_range, conv_fn, eq_fn
                )
                if not text_range:
                    logging.warning("Word-level mapper: got suboptimal solution")
                    refining = False

    # Build the text and conv groups from each pair of consecutive boundaries
    boundary_pairs = list(zip(mapping, mapping[1:]))
    text_groups = [" ".join(text_words[start[0]:end[0]]) for start, end in boundary_pairs]
    conv_groups = [
        " ".join(conv_words[start[1]:end[1]]) if start[1] != end[1] else None
        for start, end in boundary_pairs
    ]

    # Sanity checks: both lists of groups must reassemble into their source
    assert " ".join(text_groups) == text, f"Internal error for text \"{text}\""
    assert " ".join(g for g in conv_groups if g is not None) == conv, f"Internal error for text \"{text}\""

    return text_groups, conv, conv_groups
def _sweep_search_params_generator(mapping, n_text_words, max_prev_groups, max_forward_range):
"""
Generates forward and backward parameter values for the sweep search.
"""
max_prev_groups = min(max_prev_groups, len(mapping) - 1)
max_forward_range = min(max_forward_range, n_text_words - mapping[-1][0])
for i in range(1, n_text_words + 1):
forward = min(i, max_forward_range)
backward = min(i // 2, max_prev_groups)
yield backward, forward
if backward == max_prev_groups and forward == max_forward_range:
break
def _wl_sweep_search(mapping, text_words, conv_words, conv_fn, eq_fn, max_prev_groups=4, max_forward_range=8):
    """
    Looks for the next group in the sequence. Starting from the last position of the mapping, a window of text
    words is converted on its own and compared with the corresponding words of the full conversion. When they
    match, a group is found. When they do not, words surrounding the window must be influencing the conversion, so
    the window is widened with upcoming words and with previously mapped groups, and the comparison is repeated.
    With bounds high enough for the search parameters, this function is guaranteed to return a correct group.

    :param mapping: the mapping computed so far, as a list of (text position, conv position) boundaries
    :param text_words: the words of the text
    :param conv_words: the words of the conversion of the full text
    :param conv_fn: the text to text conversion function
    :param eq_fn: the equality function for comparing converted texts
    :param max_prev_groups: the maximum number of previous groups defined in the mapping to include in our search.
    :param max_forward_range: the maximum number of upcoming words to include in our search
    :return: the sizes of the group found, or (None, None) if the search parameters were not large enough to find
    one
        - text_range: the number of text words in the group, starting from mapping[-1][0]. It holds that
        1 <= text_range <= max_forward_range
        - conv_range: the number of converted words in the group, starting from mapping[-1][1]. It holds that
        0 <= conv_range
    """
    text_pos, conv_pos = mapping[-1]

    # Each pair of search parameters defines a window: <n_prev_groups> previously mapped groups followed by
    # <n_next_words> upcoming words. We stop at the first window that yields a valid solution.
    windows = _sweep_search_params_generator(mapping, len(text_words), max_prev_groups, max_forward_range)
    for n_prev_groups, n_next_words in windows:
        # Boundary at which the window begins, in both the text words and the converted words
        window_text_start, window_conv_start = mapping[-n_prev_groups - 1]

        # Convert the text of the window alone
        window_text = " ".join(text_words[window_text_start:text_pos + n_next_words])
        window_conv = conv_fn(window_text)

        # Take as many words from the full conversion as the window conversion holds, within what remains
        n_conv_words = min(window_conv.count(" ") + 1, len(conv_words) - window_conv_start)
        full_conv_part = " ".join(conv_words[window_conv_start:window_conv_start + n_conv_words])

        if eq_fn(full_conv_part, window_conv):
            # The converted words of the previous groups included in the window are not part of the new group
            return n_next_words, n_conv_words - conv_pos + window_conv_start

    # In case the search parameters are not large enough, the function may fail to find a group.
    return None, None
def _backtracking_group_generator(text_range, conv_range):
"""
Generates group guesses for the backtracking search
"""
for total_group_size in range(2, text_range + conv_range):
for i in range(1, total_group_size):
group = (i, total_group_size - i)
if group[0] < text_range and group[1] < conv_range:
yield group
def _wl_backtracking_search(mapping, text_words, text_range, conv_words, conv_range, conv_fn, eq_fn):
    """
    Attempts to split a suboptimal group (x-y group with x >= 2 and y >= 2) into two consecutive groups. Guesses
    are made as to where the group should be split, and each guess is tested for correctness.

    For example "on the internet" is phonemized into "ɔnðɪ ɪntɚnɛt", but "on the" becomes "ɔnðə", which is not the
    same as "ɔnðɪ".
    On our first attempt, we map [on] to [ɔnðɪ], and test if [the internet] becomes [ɪntɚnɛt]. It fails.
    On our second attempt, we map [on the] to [ɔnðɪ], and test if [internet] becomes [ɪntɚnɛt]. It passes, so we
    know that this grouping is correct.

    :param mapping: the mapping computed so far. It is not modified, a new list is returned.
    :param text_words: the words of the text
    :param text_range: the number of text words in the group to split
    :param conv_words: the words of the conversion of the full text
    :param conv_range: the number of converted words in the group to split
    :param conv_fn: the text to text conversion function
    :param eq_fn: the equality function for comparing converted texts
    :return:
        - mapping: the mapping updated with the first group found
        - text_range: the number of text words in the second group
        - conv_range: the number of converted words in the second group
    If no split is found, the mapping is updated with the whole suboptimal group and both ranges are None.
    """
    # Work on a copy of the mapping
    base_mapping = list(mapping)
    text_pos, conv_pos = base_mapping[-1]

    # Try every possible size for the first subgroup
    for first_text_size, first_conv_size in _backtracking_group_generator(text_range, conv_range):
        # Temporary mapping that includes the guessed first group
        candidate = base_mapping + [(text_pos + first_text_size, conv_pos + first_conv_size)]

        # Perform a sweep search, disallowing the use of any previous context. This ensures that the function will
        # succeed in finding a group only if the mapping given is accurate.
        next_text_range, next_conv_range = _wl_sweep_search(
            candidate, text_words, conv_words, conv_fn, eq_fn, max_prev_groups=0
        )

        # If the sweep search succeeds, the first group was correctly guessed and the sweep returned the second.
        if next_text_range:
            return candidate, next_text_range, next_conv_range

    # We couldn't improve the mapping somehow, we return the suboptimal group
    whole_group_end = (text_pos + text_range, conv_pos + conv_range)
    return base_mapping + [whole_group_end], None, None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment