Skip to content

Instantly share code, notes, and snippets.

@rygorous
Last active July 13, 2021 18:28
Show Gist options
  • Save rygorous/1d3c1614f9fab50149502e3339ebde83 to your computer and use it in GitHub Desktop.
Save rygorous/1d3c1614f9fab50149502e3339ebde83 to your computer and use it in GitHub Desktop.
BiDi processing excerpt
// Rule N0:
// "Any number of characters that had original bidirectional character type NSM prior to
// the application of W1 that immediately follow a paired bracket which changed to L or R
// under N0 should change to match the type of their preceding bracket."
//
// We don't store the character types as they were before W1 persistently. However, we do
// store the initial character types before any of the rules get applied (since they're
// required for processing of rules L1/L2). Now the only relevant way that a NSM type
// can get modified prior to rule W1 is due to directional overrides. But this can't
// affect us: if there's directional overrides active around the bracket pair, e.g.
//
// RLO ( a ) NSM
//
// then the character types for the brackets themselves would get overwritten and the
// PBA wouldn't trigger. This also goes for any positioning of an explicit override that
// affects one bracket but not the other. The only remaining possible position for an
// override that might cause an initial NSM to get overridden is something like this:
//
// ( a ) RLO NSM
//
// But an override in this position breaks the active level run; thus, the paired brackets
// and the following NSM wouldn't be part of the same isolating run sequence, so again
// we shouldn't trigger in that case. (To be even more specific: an overflow override
// wouldn't break the level run, but it also wouldn't enable directional override
// behavior in the first place, so we're still good.)
//
// Long story short: it's OK to use the flags derived from initial types since whenever
// an initial NSM got overwritten prior to W1, this path won't run.
static void ihud_bidi_resolve_paired_bracket(ihud_bidi_info *b, ihud_bidi_index start, U8 resolved_type, ihud_bidi_index irs_end)
{
ihud_bidi_index i;
b[start].type = resolved_type;
// Resolve a run of (NSM|BN)+ following the bracket
for (i = ihud_bidi_next(b, start); i < irs_end && (b[i].flags & (FLG_NSM | FLG_X9R)); i = ihud_bidi_next(b, i))
b[i].type = resolved_type;
}
static void ihud_bidi_resolve_weak_types_and_paired_brackets(ihud_bidi_info *b, const U32 *text, ihud_bidi_irs *irs)
{
// Our implementation of the Paired Bracket Algorithm needs to keep track of two things
// per-character:
// - Whether the most recent strong type output by the weak types phase was
// classified L or R (for the purposes of the PBA, EN and AN count as R).
// - Whether the current run (since the last opening/closing bracket) contains
// a strong type matching the embedding direction, a strong type opposite the
// embedding direction, or neither (neutral).
// We turn this into a single state machine. Since this depends on the current
// embedding direction, we make it part of the state too.
#define STATE_ID(prev_strong,embed_dir,run_type) ((prev_strong) + (embed_dir)*2 + (run_type)*4)
#define GET_PREV_STRONG(state_id) ((state_id)&1)
#define GET_EMBED_DIR(state_id) (((state_id)>>1)&1)
#define GET_RUN_TYPE(state_id) ((state_id)>>2)
static const U8 pba_state_table[2*2*4][8] = { // [cur_state][type]
// There's only three possible canonical inputs (L, R, N); EN and AN count as R.
// Also, AL and ES/ET are never produced as output of the weak types phase.
// We only including types 6 and 7 to make table rows contain a pow2 number
// of elements.
// Inputs (in order): L, R, (unused), NI, EN, AN, (unused), (unused)
#define ROW(next_L,next_R,next_N) { (next_L), (next_R), (next_N), (next_N), (next_R), (next_R), (next_N), (next_N) }
#define STATE(cur_strong,embed_dir,cur_run_type) ROW( \
STATE_ID(BIDI_L, embed_dir, (embed_dir==BIDI_L || cur_run_type==BIDI_NI) ? BIDI_L : cur_run_type), \
STATE_ID(BIDI_R, embed_dir, (embed_dir==BIDI_R || cur_run_type==BIDI_NI) ? BIDI_R : cur_run_type), \
STATE_ID(cur_strong, embed_dir, cur_run_type) \
)
// prev_strong embed_dir run_type
STATE(BIDI_L, BIDI_L, BIDI_L),
STATE(BIDI_R, BIDI_L, BIDI_L),
STATE(BIDI_L, BIDI_R, BIDI_L),
STATE(BIDI_R, BIDI_R, BIDI_L),
STATE(BIDI_L, BIDI_L, BIDI_R),
STATE(BIDI_R, BIDI_L, BIDI_R),
STATE(BIDI_L, BIDI_R, BIDI_R),
STATE(BIDI_R, BIDI_R, BIDI_R),
STATE(BIDI_L, BIDI_L, BIDI_NI), // (unused)
STATE(BIDI_R, BIDI_L, BIDI_NI), // (unused)
STATE(BIDI_L, BIDI_R, BIDI_NI), // (unused)
STATE(BIDI_R, BIDI_R, BIDI_NI), // (unused)
STATE(BIDI_L, BIDI_L, BIDI_NI),
STATE(BIDI_R, BIDI_L, BIDI_NI),
STATE(BIDI_L, BIDI_R, BIDI_NI),
STATE(BIDI_R, BIDI_R, BIDI_NI),
#undef STATE
#undef ROW
};
ihud_bidi_pb pb_stack[BIDI_PBA_MAX_DEPTH + 1];
ihud_bidi_index pb_stack_ind = 0;
ihud_bidi_index deferred_resolve_start = MAX_INDEX;
U8 deferred_resolve_type = 0; // just so it's initialized
ihud_bidi_index i, mark_i;
U8 st, o;
U8 embed_strong_type, pba_state;
ihud_bidi_index weak_tos;
RR_ASSERT(irs->sos == BIDI_L || irs->sos == BIDI_R);
pba_state = STATE_ID(irs->sos, BIDI_L + (irs->level & 1), BIDI_NI);
embed_strong_type = pba_state;
st = ihud_bidi_weak_state_machine[0].next[irs->sos];
mark_i = irs->first_ind; // just make sure it's initialized
weak_tos = MAX_INDEX; // start with clear weak bracket stack
for (i = irs->first_ind; i < irs->end_ind; i = ihud_bidi_next(b, i)) {
U8 orig_t = b[i].type; // initial type (used by Paired Bracket Algorithm)
U8 t = orig_t;
// Characters removed by rule X9 should not affect the weak state machine or the
// paired bracket algorithm, and are treated as neutrals by later stages.
if (t < BIDI_BN) {
t = (t < BIDI_LRI) ? t : BIDI_NI; // Pre-canonicalization: fold all Neutral/Isolating chars to BIDI_NI
// The entire "weak types" phase can be performed by a finite-state transducer
// (more precisely, a Moore machine) described by the table above (which is
// generated from an algorithmic description of the various weak type rules
// by tblgen.cpp).
//
// The transducer is basically a finite-state automaton that first performs
// a state transition based on the current input symbol, then outputs a result
// type based on the current state (after the transition).
//
// For most of the weak types, it should be straightforward to see how the
// rules map into this framework. The tricky part are rules W4 and W5, which
// (as described in the spec) require lookahead to decide which rule matches.
//
// Luckily, all cases where the lookahead matters result in a run of identical
// types. So we do something slightly different: in all look-ahead cases, the
// primary state machine generates an output symbol assuming that the rule
// will not match, but remembers the current position (this is "mark_i").
// If the following characters then mean that the look-ahead rule for W4/W5
// would have matched after all, we update all the output types between
// mark_i and the current cursor to their new value (which is the output of
// that state). It turns out that this formalism is sufficiently expressive
// for the weak types phase.
// Perform transition and handle output
st = ihud_bidi_weak_state_machine[st].next[t];
o = ihud_bidi_weak_state_machine[st].output;
t = o & 0x3f;
pba_state = pba_state_table[pba_state][t];
if (o & 0xc0) { // is there a mark action queued?
if ((o & 0xc0) == 0x40) // set mark
mark_i = i;
else { // resolve run from mark to current cursor
for (; mark_i < i; mark_i = ihud_bidi_next(b, mark_i))
b[mark_i].type = t;
}
}
} else
t = BIDI_XX; // tag deleted chars
b[i].type = t;
RR_ASSERT(t <= BIDI_AN);
// Paired bracket handling (rule N0)
//
// To understand how this works, let's first consider a simplified version. The actual paired
// bracket algorithm resolves brackets outside-in (ordered by their opening bracket position),
// but let's pretend for a minute that it resolves the brackets inside-out (ordered by their
// closing bracket position). In that setting, it is very natural to process characters in
// logical (memory) order, keep a stack of open brackets, and decide whether bracket pairs
// are L, R, or truly neutral at the time the closing bracket is encountered (i.e. when it's
// popped off the stack).
//
// Now between any pair of brackets, we want to know which (if any) strong types occur anywhere
// between them. The possible answers are, ordered by increasing priority:
// 1. There is no strong type anywhere between the bracket pair.
// 2. There are strong types opposite the current embedding direction ("o" in the spec), but none
// matching the embedding direction ("e" in the spec).
// 3. There are strong types matching the embedding direction.
//
// This is what the code calls the "run type". The run type can be one of L, R, or NI (standing
// for neutral). Whether L "beats" R or vice versa depends on the embedding direction.
//
// To know the final run type for all characters inside a bracket pair, we track two separate
// things:
// 1. The run type of all characters we've processed since the last bracket character. Any
// bracket character, open or closed brackets both. This is the run type inside "pba_state"
// and we reset it to NI (neutral) after every bracket character we encounter.
// 2. The run type for *all* characters we've seen since the opening bracket at the current top
// of the bracket stack. This is in "embed_strong_type". This includes all runs we've
// processed since then, including runs in nested brackets that we've since popped off the
// open bracket stack.
//
// pba_state is updated on a run-by-run basis; embed_strong_type accumulates information both from
// runs at the current level, and "inherits" information from deeper-nested levels. This takes care
// of rule N0.a: "Inspect the bidirectional types of the characters enclosed within the bracket pair".
//
// What happens next depends on the resulting embed_strong_type. If it's neutral, we're in case
// N0.d: the brackets stay neutral, nothing to do. If it's a strong type matching the embedding
// direction, we're in case N0.b: resolve the brackets to match the embedding direction. Also
// straightforward. So far, we're making these decisions purely based on what character types were
// within the bracket pair after the weak types phase, and nothing depends on the order we resolve
// brackets in. So even though we're processing inside-out instead of outside-in, for the cases
// discussed so far, we're doing the right thing.
//
// Rule N0.c is what makes things tricky; in particular, the "test for an established context with
// a preceding strong type" bit. Now, the "preceding strong type" can, for the most part, be
// answered by keeping track of the last strong type we've seen - which we also do in "pba_state"
// (the "prev strong" part). But there's a catch. Suppose we're looking at a case like this, inside
// a run with R embedding direction, and we're at the marked position (the *):
//
// L ( ( L )* R )
//
// We have a bracket with contents opposite the embedding direction, and the preceding strong type
// (outside both bracket pairs) matches - check and check. So we want to resolve the inner bracket
// pair to R, and that's what we do.
//
// But it turns out that, if we were to (correctly!) resolve the brackets from the outside in, we'd
// notice that the *outer* bracket pair contains a R-type character, and will thus cause both outer
// brackets to get resolved to R - which occurs after the L that we used to establish preceding
// context for the inner bracket. So once we close the outer (containing) bracket, we realize our
// mistake: the inner bracket pair should have been R after all.
//
// Luckily, resolving a bracket incorrect to the opposite of the embedding direction when it should
// have been equal to the embedding direction is the only such mistake we can make. If a bracket
// pair was deemed to have neutral contents (N0.d), it will always stay neutral. If we resolved it
// to the embedding direction, it will never get updated again either.
//
// So we can salvage this: if we're in a case where the resolved direction of a containing bracket
// (that we haven't resolved yet) would cause our resolved-to-opposite-the-embedding-direction
// bracket to flip sense (because such a bracket would establish a different preceding strong type),
// we need to be careful. This is the case when
// a) we're about to resolve a bracket to opposite the embedding direction,
// b) that bracket is nested inside at least one currently unresolved bracket,
// c) there were no strong-type characters between the containing and the nested openings brackets
// (which we can infer from pba_state at the time), meaning that whatever strong type the outer
// bracket gets resolved to will "cascade through" and establish a preceding strong type for the
// inner bracket.
//
// We refer to such bracket pairs as "weak brackets", and we keep a stack of them (using the existing
// link[] fields also used to connect pieces of IRSes; we only link ON-type characters here whereas
// the IRS links are on isolate initiators and BNs, so there is no overlap).
//
// Whenever an outer containing bracket gets resolved to a strong type matching the embedding dir,
// we pop all contained weak brackets and re-resolve them to match the embedding dir as well. If
// we pop all the way down to have no remaining open brackets, we can clear the stack: with no more
// prior positions (namely, positions of yet-unmatched open brackets) that might change their type
// and establish a different preceding strong type than before, all decisions made when the weak
// brackets were initially pushed on their stack are now final.
if (orig_t >= BIDI_ONpba_first && orig_t <= BIDI_ONpba_last && pb_stack_ind <= BIDI_PBA_MAX_DEPTH) {
U32 cp = text[i]; // current code point
// Normalize: as of Unicode 9.0.0, there is exactly one canonically equivalent bracket pair.
// (This special case is hardcoded in both ICU and the C BiDi ref impl brrule.c, so I don't feel
// guilty about handling it as a hardcoded special case here.)
if (cp >= 0x3008 && cp <= 0x3009)
cp = (cp - 0x3008) + 0x2329;
// if we have a deferred resolve, now is a good time to do it
if (deferred_resolve_start != MAX_INDEX) {
ihud_bidi_resolve_paired_bracket(b, deferred_resolve_start, deferred_resolve_type, irs->end_ind);
deferred_resolve_start = MAX_INDEX;
}
if (orig_t == BIDI_ONcpb) {
// Close bracket. Try to find matching one on stack.
ihud_bidi_index j = pb_stack_ind;
while (j-- > 0) {
ihud_bidi_index k;
ihud_bidi_pb *pbs = &pb_stack[j];
if (cp != pbs->close_cp)
continue;
// Fold run type since last bracket into embed_strong_type
embed_strong_type = pba_state_table[embed_strong_type][GET_RUN_TYPE(pba_state)];
// Pop back down to this level. To do this, we need to fold the previosly saved
// embedded strong type resolutions for the inbetween paired brackets we're
// closing into the current embed_strong_this_run (since they all count as within
// the bracket pair we've identified.)
for (k = pb_stack_ind; k > j + 1; --k)
embed_strong_type = pba_state_table[embed_strong_type][pb_stack[k - 1].str];
// Rule N0.b/N0.c: if we saw a strong type, resolve bracket type. If the strong type matches
// embedding dir, use it. If it doesn't, check preceding context. If both preceding
// context and text inside bracket agree, may use strong type disagreeing with
// embedding direction.
if (GET_RUN_TYPE(embed_strong_type) != BIDI_NI) {
// The embed strong type right now can either match the embedding direction, or be
// opposite it. If it matches, this computation simply returns the embedding
// direction (since it "wins" in case of ties). If it is opposite, the result
// depends on the saved pbs->pba_state, which contains the preceding strong type
// at the time the of the matched opening bracket.
//
// If the preceding strong type at the time of the matched opening bracket was
// equal to the embedding direction, that decides it: the resolved type is always
// going to be the embedding direction. If it was opposite, resolve to the
// opposite of the embedding direction, but see notes on weak brackets below.
U8 resolved_type = GET_RUN_TYPE(pba_state_table[embed_strong_type][GET_PREV_STRONG(pbs->pba_state)]);
// Rule N0: handling of NSMs following paired brackets
ihud_bidi_resolve_paired_bracket(b, pbs->open_ind, resolved_type, irs->end_ind);
// Defer resolving of close bracket: we haven't run the weak state machine for
// the following types yet. N0 runs after the weak types phase, so we wait to
// finish weak types processing for the characters that follow the close bracket
// before we perform the resolve (which might override the result types for
// characters that were NSMs).
deferred_resolve_start = i;
deferred_resolve_type = resolved_type;
TRACE("(%u,%u) paired bracket! resolved=%d embedding_dir=%d\n", pbs->open_ind, i, resolved_type, irs->level & 1);
// We resolved to a strong type; this establishes context for later brackets.
pba_state = STATE_ID(resolved_type, GET_EMBED_DIR(pba_state), BIDI_NI);
// Is the resolved type equal or opposite to the current embedding direction?
if (resolved_type == GET_EMBED_DIR(pba_state)) { // equal
// A containing resolved bracket matching the current embedding direction establishes
// context for all contained weak brackets; time to RE-RESOLVE THEM ALL!
// Once we've resolved them to the embedding direction, these brackets can never change
// type again, so we can discard their weak bracket records. So just keep popping down
// the stack.
while (weak_tos != pbs->open_weak_tos) {
TRACE(" re-resolve weak bracket at %u\n", weak_tos);
ihud_bidi_resolve_paired_bracket(b, weak_tos, resolved_type, irs->end_ind);
weak_tos = b[weak_tos].link;
}
} else { // opposite
// If the resolved type is opposite the current embedding direction and there are other
// brackets deeper on the stack, this is a "weak" decision: we can later find out that
// one of the containing brackets is strongly in the embedding direction, which in turn
// will retroactively establish context (as per Rule N0c) for the current bracket and make
// it flip direction. So if this is a weak decision, record the position of the opening and
// closing brackets so we can fix it up later in case we have to.
if (j > 0) { // other brackets are deeper on the stack; might have to revise.
// If there was no strong type between the containing opening bracket and this one,
// we have a weak bracket.
if (GET_RUN_TYPE(pbs->pba_state) == BIDI_NI) {
TRACE(" push weak bracket pair (%u,%u)\n", pbs->open_ind, i);
// Push onto weak bracket stack.
// NOTE: safe to use "link" for this since weak brackets only occur on ONs, whereas
// skips only occur on isolate initiators or chars removed by X9.
b[i].link = pbs->open_ind;
b[pbs->open_ind].link = weak_tos;
weak_tos = i;
}
} else {
// If the bracket we're closing is the last one on the stack, we can discard all pending
// weak bracket records; they cannot change anymore (since there are no more un-closed
// open brackets left that could establish prior context post-resolution)
weak_tos = MAX_INDEX;
TRACE(" discard weak brackets\n");
}
}
}
// Finally, merge strong type status from before this bracket pair into
// the current state, now that we actually pop.
embed_strong_type = pba_state_table[embed_strong_type][pb_stack[j].str];
TRACE(" post-pop estc=%d\n", GET_RUN_TYPE(embed_strong_type));
pb_stack_ind = j; // pop!
break;
}
} else {
// open bracket
static const signed char close_offs[4] = { 1,2,3,-1 }; // matches ON_opb*
ihud_bidi_pb *pbs = &pb_stack[pb_stack_ind++];
// Fold run type since last bracket into embed_strong_type
embed_strong_type = pba_state_table[embed_strong_type][GET_RUN_TYPE(pba_state)];
TRACE("open bracket pos=%u estc=%d\n", i, GET_RUN_TYPE(embed_strong_type));
RR_ASSERT(orig_t >= BIDI_ONopb1 && orig_t <= BIDI_ONopbn1);
pbs->open_ind = i;
pbs->open_weak_tos = weak_tos;
pbs->close_cp = cp + close_offs[orig_t - BIDI_ONopb1];
pbs->pba_state = pba_state;
pbs->str = GET_RUN_TYPE(embed_strong_type); // saved type from containing bracket
pba_state = STATE_ID(GET_PREV_STRONG(pba_state), GET_EMBED_DIR(pba_state), BIDI_NI); // reset run type to neutral
embed_strong_type = pba_state;
}
}
}
// NOTE: Rules W4/W5 might be in a lookahead state waiting for a following EN/AN,
// which gets resolved once we feed in eos.
//
// However, the current state machine implementation of these rules assumes there is
// no following EN/AN (and outputs an ON type); in case that turns out to be wrong, we
// mark the position where we need to start fixing up and override the whole run.
// But eos is always L or R, neither of which matches EN/AN, so at this point there's
// simply no chance that the tentative decision for ON can get revised.
// If we have a pending deferred bracket resolve, handle it
if (deferred_resolve_start != MAX_INDEX)
ihud_bidi_resolve_paired_bracket(b, deferred_resolve_start, deferred_resolve_type, irs->end_ind);
#undef STATE_ID
#undef GET_PREV_STRONG
#undef GET_EMBED_DIR
#undef GET_RUN_TYPE
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment