wchargin/Cargo.toml

## Cargo.toml
[dependencies]
pulldown-cmark = "0.8.0"

## crosswalk.rs
//! Cross-references that are robust to referent renames.
//!
//! In a body of Crosswalk text, references can be detected, resolved, and expanded with only
//! simple string algorithms; Markdown parsing is only needed for encoding. If a referent is
//! renamed or otherwise updated, no changes are needed to Crosswalk text. The Crosswalk
//! encoding/decoding cycle is lossless except for the minimal updates needed to reflect changes in
//! referents.
//!
//! Thus, Crosswalk is suited for the following flow:
//!
//! -   When an author writes Markdown text, encode it into Crosswalk and store the resulting blob
//!     in the database. This requires application-specific, Markdown-aware logic to identify
//!     cross-references and resolve their canonical IDs.
//!
//! -   When a viewer wants to render that text, use simple string manipulation to identify
//!     references, and then some application-specific, Markdown-*unaware* logic to resolve those
//!     to referents in the database. With PostgreSQL or similar, this can all be done with a
//!     simple user-defined function within the database layer. More generally, it can be done on
//!     the application backend at the cost of an extra database query: one to fetch the Crosswalk
//!     text, and one more to fetch data for all referents. The result is Markdown source; render
//!     it as desired.
//!
//! -   When the author wants to edit that text, decode the Crosswalk as before, and present the
//!     Markdown source for editing instead of rendering it.
//!
//! It's called Crosswalk because it makes it easy to walk occurrences of cross-references.
//!
//! # Encoding
//!
//! Crosswalk has one metacharacter, `@`. Tokens are:
//!
//! -   `@z`: Literal `@` in the source (anywhere, including in code blocks, link labels, etc.).
//! -   `@{REF}`: Cross-reference. The string `REF` should match `[^@{}]*`.
//! -   `@[HEADER]LOOKBEHIND`: Special affordance for Markdown shortcut/collapsed links; see below.
//!
//! So, this Markdown document:
//!
//! ```markdown
//! See [@alice's post about SETI@home][seti@home] for details.
//!
//! [seti@home]: https://user@example.com/index.html
//! ```
//!
//! ...might be stored at rest as this Crosswalk text:
//!
//! ```markdown
//! See [@{user:123}'s post about SETI@zhome][seti@zhome] for details.
//!
//! [seti@zhome]: https://user@zexample.com/index.html
//! ```
//!
//! Finding references is as simple as finding all matches of the regex `@{([^}]*)}`, and then the
//! IDs (`user:123`) can be parsed according to the application semantics.
//!
//! The lexical syntax is chosen to be concise, evocative, and readable. As a bonus, it is also basically
//! Markdown-compatible, even with extensions that auto-render `user@example.com` as mailto links,
//! since `user@zexample.com` has the same syntactic structure and is still a valid email address.
//!
//! ## Shortcut and collapsed links
//!
//! The only tricky bit involves Markdown shortcut links and collapsed links, like `[LINK]` and
//! `[LINK][]`, respectively. The problem here is that the `LINK` text is reinterpreted in two
//! contexts: as an inline fragment that's actually rendered, and as a link label. Cross-references
//! should be resolved in the rendered text, but it's not clear that they should be resolved in
//! link labels, and most Markdown parsers don't make it easy to work with link labels, anyway: the
//! syntax trees that they emit are not concrete enough.
//!
//! The third token type addresses this issue. This Markdown document:
//!
//! ```markdown
//! Visit [@alice's blog]!
//!
//! [@alice's blog]: ...
//! ```
//!
//! ...might encode to:
//!
//! ```markdown
//! Visit [@{user:123}'s blog]@[16s][@zalice's blog]!
//!
//! [@zalice's blog]: ...
//! ```
//!
//! The sequence `@[16s][@zalice's blog]` is one Crosswalk token. It means: "to expand this token,
//! if the string `[@alice's blog]` has just been emitted to the output stream, then emit nothing
//! (for shortcut links) or `[]` (for collapsed links); otherwise, emit `[@alice's blog]`". The
//! effect is that if `@alice` is renamed to `@alicia`, then the document decodes to:
//!
//! ```markdown
//! Visit [@alicia's blog][@alice's blog]!
//!
//! [@alice's blog]: ...
//! ```
//!
//! ...which renders with the updated names and is structurally consistent with the original
//! source, without the need to directly inspect or modify the link reference definitions.
//!
//! More precisely, such a token is composed of:
//!
//! -   a literal `@[` sequence introducing the token;
//! -   an ASCII decimal number specifying the number of bytes in the UTF-8 encoding of the
//!     payload;
//! -   one of the characters `s` (for "shortcut") or `c` (for "collapsed"), indicating the link
//!     type;
//! -   a literal `]` character; and
//! -   the raw payload, with semantics as described above.
//!
//! # Grammar
//!
//! ```none
//! document ::= token*
//! token ::= literal | expansion | linkref
//!
//! raw ::= any single Unicode scalar value EXCEPT "@"
//! escape = "@z"
//!
//! literal ::= raw | escape
//!
//! expansion ::= "@{" (target: reference) "}"
//! reference ::= any string matching /[^@{}]*/
//!
//! linkref ::= "@[" (n: usize) linkref-sigil "]" (lookback: literal*)
//!     (where the UTF-8 length of `lookback` is `n`)
//! linkref-sigil ::= sigil-shortcut | sigil-collapsed
//! sigil-shortcut = "s"
//! sigil-collapsed = "c"
//! ```
//!
//! # Decoding semantics
//!
//! To decode a document, decode all tokens consecutively. To decode a token, if the token is...
//!
//! -   a raw character `c`: emit `c`;
//! -   an `@z` escape: emit `"@"`;
//! -   an expansion with reference `ref`: look up and emit the referent: e.g., if `ref` is
//!     `"user:123"`, this might emit `"@alice"`;
//! -   a link reference with kind sigil `k` and payload `p`:
//!     -   if `p` is not a suffix of the stream of emitted characters, emit `p`; else
//!     -   if `k` is `"s"` (shortcut link), emit nothing; else
//!     -   if `k` is `"c"` (collapsed link), emit `"[]"`.

use pulldown_cmark::{Event, LinkType, Parser, Tag};

/// A single pending edit to a source string: replace `len` bytes starting at byte offset
/// `index` with the text in `new`.
#[derive(Debug)]
struct Splice {
    /// Byte offset into the input at which the replaced span begins.
    index: usize,
    /// Number of input bytes consumed by the replacement (zero for a pure insertion).
    len: usize,
    /// Text emitted in place of the consumed span.
    new: String,
}

/// Escapes every literal `@` in `s` as the two-character sequence `@z`, leaving all other
/// characters untouched.
fn escape_arrobas(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '@' => escaped.push_str("@z"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Reverses `@`-escaping: every occurrence of `@z` in `s` (scanning left to right, without
/// overlap) becomes a single `@`.
fn unescape_arrobas(s: &str) -> String {
    let pieces: Vec<&str> = s.split("@z").collect();
    pieces.join("@")
}

/// Stub lookup table standing in for application-specific reference resolution.
///
/// Each entry pairs a referent as it appears in `@`-escaped source text (first element) with the
/// canonical reference ID that is stored inside an `@{...}` token (second element).
const SAMPLE_PATTERNS: &[(&str, &str)] = &[
    ("@zalice", "user:12345"),
    ("@zbartholomew", "user:7"),
    ("@zcheryl", "user:67890"),
    ("EXAMPLE", "ex"),
];

/// Parses `@`-escaped Markdown `s` and returns the splices that turn it into Crosswalk text.
///
/// Text events outside code blocks are scanned for every needle in `SAMPLE_PATTERNS`; each hit
/// becomes a splice replacing the needle with an `@{ID}` token. When a shortcut or collapsed link
/// ends and the most recently recorded splice falls inside that link's source range, an extra
/// `@[LENs]...` / `@[LENc]...` token carrying the original link source is spliced in at the end of
/// the range. Splices are returned in the order the parser reported the events.
fn parse_splices(s: &str) -> Vec<Splice> {
    let mut splices = Vec::new();
    let mut inside_code_block = false;
    for (event, span) in Parser::new(s).into_offset_iter() {
        match event {
            Event::Start(Tag::CodeBlock(_)) => inside_code_block = true,
            Event::End(Tag::CodeBlock(_)) => inside_code_block = false,
            Event::Text(text) if !inside_code_block => {
                // stub logic for identifying and canonicalizing cross-references
                for (offset, _) in text.char_indices() {
                    let rest = &text[offset..];
                    for &(needle, id) in SAMPLE_PATTERNS {
                        if rest.starts_with(needle) {
                            splices.push(Splice {
                                index: span.start + offset,
                                len: needle.len(),
                                new: format!("@{{{}}}", id),
                            });
                        }
                    }
                }
            }
            Event::End(Tag::Link(link_type, _, _)) => {
                // Only links whose text contained a cross-reference need a linkref token.
                let has_cross_reference = match splices.last() {
                    Some(latest) => span.contains(&latest.index),
                    None => false,
                };
                if !has_cross_reference {
                    continue;
                }
                // The sigil records the link flavor; the second element is how many source bytes
                // after the range the token replaces.
                let (sigil, consumed) = match link_type {
                    LinkType::Shortcut => ('s', 0),
                    LinkType::Collapsed => ('c', 2),
                    _ => continue,
                };
                // NOTE(review): relies on the offset range reported for the link covering exactly
                // the text that should become the token payload; confirm against pulldown-cmark.
                let link_source = &s[span.clone()];
                splices.push(Splice {
                    index: span.end,
                    len: consumed,
                    new: format!("@[{}{}]{}", link_source.len(), sigil, link_source),
                })
            }
            _ => (),
        }
    }
    splices
}

/// Builds a new string from `input` with every splice applied.
///
/// Splices are processed in slice order: the untouched input between the previous splice and
/// `splice.index` is copied, then `splice.new` is emitted, and reading resumes `splice.len` bytes
/// past `splice.index`. Whatever input remains after the last splice is copied verbatim.
fn apply_splices(input: &str, splices: &[Splice]) -> String {
    let mut output = String::new();
    let cursor = splices.iter().fold(0usize, |cursor, splice| {
        output.push_str(&input[cursor..splice.index]);
        output.push_str(&splice.new);
        splice.index + splice.len
    });
    output.push_str(&input[cursor..]);
    output
}

/// Decodes Crosswalk text back into Markdown source.
///
/// Every `@` in `input` introduces one token:
///
/// -   `@z` emits a literal `@`;
/// -   `@{REF}` emits the referent for `REF` (resolved here by stub logic over
///     `SAMPLE_PATTERNS`, with `<???>` for unknown references);
/// -   `@[Ns]PAYLOAD` / `@[Nc]PAYLOAD` reads `N` bytes of payload, unescapes it, and emits it
///     unless the output already ends with it, in which case a shortcut (`s`) emits nothing and
///     a collapsed (`c`) link emits `[]`.
///
/// All other text is copied through unchanged.
///
/// # Panics
///
/// Panics on malformed input: an `@` that is not followed by `z`, `{`, or `[`; an unterminated
/// `@{` or `@[` header; a linkref header with a missing/unknown sigil or a non-numeric length; or
/// a linkref whose declared payload length runs past the end of `input` or splits a character.
fn decode(input: &str) -> String {
    let mut result = String::new();
    let mut read_up_to: usize = 0;
    for (i, _) in input.match_indices('@') {
        if i < read_up_to {
            // This `@` sits inside a token that was already consumed, like the "@z" in
            // "@[7s][@zaha]".
            continue;
        }
        result.push_str(&input[read_up_to..i]);
        // Everything after the `@`; offsets computed below are relative to `i + 1`.
        let suffix = &input[i + 1..];
        let consumed = match suffix.as_bytes().first() {
            Some(b'z') => {
                result.push('@');
                1
            }
            Some(b'{') => {
                let close = suffix.find('}').expect("unmatched curly");
                let reference = &suffix[1..close];

                // stub logic for resolving a referent
                let replacement = SAMPLE_PATTERNS
                    .iter()
                    .find(|&&(_, id)| id == reference)
                    .map(|&(referent, _)| {
                        let referent = unescape_arrobas(referent);
                        if referent == "@bartholomew" {
                            String::from("@bart") // rename!
                        } else {
                            referent
                        }
                    })
                    .unwrap_or_else(|| String::from("<???>"));

                result.push_str(&replacement);
                close + 1
            }
            Some(b'[') => {
                let close_header = suffix.find(']').expect("unmatched bracket");
                // Header is "<decimal length><sigil>"; `strip_suffix` keeps this safe even if
                // the byte before `]` belongs to a multi-byte character.
                let header = &suffix[1..close_header];
                let (len_digits, is_collapsed) = if let Some(digits) = header.strip_suffix('c') {
                    (digits, true)
                } else if let Some(digits) = header.strip_suffix('s') {
                    (digits, false)
                } else {
                    let sigil = header
                        .chars()
                        .last()
                        .map(|c| c.to_string())
                        .unwrap_or_default();
                    panic!("bad sigil: {:?}", sigil);
                };
                let len: usize = len_digits.parse().expect("bad linkref len");
                let payload_start = close_header + 1;
                let raw_payload = payload_start
                    .checked_add(len)
                    .and_then(|payload_end| suffix.get(payload_start..payload_end))
                    .expect("linkref payload out of bounds");
                let payload = unescape_arrobas(raw_payload);
                match (result.ends_with(&payload), is_collapsed) {
                    // The rendered link text changed, so the original label must be spelled out.
                    (false, _) => result.push_str(&payload),
                    (true, true) => result.push_str("[]"),
                    (true, false) => (),
                };
                payload_start + len
            }
            other => panic!("bad discriminant: {:?}", other),
        };
        read_up_to = i + 1 + consumed;
    }
    result.push_str(&input[read_up_to..]);
    result
}

/// Checks that `apply_splices` performs replacements and a zero-length insertion in order,
/// leaving the text between splices intact.
#[cfg(test)]
#[test]
fn test_apply_splices() {
    let input = "See @alice's post and [@bartholomew's reply] to @cheryl.";
    let link_text = "[@bartholomew's reply]";
    let linkref_token = format!("@[{}s]{}", link_text.len(), link_text);

    // Splice replacing the first occurrence of `needle` in `input` with `new`.
    let replace = |needle: &str, new: &str| Splice {
        index: input.find(needle).unwrap(),
        len: needle.len(),
        new: String::from(new),
    };
    let splices = vec![
        replace("@alice", "@{user:12345}"),
        replace("@bartholomew", "@{user:7}"),
        // Pure insertion immediately after the link's closing bracket.
        Splice {
            index: input.find("]").unwrap() + 1,
            len: 0,
            new: linkref_token.clone(),
        },
        replace("@cheryl", "@{user:67890}"),
    ];

    let expected = format!(
        "See @{{user:12345}}'s post and [@{{user:7}}'s reply]{} to @{{user:67890}}.",
        linkref_token
    );
    let output = apply_splices(input, &splices);
    assert_eq!(output, expected);
}

/// Demo driver: prints a sample Markdown document, the splices computed for its escaped form,
/// the resulting Crosswalk encoding, and finally the decoded Markdown, separated by `---` rules.
fn main() {
    let raw_input = "\
See @alice's [post] and [@bartholomew and @cheryl's reply] for details @ 6PM.

```text
yo we got @alice and @bartholomew in the code block, too
```

See EXAMPLE@EXAMPLE by @bartholomew for more, but don't touch `EXAMPLE @alice @bartholomew`.

[@bartholomew and @cheryl's reply]: https://example.com/1
[post]: https://example.com/2
";
    // Section separator; `println!` appends one more newline after it.
    const RULE: &str = "\n---\n";

    print!("{}", raw_input);
    println!("{}", RULE);

    // Escape first so that the parser and the splice offsets both see the escaped text.
    let escaped = escape_arrobas(raw_input);
    let splices = parse_splices(&escaped);
    for splice in splices.iter() {
        println!("{:?}", splice);
    }
    println!("{}", RULE);

    let encoded = apply_splices(&escaped, &splices);
    print!("{}", encoded);
    println!("{}", RULE);

    print!("{}", decode(&encoded));
}
	//! Cross-references that are robust to referent renames.
	//!
	//! In a body of Crosswalk text, references can be detected, resolved, and expanded with only
	//! simple string algorithms; Markdown parsing is only needed for encoding. If a referent is
	//! renamed or otherwise updated, no changes are needed to Crosswalk text. The Crosswalk
	//! encoding/decoding cycle is lossless except for the minimal updates needed to reflect changes in
	//! referents.
	//!
	//! Thus, Crosswalk is suited for the following flow:
	//!
	//! - When an author writes Markdown text, encode it into Crosswalk and store the resulting blob
	//! in the database. This requires application-specific, Markdown-aware logic to identify
	//! cross-references and resolve their canonical IDs.
	//!
	//! - When a viewer wants to render that text, use simple string manipulation to identify
	//! references, and then some application-specific, Markdown-*unaware* logic to resolve those
	//! to referents in the database. With PostgreSQL or similar, this can all be done with a
	//! simple user-defined function within the database layer. More generally, it can be done on
	//! the application backend at the cost of an extra database query: one to fetch the Crosswalk
	//! text, and one more to fetch data for all referents. The result is Markdown source; render
	//! it as desired.
	//!
	//! - When the author wants to edit that text, decode the Crosswalk as before, and present the
	//! Markdown source for editing instead of rendering it.
	//!
	//! It's called Crosswalk because it makes it easy to walk occurrences of cross-references.
	//!
	//! # Encoding
	//!
	//! Crosswalk has one metacharacter, `@`. Tokens are:
	//!
	//! - `@z`: Literal `@` in the source (anywhere, including in code blocks, link labels, etc.).
	//! - `@{REF}`: Cross-reference. The string `REF` should match `[^@{}]*`.
	//! - `@[HEADER]LOOKBEHIND`: Special affordance for Markdown shortcut/collapsed links; see below.
	//!
	//! So, this Markdown document:
	//!
	//! ```markdown
	//! See [@alice's post about SETI@home][seti@home] for details.
	//!
	//! [seti@home]: https://user@example.com/index.html
	//! ```
	//!
	//! ...might be stored at rest as this Crosswalk text:
	//!
	//! ```markdown
	//! See [@{user:123}'s post about SETI@zhome][seti@zhome] for details.
	//!
	//! [seti@zhome]: https://user@zexample.com/index.html
	//! ```
	//!
	//! Finding references is as simple as finding all matches of the regex `@{([^}]*)}`, and then the
	//! IDs (`user:123`) can be parsed according to the application semantics.
	//!
	//! The lexical syntax is chosen to be concise, evocative, and readable. As a bonus, it is also basically
	//! Markdown-compatible, even with extensions that auto-render `user@example.com` as mailto links,
	//! since `user@zexample.com` has the same syntactic structure and is still a valid email address.
	//!
	//! ## Shortcut and collapsed links
	//!
	//! The only tricky bit involves Markdown shortcut links and collapsed links, like `[LINK]` and
	//! `[LINK][]`, respectively. The problem here is that the `LINK` text is reinterpreted in two
	//! contexts: as an inline fragment that's actually rendered, and as a link label. Cross-references
	//! should be resolved in the rendered text, but it's not clear that they should be resolved in
	//! link labels, and most Markdown parsers don't make it easy to work with link labels, anyway: the
	//! syntax trees that they emit are not concrete enough.
	//!
	//! The third token type addresses this issue. This Markdown document:
	//!
	//! ```markdown
	//! Visit [@alice's blog]!
	//!
	//! [@alice's blog]: ...
	//! ```
	//!
	//! ...might encode to:
	//!
	//! ```markdown
	//! Visit [@{user:123}'s blog]@[16s][@zalice's blog]!
	//!
	//! [@zalice's blog]: ...
	//! ```
	//!
	//! The sequence `@[16s][@zalice's blog]` is one Crosswalk token. It means: "to expand this token,
	//! if the string `[@alice's blog]` has just been emitted to the output stream, then emit nothing
	//! (for shortcut links) or `[]` (for collapsed links); otherwise, emit `[@alice's blog]`". The
	//! effect is that if `@alice` is renamed to `@alicia`, then the document decodes to:
	//!
	//! ```markdown
	//! Visit [@alicia's blog][@alice's blog]!
	//!
	//! [@alice's blog]: ...
	//! ```
	//!
	//! ...which renders with the updated names and is structurally consistent with the original
	//! source, without the need to directly inspect or modify the link reference definitions.
	//!
	//! More precisely, such a token is composed of:
	//!
	//! - a literal `@[` sequence introducing the token;
	//! - an ASCII decimal number specifying the number of bytes in the UTF-8 encoding of the
	//! payload;
	//! - one of the characters `s` (for "shortcut") or `c` (for "collapsed"), indicating the link
	//! type;
	//! - a literal `]` character; and
	//! - the raw payload, with semantics as described above.
	//!
	//! # Grammar
	//!
	//! ```none
	//! document ::= token*
	//! token ::= literal | expansion | linkref
	//!
	//! raw ::= any single Unicode scalar value EXCEPT "@"
	//! escape = "@z"
	//!
	//! literal ::= raw | escape
	//!
	//! expansion ::= "@{" (target: reference) "}"
	//! reference ::= any string matching /[^@{}]*/
	//!
	//! linkref ::= "@[" (n: usize) linkref-sigil "]" (lookback: literal*)
	//! (where the UTF-8 length of `lookback` is `n`)
	//! linkref-sigil ::= sigil-shortcut | sigil-collapsed
	//! sigil-shortcut = "s"
	//! sigil-collapsed = "c"
	//! ```
	//!
	//! # Decoding semantics
	//!
	//! To decode a document, decode all tokens consecutively. To decode a token, if the token is...
	//!
	//! - a raw character `c`: emit `c`;
	//! - an `@z` escape: emit `"@"`;
	//! - an expansion with reference `ref`: look up and emit the referent: e.g., if `ref` is
	//!   `"user:123"`, this might emit `"@alice"`;
	//! - a link reference with kind sigil `k` and payload `p`:
	//!     - if `p` is not a suffix of the stream of emitted characters, emit `p`; else
	//!     - if `k` is `"s"` (shortcut link), emit nothing; else
	//!     - if `k` is `"c"` (collapsed link), emit `"[]"`.

	use pulldown_cmark::{Event, LinkType, Parser, Tag};

	/// A single pending edit to a source string: replace `len` bytes starting at byte offset
	/// `index` with the text in `new`.
	#[derive(Debug)]
	struct Splice {
	/// Byte offset into the input at which the replaced span begins.
	index: usize,
	/// Number of input bytes consumed by the replacement (zero for a pure insertion).
	len: usize,
	/// Text emitted in place of the consumed span.
	new: String,
	}

	/// Escapes every literal `@` in `s` as the two-character sequence `@z`, leaving all other
	/// characters untouched.
	fn escape_arrobas(s: &str) -> String {
		let mut escaped = String::with_capacity(s.len());
		for ch in s.chars() {
			match ch {
				'@' => escaped.push_str("@z"),
				other => escaped.push(other),
			}
		}
		escaped
	}

	/// Reverses `@`-escaping: every occurrence of `@z` in `s` (scanning left to right, without
	/// overlap) becomes a single `@`.
	fn unescape_arrobas(s: &str) -> String {
		let pieces: Vec<&str> = s.split("@z").collect();
		pieces.join("@")
	}

	/// Stub lookup table standing in for application-specific reference resolution.
	///
	/// Each entry pairs a referent as it appears in `@`-escaped source text (first element) with
	/// the canonical reference ID that is stored inside an `@{...}` token (second element).
	const SAMPLE_PATTERNS: &[(&str, &str)] = &[
	("@zalice", "user:12345"),
	("@zbartholomew", "user:7"),
	("@zcheryl", "user:67890"),
	("EXAMPLE", "ex"),
	];

	/// Parses `@`-escaped Markdown `s` and returns the splices that turn it into Crosswalk text.
	///
	/// Text events outside code blocks are scanned for every needle in `SAMPLE_PATTERNS`; each
	/// hit becomes a splice replacing the needle with an `@{ID}` token. When a shortcut or
	/// collapsed link ends and the most recently recorded splice falls inside that link's source
	/// range, an extra `@[LENs]...` / `@[LENc]...` token carrying the original link source is
	/// spliced in at the end of the range. Splices are returned in the order the parser reported
	/// the events.
	fn parse_splices(s: &str) -> Vec<Splice> {
		let mut result = Vec::new();
		let mut in_code_block = false;
		for (event, loc) in Parser::new(s).into_offset_iter() {
			match event {
				Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
				Event::End(Tag::CodeBlock(_)) => in_code_block = false,
				Event::Text(text) if !in_code_block => {
					// stub logic for identifying and canonicalizing cross-references
					for (index, _) in text.char_indices() {
						for &(needle, replacement) in SAMPLE_PATTERNS {
							if !text[index..].starts_with(needle) {
								continue;
							}
							result.push(Splice {
								index: index + loc.start,
								len: needle.len(),
								new: format!("@{{{}}}", replacement),
							});
						}
					}
				}
				Event::End(Tag::Link(link_type, _, _)) => {
					// NOTE(review): relies on the offset range reported for the link covering
					// exactly the text that should become the token payload; confirm against
					// pulldown-cmark.
					let link_text_source = &s[loc.clone()];
					// If this link text contained no cross-references, nothing to do here.
					if result.last().map_or(true, |last| !loc.contains(&last.index)) {
						continue;
					}
					// The sigil records the link flavor; the second element is how many source
					// bytes after the range the token replaces.
					let (kind, replace_len) = match link_type {
						LinkType::Shortcut => ('s', 0),
						LinkType::Collapsed => ('c', 2),
						_ => continue,
					};
					result.push(Splice {
						index: loc.end,
						len: replace_len,
						new: format!("@[{}{}]{}", link_text_source.len(), kind, link_text_source),
					})
				}
				_ => (),
			}
		}
		result
	}

	/// Builds a new string from `input` with every splice applied.
	///
	/// Splices are processed in slice order: the untouched input between the previous splice and
	/// `splice.index` is copied, then `splice.new` is emitted, and reading resumes `splice.len`
	/// bytes past `splice.index`. Whatever input remains after the last splice is copied verbatim.
	fn apply_splices(input: &str, splices: &[Splice]) -> String {
		let mut output = String::new();
		let cursor = splices.iter().fold(0usize, |cursor, splice| {
			output.push_str(&input[cursor..splice.index]);
			output.push_str(&splice.new);
			splice.index + splice.len
		});
		output.push_str(&input[cursor..]);
		output
	}

	/// Decodes Crosswalk text back into Markdown source.
	///
	/// Every `@` in `input` introduces one token:
	///
	/// -   `@z` emits a literal `@`;
	/// -   `@{REF}` emits the referent for `REF` (resolved here by stub logic over
	///     `SAMPLE_PATTERNS`, with `<???>` for unknown references);
	/// -   `@[Ns]PAYLOAD` / `@[Nc]PAYLOAD` reads `N` bytes of payload, unescapes it, and emits
	///     it unless the output already ends with it, in which case a shortcut (`s`) emits
	///     nothing and a collapsed (`c`) link emits `[]`.
	///
	/// All other text is copied through unchanged.
	///
	/// # Panics
	///
	/// Panics on malformed input: an `@` that is not followed by `z`, `{`, or `[`; an
	/// unterminated `@{` or `@[` header; a linkref header with a missing/unknown sigil or a
	/// non-numeric length; or a linkref whose declared payload length runs past the end of
	/// `input` or splits a character.
	fn decode(input: &str) -> String {
		let mut result = String::new();
		let mut read_up_to: usize = 0;
		for (i, _) in input.match_indices('@') {
			if i < read_up_to {
				// This `@` sits inside a token that was already consumed, like the "@z" in
				// "@[7s][@zaha]".
				continue;
			}
			result.push_str(&input[read_up_to..i]);
			// Everything after the `@`; offsets computed below are relative to `i + 1`.
			let suffix = &input[i + 1..];
			let consumed = match suffix.as_bytes().first() {
				Some(b'z') => {
					result.push('@');
					1
				}
				Some(b'{') => {
					let close = suffix.find('}').expect("unmatched curly");
					let reference = &suffix[1..close];

					// stub logic for resolving a referent
					let replacement = SAMPLE_PATTERNS
						.iter()
						.find(|&&(_, id)| id == reference)
						.map(|&(referent, _)| {
							let referent = unescape_arrobas(referent);
							if referent == "@bartholomew" {
								String::from("@bart") // rename!
							} else {
								referent
							}
						})
						.unwrap_or_else(|| String::from("<???>"));

					result.push_str(&replacement);
					close + 1
				}
				Some(b'[') => {
					let close_header = suffix.find(']').expect("unmatched bracket");
					// Header is "<decimal length><sigil>"; `strip_suffix` keeps this safe even
					// if the byte before `]` belongs to a multi-byte character.
					let header = &suffix[1..close_header];
					let (len_digits, is_collapsed) =
						if let Some(digits) = header.strip_suffix('c') {
							(digits, true)
						} else if let Some(digits) = header.strip_suffix('s') {
							(digits, false)
						} else {
							let sigil = header
								.chars()
								.last()
								.map(|c| c.to_string())
								.unwrap_or_default();
							panic!("bad sigil: {:?}", sigil);
						};
					let len: usize = len_digits.parse().expect("bad linkref len");
					let payload_start = close_header + 1;
					let raw_payload = payload_start
						.checked_add(len)
						.and_then(|payload_end| suffix.get(payload_start..payload_end))
						.expect("linkref payload out of bounds");
					let payload = unescape_arrobas(raw_payload);
					match (result.ends_with(&payload), is_collapsed) {
						// The rendered link text changed, so the original label must be
						// spelled out.
						(false, _) => result.push_str(&payload),
						(true, true) => result.push_str("[]"),
						(true, false) => (),
					};
					payload_start + len
				}
				other => panic!("bad discriminant: {:?}", other),
			};
			read_up_to = i + 1 + consumed;
		}
		result.push_str(&input[read_up_to..]);
		result
	}

	/// Checks that `apply_splices` performs replacements and a zero-length insertion in order,
	/// leaving the text between splices intact.
	#[cfg(test)]
	#[test]
	fn test_apply_splices() {
		let input = "See @alice's post and [@bartholomew's reply] to @cheryl.";
		let link_text = "[@bartholomew's reply]";
		let linkref_token = format!("@[{}s]{}", link_text.len(), link_text);

		// Splice replacing the first occurrence of `needle` in `input` with `new`.
		let replace = |needle: &str, new: &str| Splice {
			index: input.find(needle).unwrap(),
			len: needle.len(),
			new: String::from(new),
		};
		let splices = vec![
			replace("@alice", "@{user:12345}"),
			replace("@bartholomew", "@{user:7}"),
			// Pure insertion immediately after the link's closing bracket.
			Splice {
				index: input.find("]").unwrap() + 1,
				len: 0,
				new: linkref_token.clone(),
			},
			replace("@cheryl", "@{user:67890}"),
		];

		let expected = format!(
			"See @{{user:12345}}'s post and [@{{user:7}}'s reply]{} to @{{user:67890}}.",
			linkref_token
		);
		let output = apply_splices(input, &splices);
		assert_eq!(output, expected);
	}

	/// Demo driver: prints a sample Markdown document, the splices computed for its escaped
	/// form, the resulting Crosswalk encoding, and finally the decoded Markdown, separated by
	/// `---` rules.
	fn main() {
		// The sample document's lines must start at column 0: a leading tab would become part
		// of the string and change how the Markdown parses (e.g. the link reference definitions
		// at the bottom would no longer be recognized as definitions).
		let raw_input = "\
See @alice's [post] and [@bartholomew and @cheryl's reply] for details @ 6PM.

```text
yo we got @alice and @bartholomew in the code block, too
```

See EXAMPLE@EXAMPLE by @bartholomew for more, but don't touch `EXAMPLE @alice @bartholomew`.

[@bartholomew and @cheryl's reply]: https://example.com/1
[post]: https://example.com/2
";

		print!("{}", raw_input);
		// Escape first so that the parser and the splice offsets both see the escaped text.
		let input = escape_arrobas(raw_input);
		println!("\n---\n");
		let splices = parse_splices(&input);
		for splice in &splices {
			println!("{:?}", splice);
		}
		println!("\n---\n");
		let encoded = apply_splices(&input, &splices);
		print!("{}", encoded);
		println!("\n---\n");
		let decoded = decode(&encoded);
		print!("{}", decoded);
	}