Skip to content

Instantly share code, notes, and snippets.

@benaryorg
Last active October 5, 2023 18:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save benaryorg/9d4f2aec58c35a06adf3d61bfd9eec7a to your computer and use it in GitHub Desktop.
Save benaryorg/9d4f2aec58c35a06adf3d61bfd9eec7a to your computer and use it in GitHub Desktop.
Markdown parsing (link and parenthesis)
// see fediverse thread: https://astolfo.social/notes/9kgqfhd4f9gqlh39
//
// Written by @benaryorg with the idea of enforcing balanced parens in links, following this guideline:
//
// I think the easiest, best, and most incorrect solution would be to enforce balanced parens in the links.
use ::
{
nom::
{
IResult,
Finish,
bytes::complete::
{
tag,
},
character::complete::
{
anychar,
char,
one_of,
none_of,
alphanumeric1,
},
combinator::
{
complete,
map,
recognize,
opt,
},
multi::
{
many0,
many1,
},
branch::
{
alt,
},
sequence::
{
delimited,
},
},
std::
{
io::stdin,
},
};
/// A single element produced by the markdown parser.
#[derive(Debug, Hash, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum Markdown
{
    /// One literal character of plain text (also used for escaped characters).
    Char(char),
    /// An inline link of the form `[text](url)`.
    Link
    {
        /// The text between the square brackets.
        text: String,
        /// The link target between the parentheses.
        url: String,
    },
}
fn markdown_link_ref_balanced_parens(input: &str) -> IResult<&str, ()>
{
// does not support:
// unicode
let (input, _) = many1(alt(
( alphanumeric1
, recognize(one_of("-._~:/?#[]@!$&'*+,;%= "))
, recognize(delimited(char('('), opt(markdown_link_ref_balanced_parens), char(')')))
)))(input)?;
Ok((input, ()))
}
fn markdown_link_ref(input: &str) -> IResult<&str, ()>
{
// schema
let (input, _) = alt((tag("https://"), tag("http://")))(input)?;
// domain
// does not support IDN
// does not validate for multiple of: '@', ':', etc.
let (input, _) = many1(alt((alphanumeric1, recognize(one_of("-.:@")))))(input)?;
// path
let (input, _) = opt(markdown_link_ref_balanced_parens)(input)?;
Ok((input, ()))
}
fn markdown_link(input: &str) -> IResult<&str, Markdown>
{
let (input, _) = char('[')(input)?;
let (input, text) = recognize(many1(none_of("\\]")))(input)?;
let (input, _) = char(']')(input)?;
let (input, _) = char('(')(input)?;
let (input, url) = recognize(markdown_link_ref)(input)?;
let (input, _) = char(')')(input)?;
Ok((input, Markdown::Link
{
text: text.into(),
url: url.into(),
}))
}
fn markdown_escape(input: &str) -> IResult<&str, Markdown>
{
let (input, _) = char('\\')(input)?;
map(anychar, Markdown::Char)(input)
}
/// Parses exactly one markdown element, trying in order: escape, link, raw character.
fn markdown_element(input: &str) -> IResult<&str, Markdown>
{
    // Always fall back to reading raw chars; this way the parser cannot
    // hard-fail on non-empty input, it will just produce plaintext.
    fn raw_char(input: &str) -> IResult<&str, Markdown>
    {
        map(anychar, Markdown::Char)(input)
    }

    alt((markdown_escape, markdown_link, raw_char))(input)
}
/// Parses `input` into a sequence of markdown elements by applying
/// `markdown_element` zero or more times.
fn markdown(input: &str) -> IResult<&str, Vec<Markdown>>
{
    let mut elements = many0(markdown_element);
    elements(input)
}
/// Reads stdin line by line, parses each line as markdown, and prints the
/// parsed elements in debug form to stdout (parse errors go to stderr).
///
/// Reading stops at end of input or at the first read error. A read error
/// (for example input that is not valid UTF-8) is reported on stderr instead
/// of silently ending the loop.
fn main()
{
    for line in stdin().lines()
    {
        let line = match line
        {
            Ok(line) => line,
            Err(err) =>
            {
                eprintln!("failed to read line from stdin: {}", err);
                break;
            }
        };

        match complete(markdown)(line.as_str()).finish()
        {
            Ok((_, out)) => println!("{:?}", out),
            Err(err) => eprintln!("{:?}", err),
        }
    }
}
#[cfg(test)]
mod test
{
    use crate::{markdown, Markdown};

    /// Expands plain text into one `Markdown::Char` per character.
    fn plain(text: &str) -> Vec<Markdown>
    {
        text.chars().map(Markdown::Char).collect()
    }

    /// Builds the expected `Markdown::Link` element as a one-element list.
    fn link(text: &str, url: &str) -> Vec<Markdown>
    {
        vec![Markdown::Link { text: text.into(), url: url.into() }]
    }

    /// Asserts that `input` is consumed completely and yields the concatenation of `parts`.
    fn assert_parses(input: &str, parts: &[Vec<Markdown>])
    {
        assert_eq!(markdown(input), Ok(("", parts.concat())));
    }

    #[test]
    fn parse_markdown()
    {
        // no parens in link with parens around text
        assert_parses(
            "foo (see: [something](https://en.wikipedia.org/wiki/Text)) bar",
            &[
                // opening text, parens, and more text
                plain("foo (see: "),
                // link (no parens)
                link("something", "https://en.wikipedia.org/wiki/Text"),
                // closing parens, and following text
                plain(") bar"),
            ],
        );

        // balanced parens in link with parens around text
        assert_parses(
            "foo (see: [something](https://en.wikipedia.org/wiki/Text_(literary_theory))) bar",
            &[
                // opening text, parens, and more text
                plain("foo (see: "),
                // link (balanced parens)
                link("something", "https://en.wikipedia.org/wiki/Text_(literary_theory)"),
                // closing parens, and following text
                plain(") bar"),
            ],
        );

        // trailing parens, balanced and recognized
        assert_parses(
            "foo (see: [something](https://en.wikipedia.org/wiki/Text()) bar",
            &[
                // opening text, parens, and more text
                plain("foo (see: "),
                // link (balanced parens)
                link("something", "https://en.wikipedia.org/wiki/Text()"),
                // only the following text: the last `)` closed the link itself
                plain(" bar"),
            ],
        );

        // trailing parens, unbalanced and ignored
        assert_parses(
            "[something](https://en.wikipedia.org/wiki/Halting_problem))",
            &[
                // link (balanced parens, meaning no parens)
                link("something", "https://en.wikipedia.org/wiki/Halting_problem"),
                // closing parens which doesn't match the link
                plain(")"),
            ],
        );

        // spaces in link
        assert_parses(
            "[something](https://en.wikipedia.org/wiki/Among Us)",
            &[
                // link (balanced parens, meaning no parens), does include space
                link("something", "https://en.wikipedia.org/wiki/Among Us"),
            ],
        );

        // spaces and unbalanced parens
        assert_parses(
            "[something](https://en.wikipedia.org/wiki/Amogus) Us)",
            &[
                // link (balanced parens, meaning no parens), does not include space
                link("something", "https://en.wikipedia.org/wiki/Amogus"),
                // trailing " Us)" that doesn't belong to the link
                plain(" Us)"),
            ],
        );

        // parens and spaces
        assert_parses(
            "[something](https://en.wikipedia.org/wiki/Amogus (meme)) Us)",
            &[
                // link (balanced parens, meaning it contains the "(meme)"), does include space
                link("something", "https://en.wikipedia.org/wiki/Amogus (meme)"),
                // trailing " Us)" that doesn't belong to the link
                plain(" Us)"),
            ],
        );

        // the failing `(` Wikipedia article
        assert_parses(
            "[something](https://en.wikipedia.org/wiki/()",
            &[
                // turns into plaintext: the link name first, the link ref next
                plain("[something](https://en.wikipedia.org/wiki/()"),
            ],
        );
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment