Skip to content

Instantly share code, notes, and snippets.

@pcranaway
Created August 26, 2023 03:16
Show Gist options
  • Save pcranaway/0451965380fcbac1e21772ba816c868b to your computer and use it in GitHub Desktop.
Save pcranaway/0451965380fcbac1e21772ba816c868b to your computer and use it in GitHub Desktop.
/// Parses an entire Treebank given a list of strings (the file's lines).
///
/// NOTE: this is currently a stub. `lines` is accepted but never read, and the function
/// returns nothing; the intended implementation only exists as the sketches in the body.
pub fn parse(lines: Vec<String>) {
    // TODO: classify every line with `line::parse_line` and hand the results back to the
    // caller. Earlier sketches of that loop, kept for reference:
    //
    // for line in lines {
    // let line = parse_line(line);
    // }
    //
    // NOTE(review): the variant below would not type-check as written, because `iter()`
    // yields `&String` while `line::parse_line` takes an owned `String`; `into_iter()`
    // would be needed instead.
    // lines.iter().map(line::parse_line)
}
/// Parsers for the three different kinds of lines.
pub mod line {
    /// The classification of a single input line, as produced by [`parse_line`].
    #[derive(Debug, Clone, PartialEq, Eq)]
    pub enum Line {
        /// A line starting with `#`. Carries the whole line, leading `#` included.
        Comment(String),
        /// An empty or whitespace-only line, separating sentences from each other.
        Boundary,
        /// A line containing at least one tab character. Carries the whole, unsplit line.
        Word(String),
    }

    /// Detects what kind of line a given line is (see [`Line`]).
    ///
    /// The checks run in a fixed order (comment, boundary, word) and the first match wins.
    /// The order is significant: a `#` line that also contains tabs is still a comment, and a
    /// whitespace-only line made of tabs is a boundary rather than a word.
    ///
    /// Returns `None` when the line matches none of the three kinds.
    pub fn parse_line(input: String) -> Option<Line> {
        if is_comment_line(&input) {
            // TODO: get rid of `#` and actually parse the contents of the comment (don't
            // confuse them with regular programming comments -- they apparently have actual
            // meaning.)
            Some(Line::Comment(input))
        } else if is_boundary_line(&input) {
            Some(Line::Boundary)
        } else if is_word_line(&input) {
            Some(Line::Word(input))
        } else {
            None
        }
    }

    /// Checks if given input is a comment line, i.e. its very first character is `#`.
    ///
    /// Leading whitespace is not skipped, so `" # a"` is not a comment line.
    pub fn is_comment_line(input: &str) -> bool {
        input.starts_with('#')
    }

    /// Checks if given input is a boundary line, meaning that it separates sentences from
    /// each other. Those lines are empty or consist of whitespace only.
    pub fn is_boundary_line(input: &str) -> bool {
        input.trim().is_empty()
    }

    /// Checks if given input is a word line. This is a little hard to check, so we just check
    /// if it contains at least one tab character.
    pub fn is_word_line(input: &str) -> bool {
        input.contains('\t')
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        // The field separators are written as explicit `\t` escapes so the fixture cannot
        // silently lose its tabs to an editor, formatter or copy/paste.
        const TEST_WORD_LINE: &str = "2\tMijn\tmijn\tPRON\t\
            VNW|bez|det|stan|vol|1|ev|prenom|zonder|agr\t\
            Person=1|Poss=Yes|PronType=Prs\t3\tnmod:poss\t3:nmod:poss\t_";

        #[test]
        fn fixture_is_tab_separated() {
            assert_eq!(TEST_WORD_LINE.split('\t').count(), 10);
        }

        #[test]
        fn test_comment() {
            assert!(is_comment_line("#a"));
            assert!(is_comment_line("# aaaaaaaaaaaaaaaaaaaa"));
            assert!(!is_comment_line("a"));
            assert!(!is_comment_line(" # a"));
            assert!(!is_comment_line(""));
        }

        #[test]
        fn test_boundary() {
            assert!(is_boundary_line(""));
            assert!(is_boundary_line("\n        "));
            assert!(is_boundary_line(" \t "));
            assert!(!is_boundary_line("a"));
        }

        #[test]
        fn test_word_line() {
            assert!(is_word_line(TEST_WORD_LINE));
            assert!(!is_word_line("# a"));
            assert!(!is_word_line("a"));
        }

        #[test]
        fn parse_line_optional() {
            assert!(parse_line("# a".to_string()).is_some());
            assert!(parse_line("".to_string()).is_some());
            assert!(parse_line(TEST_WORD_LINE.to_string()).is_some());
            assert!(parse_line("a".to_string()).is_none());
        }

        #[test]
        fn detect_comment() {
            // TODO: fix this test as well
            assert_eq!(
                parse_line("# a".to_string()).unwrap(),
                Line::Comment("# a".to_string())
            );
        }

        #[test]
        fn detect_boundary_and_word() {
            assert_eq!(parse_line(String::new()), Some(Line::Boundary));
            assert_eq!(
                parse_line(TEST_WORD_LINE.to_string()),
                Some(Line::Word(TEST_WORD_LINE.to_string()))
            );
        }

        #[test]
        fn check_order_decides_ambiguous_lines() {
            // Contains a tab, but the comment check runs first.
            assert_eq!(
                parse_line("#\ttext".to_string()),
                Some(Line::Comment("#\ttext".to_string()))
            );
            // Contains a tab, but is whitespace-only, so the boundary check wins.
            assert_eq!(parse_line(" \t ".to_string()), Some(Line::Boundary));
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment