Skip to content

Instantly share code, notes, and snippets.

@ironhouzi
Last active September 15, 2018 11:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ironhouzi/6b539acfdee91d74a01b31b03481fcca to your computer and use it in GitHub Desktop.
Save ironhouzi/6b539acfdee91d74a01b31b03481fcca to your computer and use it in GitHub Desktop.
pub mod table;
use std::collections::HashMap;
use std::char;
// Unit tests for the tokenizer (`tokenize`), the root-stack classifier
// (`get_root`) and the Sanskrit heuristic (`maybe_sanskrit`).
#[cfg(test)]
mod tests {
use super::maybe_sanskrit;
use super::tokenize;
use super::get_root;
use super::default_parse;
// use super::to_unicode;
// use super::vowel_indices;
// use super::analyze_root;
// use super::Letter;
use super::Slice;
use super::LetterType;
use super::Word;
use table;
// #[test]
// fn test_root_analyzer() {
// let parts = tokenize("a", &table::W_SORTED_ALPHABET);
// assert_eq!(analyze_root("a", parts, 0), vec![Syllable::Root])
// }
// Pins the letter roles `get_root` assigns to the letters up to and
// including the first vowel of each transliterated syllable.
#[test]
fn test_get_root() {
// A bare vowel is reported as the root on its own (no trailing `Vowel`).
assert_eq!(
get_root(&default_parse("a")),
vec![&LetterType::Root]
);
assert_eq!(
get_root(&default_parse("ba")),
vec![&LetterType::Root, &LetterType::Vowel]
);
assert_eq!(
get_root(&default_parse("bya")),
vec![&LetterType::Root, &LetterType::Subjoined, &LetterType::Vowel]
);
assert_eq!(
get_root(&default_parse("rja")),
vec![&LetterType::Super, &LetterType::Root, &LetterType::Vowel]
);
// "g." is tokenized as a single letter and is expected to be a prefix here.
assert_eq!(
get_root(&default_parse("g.ya")),
vec![&LetterType::Prefix, &LetterType::Root, &LetterType::Vowel]
);
assert_eq!(
get_root(&default_parse("dba")),
vec![&LetterType::Prefix, &LetterType::Root, &LetterType::Vowel]
);
// "r" followed by "w" before the vowel yields two subjoined letters.
assert_eq!(
get_root(&default_parse("srwa")),
vec![&LetterType::Root, &LetterType::Subjoined,
&LetterType::Subjoined2, &LetterType::Vowel]
);
assert_eq!(
get_root(&default_parse("bsnga")),
vec![&LetterType::Prefix, &LetterType::Super,
&LetterType::Root, &LetterType::Vowel]
);
assert_eq!(
get_root(&default_parse("dbya")),
vec![&LetterType::Prefix, &LetterType::Root,
&LetterType::Subjoined, &LetterType::Vowel]
);
assert_eq!(
get_root(&default_parse("skya")),
vec![&LetterType::Super, &LetterType::Root,
&LetterType::Subjoined, &LetterType::Vowel]
);
// Four letters before the vowel: prefix, superscript, root, subscript.
assert_eq!(
get_root(&default_parse("bskya")),
vec![&LetterType::Prefix, &LetterType::Super, &LetterType::Root,
&LetterType::Subjoined, &LetterType::Vowel]
);
}
// `Word::letter` must return each token's text; "ng" is one two-byte letter
// and the first vowel is recorded at letter index 1.
#[test]
fn test_letter_slice() {
let s = "sangs";
let w = tokenize(&s, &table::W_SORTED_ALPHABET);
assert_eq!(w.vowels[0], 1);
assert_eq!(w.letter(0), "s");
assert_eq!(w.letter(1), "a");
assert_eq!(w.letter(2), "ng");
assert_eq!(w.letter(3), "s");
}
// Checks the exact `Word` produced by `tokenize`: each `Slice` is a start
// offset and a length into the input string, and `vowels` holds letter indices.
#[test]
fn test_letter_partition() {
let mut w = "sangs";
assert_eq!(
tokenize(w, &table::W_SORTED_ALPHABET),
Word {
string: w,
vowels: vec![1],
letters: vec![
Slice{i: 0, len: 1},
Slice{i: 1, len: 1},
Slice{i: 2, len: 2},
Slice{i: 4, len: 1}
],
index: 0
}
);
// Leading apostrophe followed by the three-byte letter "tsh".
w = "'tshags";
assert_eq!(
tokenize(w, &table::W_SORTED_ALPHABET),
// vec!["'", "tsh", "a", "g", "s"]);
Word {
string: w,
vowels: vec![2],
letters: vec![
Slice{i: 0, len: 1},
Slice{i: 1, len: 3},
Slice{i: 4, len: 1},
Slice{i: 5, len: 1},
Slice{i: 6, len: 1}
],
index: 0
}
);
// The "g." edge case is consumed as a single letter of length 2.
w = "g.yag";
assert_eq!(
tokenize(w, &table::W_SORTED_ALPHABET),
Word {
string: w,
vowels: vec![2],
letters: vec![
Slice{i: 0, len: 2},
Slice{i: 2, len: 1},
Slice{i: 3, len: 1},
Slice{i: 4, len: 1}
],
index: 0
}
);
}
// Every sample below must be flagged by the `maybe_sanskrit` heuristic.
#[test]
fn test_quickcheck() {
let sanskrit: [&'static str; 6] = ["sarva", "ai", "au", "akṣye", "vajra", "kyai"];
for s in &sanskrit {
assert!(maybe_sanskrit(s));
}
}
// Smoke test without assertions: iterates a `Word` through its `Iterator`
// impl and prints every letter (visible with `--nocapture`).
#[test]
fn test_foo() {
let w = tokenize("g.yag", &table::W_SORTED_ALPHABET);
for l in w {
print!("`{}`, ", l);
}
println!();
}
// #[test]
// fn test_unicode() {
// // assert_eq!(to_unicode("e", LetterType::Root), Some('a'));
// let s = to_unicode("s", LetterType::Root);
// assert_eq!(s, Some('a'));
// }
}
// #[derive(Copy, Clone, Debug, PartialEq)]
/// Role assigned to a letter of a syllable by `get_root`.
#[derive(Debug, PartialEq)]
enum LetterType {
/// The vowel that closes the classified part of the syllable; `get_root`
/// appends it after the letters that precede the first vowel.
Vowel,
/// Leading letter; `get_root` checks it against `table::PREFIXES`.
Prefix,
/// Letter written before the root; `get_root` checks it against
/// `table::SUPERJOINED`.
Super,
/// The root letter of the syllable.
Root,
/// Letter following the root; `get_root` checks it against
/// `table::SUBJOINED`.
Subjoined,
/// Second letter following the root; only produced by `get_root` for a
/// "w" that follows an "r".
Subjoined2,
// Suffix,
// Suffix2,
// Genitive,
// GenVowel
}
/// Location of one letter inside a word's string; the letter's text is
/// `string[i..i + len]`.
#[derive(Debug, PartialEq)]
struct Slice {
/// Start offset of the letter within the string (a `str` byte index).
i: usize,
/// Length of the letter in bytes.
len: usize,
}
/// A letter's location paired with its role.
///
/// NOTE(review): only referenced from commented-out code in the visible part
/// of this file.
#[derive(Debug, PartialEq)]
struct Letter {
/// Where the letter sits in the source string.
slice: Slice,
/// The role assigned to the letter.
category: LetterType
}
/// NOTE(review): not used by any code visible in this file; the field meanings
/// below are assumptions to be confirmed.
#[derive(Debug, PartialEq)]
struct WordInfo {
/// Presumably indices of the letters that form the root stack (TODO confirm).
root: Vec<usize>,
/// Letter locations, in the same representation `Word::letters` uses.
letters: Vec<Slice>
}
/// A tokenized word: the source string plus the location of every letter
/// found in it. Produced by `tokenize`.
#[derive(Debug, PartialEq)]
struct Word<'a> {
/// The text that was tokenized.
string: &'a str,
/// Indices into `letters` of the letters that `tokenize` recognised as vowels.
vowels: Vec<usize>,
/// Location of each letter, in order of appearance.
letters: Vec<Slice>,
/// Cursor used by the `Iterator` impl: index of the next letter to yield.
index: usize
}
/// A word's letters together with the roles assigned to them. Built by
/// `create_parsed_word`.
#[derive(Debug, PartialEq)]
struct ParsedWord<'a> {
/// Text of each letter, in order.
letters: Vec<&'a str>,
/// Role of each classified letter, in order (the output of `get_root`).
structure: Vec<&'a LetterType>,
/// Cursor used by the `Iterator` impl: index of the next pair to yield.
index: usize
}
impl<'a> Word<'a> {
pub fn letter(&'a self, index: usize) -> &'a str {
w_letter(self.string, &self.letters[index])
}
// pub fn to_unicode(self) -> String {
// // self.letters.map(|l| l.to_unicode()).collect();
// "".to_string()
// }
}
/// Yields `(letter, role)` pairs in order.
///
/// Iteration ends as soon as either `letters` or `structure` is exhausted.
/// `structure` only covers the letters up to the first vowel, so it is
/// routinely shorter than `letters`; indexing it unconditionally would panic
/// on any word that has letters after the vowel.
impl<'a> Iterator for ParsedWord<'a> {
    type Item = (&'a str, &'a LetterType);
    fn next(&mut self) -> Option<Self::Item> {
        let pair = match (self.letters.get(self.index), self.structure.get(self.index)) {
            (Some(&letter), Some(&category)) => (letter, category),
            // One of the two vectors has run out: behave like `zip`.
            _ => return None,
        };
        self.index += 1;
        Some(pair)
    }
}
/// Walks the letters of the word in order, yielding each letter's text.
/// `index` is the cursor; once it reaches `letters.len()` the iterator is done.
impl<'a> Iterator for Word<'a> {
    type Item = &'a str;
    fn next(&mut self) -> Option<Self::Item> {
        let text: &'a str = self.string;
        let piece = match self.letters.get(self.index) {
            Some(span) => &text[span.i..span.i + span.len],
            None => return None,
        };
        self.index += 1;
        Some(piece)
    }
}
// impl<'a> Iterator for Word<'a> {
// type Item = &'a str;
// fn next(&mut self) -> Option<Self::Item> {
// if self.index >= self.letters.len() {
// return None
// }
// let result = self.letter(self.index);
// self.index += 1;
// Some(result)
// }
// }
// impl<'a> Iterator for ParsedWord<'a> {
// type Item = (&'a str, LetterType);
// fn next(&mut self) -> Option<(&'a str, LetterType)> {
// Some(("s", LetterType::Root))
// // word.word.letters.iter().zip(word.structure)
// }
// }
// // TODO: conjoin neighbouring vowels to count as one vowel..
// fn vowel_indices(string: &str, vowels: &[char]) -> Vec<usize> {
// let indices: Vec<usize> = string.chars()
// .enumerate()
// .filter(|&(_, c)| vowels.contains(&c))
// .map(|(i, _)| i)
// .collect();
// indices
// }
// fn letter(string: &str, slice: (usize, usize)) -> &str {
// &string[slice.0..slice.0+slice.1]
// }
// fn analyze_root<'a>(string: &str, parts: &'a Vec<Letter>) -> Vec<Letter> {
// }
// fn get_root(string: &str, vowel_indices: Vec<usize>, slices: Vec<Slice>) -> Vec<LetterType> {
/// Classifies the letters of `word` up to and including its first vowel.
///
/// The position of the first vowel (0 to 4 letters in) decides which
/// combinations of prefix, superscript, root and subscript are tried. The
/// result lists one `LetterType` per classified letter and ends with
/// `LetterType::Vowel`, except that a word starting with a vowel yields
/// `[Root]` only. When no combination matches, only the trailing `Vowel` is
/// returned; a word without any vowel yields an empty vector.
///
/// The returned references are `'static` (they point at promoted constants),
/// so the result does not keep `word` borrowed. Callers such as
/// `create_parsed_word` can therefore consume the word afterwards; tying the
/// result to the borrow of `word` is what produced error E0597 there.
fn get_root(word: &Word) -> Vec<&'static LetterType> {
    let mut result: Vec<&'static LetterType> = Vec::new();
    // No vowel at all: nothing to classify (indexing `vowels[0]` would panic).
    let first_vowel = match word.vowels.first() {
        Some(&position) => position,
        None => return result,
    };
    match first_vowel {
        0 => {
            // The vowel itself is the root; no separate `Vowel` entry.
            result.push(&LetterType::Root);
            return result;
        }
        1 => {
            if table::W_CONSONANTS.contains(&word.letter(0)) {
                result.push(&LetterType::Root);
            } // TODO: raise error on else
        }
        2 => {
            if is_subscribed(word) {
                result.extend_from_slice(&[&LetterType::Root, &LetterType::Subjoined]);
            } else if is_superscribed(word) {
                result.extend_from_slice(&[&LetterType::Super, &LetterType::Root]);
            } else if table::PREFIXES.contains(&word.letter(0))
                && table::W_CONSONANTS.contains(&word.letter(1))
            {
                result.extend_from_slice(&[&LetterType::Prefix, &LetterType::Root]);
            }
        }
        3 => {
            if word.letter(2) == "w" && word.letter(1) == "r" {
                // Root with two stacked subscripts ("r" then "w").
                result.extend_from_slice(&[
                    &LetterType::Root,
                    &LetterType::Subjoined,
                    &LetterType::Subjoined2,
                ]);
            } else if is_superscribed(word) {
                result.extend_from_slice(&[
                    &LetterType::Prefix,
                    &LetterType::Super,
                    &LetterType::Root,
                ]);
            } else if is_subscribed(word) {
                result.extend_from_slice(&[
                    &LetterType::Prefix,
                    &LetterType::Root,
                    &LetterType::Subjoined,
                ]);
            } else if table::SUPERJOINED.contains(&word.letter(0))
                && table::W_CONSONANTS.contains(&word.letter(1))
                && table::SUBJOINED.contains(&word.letter(2))
            {
                result.extend_from_slice(&[
                    &LetterType::Super,
                    &LetterType::Root,
                    &LetterType::Subjoined,
                ]);
            }
        }
        4 => {
            let well_formed = table::PREFIXES.contains(&word.letter(0))
                && table::SUPERJOINED.contains(&word.letter(1))
                && table::W_CONSONANTS.contains(&word.letter(2))
                && table::SUBJOINED.contains(&word.letter(3));
            if !well_formed {
                // TODO raise error!
            }
            // Four letters before the vowel admit only one layout, so it is
            // reported even when the validation above fails.
            result.extend_from_slice(&[
                &LetterType::Prefix,
                &LetterType::Super,
                &LetterType::Root,
                &LetterType::Subjoined,
            ]);
        }
        // More than four letters before the first vowel: nothing classified.
        _ => {}
    }
    result.push(&LetterType::Vowel);
    result
}
/// Builds a lookup from each transliteration letter to its counterpart string.
///
/// Keys are the entries of `table::W_CONSONANTS` followed by
/// `table::W_VOWELSS`; values are the entries at the same positions in
/// `table::U_CONSONANTS` followed by `table::U_VOWELS`. Pairing stops at the
/// shorter of the two sequences, and a repeated key keeps its last value.
fn generate_lookup<'a, 'b>() -> HashMap<&'b&'a str, &'b&'a str> {
    table::W_CONSONANTS
        .iter()
        .chain(table::W_VOWELSS.iter())
        .zip(table::U_CONSONANTS.iter().chain(table::U_VOWELS.iter()))
        .collect()
}
/// Builds a lookup from each transliteration letter to a `u32` value.
///
/// Keys are the entries of `table::W_CONSONANTS` followed by
/// `table::W_VOWELSS`; values are the entries at the same positions in
/// `table::U_CONSONANTSI` followed by `table::U_VOWELSI` (presumably Unicode
/// code points, see `subjoin_unicode`; TODO confirm). Pairing stops at the
/// shorter of the two sequences, and a repeated key keeps its last value.
fn foo<'a, 'b>() -> HashMap<&'b &'a str, &'b u32> {
    table::W_CONSONANTS
        .iter()
        .chain(table::W_VOWELSS.iter())
        .zip(table::U_CONSONANTSI.iter().chain(table::U_VOWELSI.iter()))
        .collect()
}
/// Tokenizes `string` with the default alphabet and pairs its letters with
/// the roles reported by `get_root`.
fn create_parsed_word<'a>(string: &'a str) -> ParsedWord<'a> {
    let word = default_parse(string);
    // Classify before collecting: `collect` consumes `word`.
    let structure = get_root(&word);
    ParsedWord {
        letters: word.collect(),
        structure,
        index: 0,
    }
}
/// Shifts a code point by `0x50` and converts the result to a `char`.
///
/// In the Unicode Tibetan block the subjoined form of a consonant sits `0x50`
/// above its base form (e.g. U+0F40 -> U+0F90).
///
/// Returns `None` when `codepoint` is `None`, when adding the offset would
/// overflow `u32`, or when the shifted value is not a valid Unicode scalar
/// value.
fn subjoin_unicode<'a, 'b>(codepoint: Option<&'a &'b u32>) -> Option<char> {
    codepoint
        // `checked_add` instead of `+`: a plain add panics in debug builds and
        // silently wraps in release builds for values near `u32::MAX`.
        .and_then(|cp| (**cp).checked_add(0x50))
        .and_then(char::from_u32)
}
// fn to_unicode<'a>(word: ParsedWord) -> Option<String> {
// // let lookup = generate_lookup();
// let lookup = foo();
// let mut result: Vec<char> = Vec::new();
// for (letter, letter_type) in word.word.letters.iter().zip(word.structure) {
// if letter == "a" && letter_type == LetterType::Root {
// continue
// }
// if table::W_VOWELSS.contains(letter) && letter_type == LetterType::Root {
// match lookup.get(letter) {
// Some(l) => result.push(l),
// None => return None
// }
// }
// }
// Some("a".to_string())
// }
/// Returns the part of `string` described by `slice`
/// (`string[slice.i..slice.i + slice.len]`).
///
/// The result borrows only from `string`; the `slice` reference carries its
/// own, unrelated lifetime so a short-lived `Slice` does not shorten the
/// lifetime of the returned text.
///
/// # Panics
///
/// Panics if the range is out of bounds or does not fall on character
/// boundaries of `string`.
fn w_letter<'a>(string: &'a str, slice: &Slice) -> &'a str {
    &string[slice.i..slice.i + slice.len]
}
/// Tokenizes `string` using the alphabet in `table::W_SORTED_ALPHABET`.
fn default_parse<'a>(string: &'a str) -> Word<'a> {
    let alphabet = &table::W_SORTED_ALPHABET;
    tokenize(string, alphabet)
}
/// Splits `string` into letters by repeatedly matching the entries of
/// `alphabet` against the start of the remaining text.
///
/// The first alphabet entry that is a prefix of the remaining text wins, so
/// the alphabet must list longer letters before their own prefixes (e.g.
/// "ng" before "n"). The sequence "g." is always consumed as one two-byte
/// letter. Each letter is recorded as a `Slice` (start offset and length)
/// and the indices of vowel letters are collected in `Word::vowels`.
///
/// Tokenization stops at the first position where nothing matches; the rest
/// of the string is dropped without an error. This also guarantees
/// termination for an empty alphabet and for empty alphabet entries, which
/// would otherwise never advance.
///
/// NOTE(review): a letter counts as a vowel when its alphabet index lies in
/// the last `table::TIBETAN_VOWELS.len()` positions of
/// `table::W_SORTED_ALPHABET`. The threshold comes from that table, not from
/// the `alphabet` argument, so it is only meaningful when the two agree.
fn tokenize<'a>(string: &'a str, alphabet: &[&'static str]) -> Word<'a> {
    let mut letters: Vec<Slice> = Vec::new();
    let mut vowels: Vec<usize> = Vec::new();
    // Alphabet index from which on entries are vowels.
    let first_vowel = table::W_SORTED_ALPHABET.len() - table::TIBETAN_VOWELS.len();
    let mut progress = 0;
    while progress < string.len() {
        let rest = &string[progress..];
        // (alphabet index, byte length) of the letter at the current position.
        let matched = if rest.starts_with("g.") {
            // Edge case: "g." is reported with alphabet index 0 and length 2.
            Some((0, 2))
        } else {
            alphabet
                .iter()
                .enumerate()
                .find(|&(_, letter)| !letter.is_empty() && rest.starts_with(*letter))
                .map(|(i, letter)| (i, letter.len()))
        };
        let (alphabet_index, letter_length) = match matched {
            Some(hit) => hit,
            // TODO: raise exception invalid tibetan character!
            None => break,
        };
        letters.push(Slice { i: progress, len: letter_length });
        if alphabet_index >= first_vowel {
            vowels.push(letters.len() - 1);
        }
        progress += letter_length;
    }
    Word { string: string, vowels: vowels, letters: letters, index: 0 }
}
/// Heuristically decides whether `string` looks like transliterated Sanskrit.
///
/// Returns `true` when any of the following holds:
/// * the string is exactly three bytes long and its first two bytes are an
///   entry of `table::S_DOUBLE_CONSONANTS`;
/// * it starts with an entry of `table::S_BASIC_RULES`;
/// * it contains "ai" or "au";
/// * it contains more than one occurrence of the `table::TIBETAN_VOWELS`
///   and does not contain `table::W_CONSONANTS[22]` (achung).
fn maybe_sanskrit(string: &str) -> bool {
    // `str::get` yields `None` when byte 2 falls inside a multi-byte
    // character (e.g. "iā"); slicing with `[0..2]` would panic there.
    let starts_with_double_consonant = string.len() == 3
        && string
            .get(0..2)
            .map_or(false, |head| table::S_DOUBLE_CONSONANTS.contains(&head));
    if starts_with_double_consonant {
        return true;
    }
    if table::S_BASIC_RULES.iter().any(|rule| string.starts_with(rule)) {
        return true;
    }
    if string.contains("ai") || string.contains("au") {
        return true;
    }
    // Total number of (non-overlapping) vowel occurrences in the string.
    let vowel_count: usize = table::TIBETAN_VOWELS
        .iter()
        .map(|v| string.matches(&v.to_string()).count())
        .sum();
    // achung
    !string.contains(table::W_CONSONANTS[22]) && vowel_count > 1
}
// fn is_subscribed(string: &str, vowel_index: usize, slices: &Vec<Slice>) -> bool {
/// Reports whether the letter pair just before the first vowel forms a
/// root + subscript stack (and not a superscript + root stack).
///
/// With the first vowel at letter index 2 the pair is letters 0 and 1.
/// Otherwise (expected: index 3) letter 0 must be one of `table::PREFIXES`
/// and the pair is letters 1 and 2.
fn is_subscribed(word: &Word) -> bool {
    let root_at = if word.vowels[0] == 2 {
        0
    } else {
        // vowel_index == 3
        if !table::PREFIXES.contains(&word.letter(0)) {
            return false;
        }
        1
    };
    let (root, below) = (word.letter(root_at), word.letter(root_at + 1));
    !valid_superscribe(root, below) && valid_subscribe(root, below)
}
// fn is_superscribed(string: &str, vowel_index: usize, slices: &Vec<Slice>) -> bool {
/// Reports whether the letter pair just before the first vowel forms a
/// superscript + root stack (and not a root + subscript stack).
///
/// With the first vowel at letter index 2 the pair is letters 0 and 1.
/// Otherwise (expected: index 3) letter 0 must be one of `table::PREFIXES`
/// and the pair is letters 1 and 2.
fn is_superscribed(word: &Word) -> bool {
    let head_at = if word.vowels[0] == 2 {
        0
    } else {
        // vowel_index == 3
        if !table::PREFIXES.contains(&word.letter(0)) {
            return false;
        }
        1
    };
    let (head, root) = (word.letter(head_at), word.letter(head_at + 1));
    valid_superscribe(head, root) && !valid_subscribe(head, root)
}
/// Returns `true` when `head_letter` is listed in `table::SUPERJOINED` and
/// `root_letter` is listed in `table::SUPERJOINABLE`.
fn valid_superscribe(head_letter: &str, root_letter: &str) -> bool {
    let head_ok = table::SUPERJOINED
        .iter()
        .any(|&candidate| candidate == head_letter);
    if !head_ok {
        return false;
    }
    table::SUPERJOINABLE
        .iter()
        .any(|&candidate| candidate == root_letter)
}
/// Returns `true` when `subjoined_letter` is listed in `table::SUBJOINED` and
/// `root_letter` is listed in `table::SUBJOINABLE`.
fn valid_subscribe(root_letter: &str, subjoined_letter: &str) -> bool {
    let below_ok = table::SUBJOINED
        .iter()
        .any(|&candidate| candidate == subjoined_letter);
    if !below_ok {
        return false;
    }
    table::SUBJOINABLE
        .iter()
        .any(|&candidate| candidate == root_letter)
}
@ironhouzi
Copy link
Author

error[E0597]: `word` does not live long enough
   --> src/lib.rs:391:31
    |
391 |     let structure = get_root(&word);
    |                               ^^^^ borrowed value does not live long enough
...
394 | }
    | - borrowed value only lives until here
    |
note: borrowed value must be valid for the lifetime 'a as defined on the function body at 389:1...
   --> src/lib.rs:389:1
    |
389 | fn create_parsed_word<'a>(string: &'a str) -> ParsedWord<'a> {
    | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

error: aborting due to previous error

For more information about this error, try `rustc --explain E0597`.
error: Could not compile `rustib`.
warning: build failed, waiting for other jobs to finish...
error[E0597]: `word` does not live long enough
   --> src/lib.rs:391:31
    |
391 |     let structure = get_root(&word);
    |                               ^^^^ borrowed value does not live long enough
...
394 | }
    | - borrowed value only lives until here
    |
note: borrowed value must be valid for the lifetime 'a as defined on the function body at 389:1...
   --> src/lib.rs:389:1
    |
389 | fn create_parsed_word<'a>(string: &'a str) -> ParsedWord<'a> {
    | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

error: aborting due to previous error

For more information about this error, try `rustc --explain E0597`.
error: Could not compile `rustib`.

To learn more, run the command again with --verbose.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment