ArtemGr/icu.rs

## icu.rs
pub mod ICU {
  // --- Hand-made ICU bindings ---

  // $ apt-get install -y libicu-dev
  // To check the argument types:
  // $ apt-get install -y libicu52-dbg
  // $ cd /usr/lib/x86_64-linux-gnu && gdb libicui18n.so
  // ) print utrans_open_52
  // ) ptype UChar
  // ) ptype UParseError

  use libc::{c_int, c_char};
  use libc::funcs::c95::string::strlen;
  use std::fmt;
  use std::ffi::CString;
  use std::iter::repeat;
  use std::mem::transmute;
  use std::ptr;
  use std::raw::Slice;
  use std::str::from_utf8_unchecked;
  #[cfg(test)] use std::slice::from_raw_buf;

  // http://icu-project.org/apiref/icu4c/structUParseError.html
  /// Mirror of ICU's `UParseError`.
  ///
  /// ICU declares both context fields as `UChar` arrays of `U_PARSE_CONTEXT_LEN` (16) code units,
  /// so they are arrays here as well: with scalar fields the struct would be smaller than what
  /// ICU fills in whenever a non-null `parseError` pointer is passed.
  #[repr(C)] #[derive(Copy, Clone)] pub struct UParseError {line: i32, offset: i32, pre_context: [u16; 16], post_context: [u16; 16]}
  /// Opaque transliterator type; this module only ever handles pointers to it.
  #[repr(C)] #[derive(Copy, Clone)] pub struct UTransliterator;
  // There's more to it, but we only need the pointer.
  /// Opaque enumeration type; this module only ever handles pointers to it.
  #[allow(dead_code)]#[repr(C)] #[derive(Copy, Clone)] pub struct UEnumeration;

  // Transliteration entry points linked from `icui18n`. Every symbol is declared with the `_52` suffix.
  #[link(name="icui18n")] #[allow(dead_code)] extern {
    /// http://icu-project.org/apiref/icu4c/utrans_8h.html#a489c970cc731dac45c3bb5cced4ccb52
    /// Returns an enumeration handle; `ids` walks it with `uenum_next_52` and closes it with `uenum_close_52`.
    pub fn utrans_openIDs_52 (pErrorCode: *mut c_int) -> *const UEnumeration;
    /// http://icu-project.org/apiref/icu4c/utrans_8h.html#afaa9d751e1d24617ba1071d8d4c887a3
    /// Opens a transliterator from a UTF-16 `id`; `open_transliterator` passes null `rules` and a null `parseError`.
    pub fn utrans_openU_52 (
      // Might be a compound transliterator, like "Latin-Katakana; Katakana-Hiragana" or "Any-Upper; NFD; [:Nonspacing Mark:] Remove; NFC".
      id: *const u16,
      idLength: i32,
      dir: c_int,  // UTRANS_FORWARD = 0, UTRANS_REVERSE = 1.
      rules: *const u16,
      rulesLength: i32,
      parseError: *mut UParseError,
      pErrorCode: *mut c_int
    ) -> *const UTransliterator;
    /// Closes a handle returned by `utrans_openU_52`; called from `Drop for Transliterator` and on the failure path of `open_transliterator`.
    pub fn utrans_close_52 (ut: *const UTransliterator);
    /// http://icu-project.org/apiref/icu4c/utrans_8h.html#af415d8aa51e79d4494ebb8ef8fc76ae2
    /// Transliterates the UTF-16 `text` buffer; `textLength`, `limit` and `status` are in/out pointers.
    pub fn utrans_transUChars_52 (
      trans: *const UTransliterator,
      text: *mut u16,
      textLength: *mut i32,
      textCapacity: i32,
      start: i32,
      limit: *mut i32,
      status: *mut c_int);}
  // Helpers linked from `icuuc`: enumeration access, error names and UTF-8 to UTF-16 conversion.
  #[link(name="icuuc")] #[allow(dead_code)] extern {
    /// http://icu-project.org/apiref/icu4c/uenum_8h.html#a99298eabaa3874cdfd9793b207848f68
    /// `ids` treats a null return as the end of the enumeration and reads `resultLength` bytes from a non-null one.
    pub fn uenum_next_52 (ids: *const UEnumeration, resultLength: *mut i32, status: *mut i32) -> *const c_char;
    /// http://icu-project.org/apiref/icu4c/uenum_8h.html#af8bf1abcf3a486f07ee3384c7fce89df
    pub fn uenum_close_52 (ids: *const UEnumeration);
    /// Name of a status code as a C string; the `Debug` impl of `UErrorCode` checks it for null and measures it with `strlen`.
    pub fn u_errorName_52 (code: c_int) -> *const c_char;
    /// UTF-8 to UTF-16 conversion used by `utf8_to_utf16`; the number of output units is reported through `pDestLength`.
    pub fn u_strFromUTF8_52 (
      dest: *mut u16,
      destCapacity: i32,
      pDestLength: *mut i32,
      src: *const c_char,
      srcLength: i32,
      pErrorCode: *mut c_int
    ) -> *const u16;}

  // Rust counterpart of ICU's U_SUCCESS macro, i.e. (x)<=U_ZERO_ERROR:
  // http://icu-project.org/apiref/icu4c/utypes_8h.html#a527f2c69e6b2e3b2c53ad8a99fb36711
  /// Returns `true` unless `error_code` is strictly positive; zero and negative codes count as success.
  pub fn u_success (error_code: c_int) -> bool {
    !(error_code > 0)}

  // --- High-level interface ---

  /// Raw ICU status code wrapped in a newtype so that it can be returned as an error.
  /// http://icu-project.org/apiref/icu4c/utypes_8h.html#a3343c1c8a8377277046774691c98d78c
  #[derive(Copy)] pub struct UErrorCode (c_int);
  impl fmt::Debug for UErrorCode {
    /// Prints `UErrorCode (NAME)`, where NAME comes from `u_errorName`, or `null` if ICU returns no name.
    fn fmt (&self, fm: &mut fmt::Formatter) -> Result<(), fmt::Error> {
      let code = match self {&UErrorCode (code) => code};
      let name_ptr = unsafe {u_errorName_52 (code)};
      let name = if name_ptr == ptr::null() {
        "null"
      } else {
        // View the C string returned by ICU as a `&str` without copying it.
        let name_len = unsafe {strlen (name_ptr)} as usize;
        let raw = Slice {data: name_ptr, len: name_len};
        unsafe {from_utf8_unchecked (transmute (raw))}
      };
      write! (fm, "UErrorCode ({})", name)}}

  /// Collects the transliterator ids enumerated by `utrans_openIDs`.
  ///
  /// Entries with a non-positive length are skipped. The first failing ICU status is returned
  /// as `UErrorCode`; the enumeration is closed on both the success and the failure path.
  #[cfg(test)] pub fn ids() -> Result<Vec<String>, UErrorCode> {
    let mut status = 0;
    let enumeration = unsafe {utrans_openIDs_52 (&mut status)};
    if !u_success (status) {return Err (UErrorCode (status))}

    let mut names = Vec::new();
    loop {
      let mut name_len = 0;
      status = 0;
      let name = unsafe {uenum_next_52 (enumeration, &mut name_len, &mut status)};
      if !u_success (status) {
        unsafe {uenum_close_52 (enumeration)};
        return Err (UErrorCode (status))}
      if name == ptr::null() {break}  // A null id marks the end of the enumeration.
      if name_len <= 0 {continue}
      let bytes_ptr = name as *const u8;
      let owned = unsafe {String::from_utf8_lossy (from_raw_buf (&bytes_ptr, name_len as usize)) .into_owned()};
      names.push (owned);
    }

    unsafe {uenum_close_52 (enumeration)};
    Ok (names)}

  pub struct Transliterator (*const UTransliterator);
  impl Transliterator {
    pub fn run (&self, text: &str) -> Result<String, UErrorCode> {
      let mut buf = try! (utf8_to_utf16 (text));
      let mut textLength = buf.len() as i32;  // Remember the size of the original text.
      buf.resize (textLength as usize + 32, 0);  // Make some space for the transliterated version in case it's larger than the original.
      {let cap = buf.capacity(); if buf.len() < cap {buf.resize (cap, 0);}}  // If the vector still has some extra space then use it as well.
      let mut limit = textLength;
      let mut ec = 0;
      unsafe {utrans_transUChars_52 (  // http://icu-project.org/apiref/icu4c/utrans_8h.html#af415d8aa51e79d4494ebb8ef8fc76ae2
        match self {&Transliterator (ut) => ut},
        buf.as_mut_slice().as_mut_ptr(),
        &mut textLength,
        buf.len() as i32,
        0,  // start
        &mut limit,
        &mut ec)};
      if u_success (ec) {
        buf.truncate (limit as usize);
        Ok (String::from_utf16_lossy (buf.as_slice()))
      } else {
        Err (UErrorCode (ec))
      }}}
  impl Drop for Transliterator {fn drop (&mut self) {unsafe {utrans_close_52 (match self {&mut Transliterator (ut) => ut})}}}

  /// Numeric status code that `utf8_to_utf16` compares against to detect a too-small destination buffer.
  pub fn U_BUFFER_OVERFLOW_ERROR() -> c_int {
    let code: c_int = 15;
    code}

  pub fn utf8_to_utf16 (text: &str) -> Result<Vec<u16>, UErrorCode> {
    let text = CString::from_slice (text.as_bytes());
    let mut buf: Vec<u16> = repeat (0) .take (text.len() + 32) .collect();
    loop {
      let mut ec = 0; let mut len = 0;
      unsafe {u_strFromUTF8_52 (buf.as_mut_slice().as_mut_ptr(), buf.len() as i32, &mut len, text.as_ptr(), text.len() as i32, &mut ec)};
      if u_success (ec) {buf.truncate (len as usize); return Ok (buf)}
      else if ec == U_BUFFER_OVERFLOW_ERROR() && buf.len() < text.len() * 4 + 32 {buf.resize (text.len(), 0); continue}
      else {return Err (UErrorCode (ec))}}}

  /// Opens an ICU transliterator without custom rules.
  ///
  /// * `id` - transliterator id, one of `ids`,
  ///          or a compound/filtered one (cf. http://userguide.icu-project.org/transforms/general#TOC-Filtered-IDs).
  /// * `direction` - UTRANS_FORWARD = 0, UTRANS_REVERSE = 1.
  ///
  /// On failure the ICU status is returned as `UErrorCode`, and a handle that was handed out anyway is closed first.
  pub fn open_transliterator (id: &str, direction: i8) -> Result<Transliterator, UErrorCode> {
    let id_utf16 = try! (utf8_to_utf16 (id));
    let mut status = 0;
    let handle = unsafe {utrans_openU_52 (
      id_utf16.as_slice().as_ptr(), id_utf16.len() as i32,
      direction as c_int,
      ptr::null(), 0,   // No custom rules.
      ptr::null_mut(),  // Parse errors are not collected.
      &mut status)};
    if !u_success (status) {
      if handle != ptr::null() {unsafe {utrans_close_52 (handle)}}
      return Err (UErrorCode (status))}
    Ok (Transliterator (handle))}
}

// Smoke test: the BGN Russian-Latin transform is listed by `ids` and romanizes a sample word.
#[test] fn icu_transliteration() {
  use by_db::ICU::*;
  let known_ids = ids().ok().unwrap();
  assert! (known_ids.iter().any (|id| id.as_slice() == "Russian-Latin/BGN"));
  let transliterator = open_transliterator ("Russian-Latin/BGN", 0) .ok().unwrap();
  let latin = transliterator.run ("Проверка.") .ok().unwrap();
  assert_eq! (latin.as_slice(), "Proverka.")}
	pub mod ICU {
	// --- Hand-made ICU bindings ---

	// $ apt-get install -y libicu-dev
	// To check the argument types:
	// $ apt-get install -y libicu52-dbg
	// $ cd /usr/lib/x86_64-linux-gnu && gdb libicui18n.so
	// ) print utrans_open_52
	// ) ptype UChar
	// ) ptype UParseError

	use libc::{c_int, c_char};
	use libc::funcs::c95::string::strlen;
	use std::fmt;
	use std::ffi::CString;
	use std::iter::repeat;
	use std::mem::transmute;
	use std::ptr;
	use std::raw::Slice;
	use std::str::from_utf8_unchecked;
	#[cfg(test)] use std::slice::from_raw_buf;

	// http://icu-project.org/apiref/icu4c/structUParseError.html
	/// Mirror of ICU's `UParseError`.
	///
	/// ICU declares both context fields as `UChar` arrays of `U_PARSE_CONTEXT_LEN` (16) code units,
	/// so they are arrays here as well: with scalar fields the struct would be smaller than what
	/// ICU fills in whenever a non-null `parseError` pointer is passed.
	#[repr(C)] #[derive(Copy, Clone)] pub struct UParseError {line: i32, offset: i32, pre_context: [u16; 16], post_context: [u16; 16]}
	/// Opaque transliterator type; this module only ever handles pointers to it.
	#[repr(C)] #[derive(Copy, Clone)] pub struct UTransliterator;
	// There's more to it, but we only need the pointer.
	/// Opaque enumeration type; this module only ever handles pointers to it.
	#[allow(dead_code)]#[repr(C)] #[derive(Copy, Clone)] pub struct UEnumeration;

	#[link(name="icui18n")] #[allow(dead_code)] extern {
	/// http://icu-project.org/apiref/icu4c/utrans_8h.html#a489c970cc731dac45c3bb5cced4ccb52
	pub fn utrans_openIDs_52 (pErrorCode: mut c_int) -> const UEnumeration;
	/// http://icu-project.org/apiref/icu4c/utrans_8h.html#afaa9d751e1d24617ba1071d8d4c887a3
	pub fn utrans_openU_52 (
	// Might be a compound transliterator, like "Latin-Katakana; Katakana-Hiragana" or "Any-Upper; NFD; [:Nonspacing Mark:] Remove; NFC".
	id: *const u16,
	idLength: i32,
	dir: c_int, // UTRANS_FORWARD = 0, UTRANS_REVERSE = 1.
	rules: *const u16,
	rulesLength: i32,
	parseError: *mut UParseError,
	pErrorCode: *mut c_int
	) -> *const UTransliterator;
	pub fn utrans_close_52 (ut: *const UTransliterator);
	/// http://icu-project.org/apiref/icu4c/utrans_8h.html#af415d8aa51e79d4494ebb8ef8fc76ae2
	pub fn utrans_transUChars_52 (
	trans: *const UTransliterator,
	text: *mut u16,
	textLength: *mut i32,
	textCapacity: i32,
	start: i32,
	limit: *mut i32,
	status: *mut c_int);}
	#[link(name="icuuc")] #[allow(dead_code)] extern {
	/// http://icu-project.org/apiref/icu4c/uenum_8h.html#a99298eabaa3874cdfd9793b207848f68
	pub fn uenum_next_52 (ids: const UEnumeration, resultLength: mut i32, status: mut i32) -> const c_char;
	/// http://icu-project.org/apiref/icu4c/uenum_8h.html#af8bf1abcf3a486f07ee3384c7fce89df
	pub fn uenum_close_52 (ids: *const UEnumeration);
	pub fn u_errorName_52 (code: c_int) -> *const c_char;
	pub fn u_strFromUTF8_52 (
	dest: *mut u16,
	destCapacity: i32,
	pDestLength: *mut i32,
	src: *const c_char,
	srcLength: i32,
	pErrorCode: *mut c_int
	) -> *const u16;}

	// Rust counterpart of ICU's U_SUCCESS macro, i.e. (x)<=U_ZERO_ERROR:
	// http://icu-project.org/apiref/icu4c/utypes_8h.html#a527f2c69e6b2e3b2c53ad8a99fb36711
	/// Returns `true` unless `error_code` is strictly positive; zero and negative codes count as success.
	pub fn u_success (error_code: c_int) -> bool {
	  !(error_code > 0)}

	// --- High-level interface ---

	/// Raw ICU status code wrapped in a newtype so that it can be returned as an error.
	/// http://icu-project.org/apiref/icu4c/utypes_8h.html#a3343c1c8a8377277046774691c98d78c
	#[derive(Copy)] pub struct UErrorCode (c_int);
	impl fmt::Debug for UErrorCode {
	  /// Prints `UErrorCode (NAME)`, where NAME comes from `u_errorName`, or `null` if ICU returns no name.
	  fn fmt (&self, fm: &mut fmt::Formatter) -> Result<(), fmt::Error> {
	    let code = match self {&UErrorCode (code) => code};
	    let name_ptr = unsafe {u_errorName_52 (code)};
	    let name = if name_ptr == ptr::null() {
	      "null"
	    } else {
	      // View the C string returned by ICU as a `&str` without copying it.
	      let name_len = unsafe {strlen (name_ptr)} as usize;
	      let raw = Slice {data: name_ptr, len: name_len};
	      unsafe {from_utf8_unchecked (transmute (raw))}
	    };
	    write! (fm, "UErrorCode ({})", name)}}

	/// Collects the transliterator ids enumerated by `utrans_openIDs`.
	///
	/// Entries with a non-positive length are skipped. The first failing ICU status is returned
	/// as `UErrorCode`; the enumeration is closed on both the success and the failure path.
	#[cfg(test)] pub fn ids() -> Result<Vec<String>, UErrorCode> {
	  let mut status = 0;
	  let enumeration = unsafe {utrans_openIDs_52 (&mut status)};
	  if !u_success (status) {return Err (UErrorCode (status))}

	  let mut names = Vec::new();
	  loop {
	    let mut name_len = 0;
	    status = 0;
	    let name = unsafe {uenum_next_52 (enumeration, &mut name_len, &mut status)};
	    if !u_success (status) {
	      unsafe {uenum_close_52 (enumeration)};
	      return Err (UErrorCode (status))}
	    if name == ptr::null() {break}  // A null id marks the end of the enumeration.
	    if name_len <= 0 {continue}
	    let bytes_ptr = name as *const u8;
	    let owned = unsafe {String::from_utf8_lossy (from_raw_buf (&bytes_ptr, name_len as usize)) .into_owned()};
	    names.push (owned);
	  }

	  unsafe {uenum_close_52 (enumeration)};
	  Ok (names)}

	pub struct Transliterator (*const UTransliterator);
	impl Transliterator {
	pub fn run (&self, text: &str) -> Result<String, UErrorCode> {
	let mut buf = try! (utf8_to_utf16 (text));
	let mut textLength = buf.len() as i32; // Remember the size of the original text.
	buf.resize (textLength as usize + 32, 0); // Make some space for the transliterated version in case it's larger than the original.
	{let cap = buf.capacity(); if buf.len() < cap {buf.resize (cap, 0);}} // If the vector still has some extra space then use it as well.
	let mut limit = textLength;
	let mut ec = 0;
	unsafe {utrans_transUChars_52 ( // http://icu-project.org/apiref/icu4c/utrans_8h.html#af415d8aa51e79d4494ebb8ef8fc76ae2
	match self {&Transliterator (ut) => ut},
	buf.as_mut_slice().as_mut_ptr(),
	&mut textLength,
	buf.len() as i32,
	0, // start
	&mut limit,
	&mut ec)};
	if u_success (ec) {
	buf.truncate (limit as usize);
	Ok (String::from_utf16_lossy (buf.as_slice()))
	} else {
	Err (UErrorCode (ec))
	}}}
	impl Drop for Transliterator {fn drop (&mut self) {unsafe {utrans_close_52 (match self {&mut Transliterator (ut) => ut})}}}

	/// Numeric status code that `utf8_to_utf16` compares against to detect a too-small destination buffer.
	pub fn U_BUFFER_OVERFLOW_ERROR() -> c_int {
	  let code: c_int = 15;
	  code}

	pub fn utf8_to_utf16 (text: &str) -> Result<Vec<u16>, UErrorCode> {
	let text = CString::from_slice (text.as_bytes());
	let mut buf: Vec<u16> = repeat (0) .take (text.len() + 32) .collect();
	loop {
	let mut ec = 0; let mut len = 0;
	unsafe {u_strFromUTF8_52 (buf.as_mut_slice().as_mut_ptr(), buf.len() as i32, &mut len, text.as_ptr(), text.len() as i32, &mut ec)};
	if u_success (ec) {buf.truncate (len as usize); return Ok (buf)}
	else if ec == U_BUFFER_OVERFLOW_ERROR() && buf.len() < text.len() * 4 + 32 {buf.resize (text.len(), 0); continue}
	else {return Err (UErrorCode (ec))}}}

	/// Opens an ICU transliterator without custom rules.
	///
	/// * `id` - transliterator id, one of `ids`,
	/// or a compound/filtered one (cf. http://userguide.icu-project.org/transforms/general#TOC-Filtered-IDs).
	/// * `direction` - UTRANS_FORWARD = 0, UTRANS_REVERSE = 1.
	///
	/// On failure the ICU status is returned as `UErrorCode`, and a handle that was handed out anyway is closed first.
	pub fn open_transliterator (id: &str, direction: i8) -> Result<Transliterator, UErrorCode> {
	  let id_utf16 = try! (utf8_to_utf16 (id));
	  let mut status = 0;
	  let handle = unsafe {utrans_openU_52 (
	    id_utf16.as_slice().as_ptr(), id_utf16.len() as i32,
	    direction as c_int,
	    ptr::null(), 0, // No custom rules.
	    ptr::null_mut(), // Parse errors are not collected.
	    &mut status)};
	  if !u_success (status) {
	    if handle != ptr::null() {unsafe {utrans_close_52 (handle)}}
	    return Err (UErrorCode (status))}
	  Ok (Transliterator (handle))}
	}

	// Smoke test: the BGN Russian-Latin transform is listed by `ids` and romanizes a sample word.
	#[test] fn icu_transliteration() {
	  use by_db::ICU::*;
	  // The closure delimiters are plain `|`; the backslash-escaped form is not valid Rust.
	  assert! (ids().ok().unwrap().iter().any (|id| id.as_slice() == "Russian-Latin/BGN"));
	  let transliterator = open_transliterator ("Russian-Latin/BGN", 0) .ok().unwrap();
	  assert_eq! (transliterator.run ("Проверка.") .ok().unwrap().as_slice(), "Proverka.")}