Skip to content

Instantly share code, notes, and snippets.

@ArtemGr
Created January 25, 2015 21:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ArtemGr/91e88de7e17fbc571926 to your computer and use it in GitHub Desktop.
Save ArtemGr/91e88de7e17fbc571926 to your computer and use it in GitHub Desktop.
Partial ICU bindings for Rust
pub mod ICU {
// --- Hand-made ICU bindings ---
// $ apt-get install -y libicu-dev
// To check the argument types:
// $ apt-get install -y libicu52-dbg
// $ cd /usr/lib/x86_64-linux-gnu && gdb libicui18n.so
// ) print utrans_open_52
// ) ptype UChar
// ) ptype UParseError
use libc::{c_int, c_char};
use libc::funcs::c95::string::strlen;
use std::fmt;
use std::ffi::CString;
use std::iter::repeat;
use std::mem::transmute;
use std::ptr;
use std::raw::Slice;
use std::str::from_utf8_unchecked;
#[cfg(test)] use std::slice::from_raw_buf;
// http://icu-project.org/apiref/icu4c/structUParseError.html
/// Mirror of ICU's `UParseError`, filled in by ICU when a rule or id fails to parse.
///
/// The layout must match the C struct exactly because ICU writes into it through a raw pointer:
/// two `int32_t` fields followed by two `UChar[U_PARSE_CONTEXT_LEN]` arrays, where
/// `U_PARSE_CONTEXT_LEN` is 16. Declaring the context fields as single `u16` values would make
/// the Rust struct 60 bytes too small and let ICU write past its end.
///
/// * `line` - line on which the error occurred.
/// * `offset` - character offset of the error.
/// * `pre_context` - text preceding the error (NUL-terminated UTF-16).
/// * `post_context` - text following the error (NUL-terminated UTF-16).
#[repr(C)] #[derive(Copy, Clone)] pub struct UParseError {line: i32, offset: i32, pre_context: [u16; 16], post_context: [u16; 16]}
// Opaque stand-in for ICU's transliterator object. This module never constructs or inspects
// it; it only appears behind `*const UTransliterator` in the FFI signatures.
#[repr(C)] #[derive(Copy)] pub struct UTransliterator;
// There's more to it, but we only need the pointer.
// Opaque stand-in for ICU's enumeration object, likewise only used behind a raw pointer.
// NOTE(review): `dead_code` is presumably allowed because the only caller that walks an
// enumeration (`ids`) is compiled under `#[cfg(test)]` - confirm before removing the attribute.
#[allow(dead_code)]#[repr(C)] #[derive(Copy)] pub struct UEnumeration;
// Raw declarations for the transliteration functions exported by the `icui18n` library.
// The `_52` suffix on every symbol is part of the exported name (the notes at the top of the
// module refer to the libicu52 packages), so these bindings only link against that build.
#[link(name="icui18n")] #[allow(dead_code)] extern {
/// http://icu-project.org/apiref/icu4c/utrans_8h.html#a489c970cc731dac45c3bb5cced4ccb52
// Returns an enumeration handle; `ids()` walks it with `uenum_next_52` and releases it
// with `uenum_close_52`. The status is written through `pErrorCode`.
pub fn utrans_openIDs_52 (pErrorCode: *mut c_int) -> *const UEnumeration;
/// http://icu-project.org/apiref/icu4c/utrans_8h.html#afaa9d751e1d24617ba1071d8d4c887a3
// Opens a transliterator for a UTF-16 id. `open_transliterator` calls it with null `rules`,
// a `rulesLength` of 0 and a null `parseError`.
pub fn utrans_openU_52 (
// Might be a compound transliterator, like "Latin-Katakana; Katakana-Hiragana" or "Any-Upper; NFD; [:Nonspacing Mark:] Remove; NFC".
id: *const u16,
idLength: i32,
dir: c_int, // UTRANS_FORWARD = 0, UTRANS_REVERSE = 1.
rules: *const u16,
rulesLength: i32,
parseError: *mut UParseError,
pErrorCode: *mut c_int
) -> *const UTransliterator;
// Releases a handle obtained from `utrans_openU_52`; invoked from `Drop for Transliterator`
// and from the failure path of `open_transliterator`.
pub fn utrans_close_52 (ut: *const UTransliterator);
/// http://icu-project.org/apiref/icu4c/utrans_8h.html#af415d8aa51e79d4494ebb8ef8fc76ae2
// Transliterates the UTF-16 buffer `text` in place. `textLength` and `limit` are in/out
// values, `textCapacity` is the number of `u16` units available in `text`, and the outcome
// is reported through `status`. See `Transliterator::run` for the calling sequence.
pub fn utrans_transUChars_52 (
trans: *const UTransliterator,
text: *mut u16,
textLength: *mut i32,
textCapacity: i32,
start: i32,
limit: *mut i32,
status: *mut c_int);}
// Raw declarations for the helper functions this module needs from the `icuuc` library:
// enumeration walking, error names and UTF-8 to UTF-16 conversion.
#[link(name="icuuc")] #[allow(dead_code)] extern {
/// http://icu-project.org/apiref/icu4c/uenum_8h.html#a99298eabaa3874cdfd9793b207848f68
// Yields a pointer to the next element and stores its length in `resultLength`.
// `ids()` treats a null return as the end of the enumeration.
pub fn uenum_next_52 (ids: *const UEnumeration, resultLength: *mut i32, status: *mut i32) -> *const c_char;
/// http://icu-project.org/apiref/icu4c/uenum_8h.html#af8bf1abcf3a486f07ee3384c7fce89df
// Releases an enumeration handle; `ids()` calls it on every exit path after a successful open.
pub fn uenum_close_52 (ids: *const UEnumeration);
// Maps a status code to a C string; used by the `Debug` impl of `UErrorCode`, which reads
// the result up to its terminating NUL.
pub fn u_errorName_52 (code: c_int) -> *const c_char;
// Converts `srcLength` bytes of UTF-8 at `src` into UTF-16 units written to `dest`
// (at most `destCapacity` of them) and stores the resulting length in `pDestLength`.
// See `utf8_to_utf16` for the calling sequence.
pub fn u_strFromUTF8_52 (
dest: *mut u16,
destCapacity: i32,
pDestLength: *mut i32,
src: *const c_char,
srcLength: i32,
pErrorCode: *mut c_int
) -> *const u16;}
// Rust counterpart of ICU's `U_SUCCESS(x)` macro, defined as `(x) <= U_ZERO_ERROR`:
// http://icu-project.org/apiref/icu4c/utypes_8h.html#a527f2c69e6b2e3b2c53ad8a99fb36711
/// Tells whether an ICU status code denotes success.
///
/// Zero and every negative code count as success; only strictly positive codes are failures.
pub fn u_success (error_code: c_int) -> bool {
  let is_failure = error_code > 0;
  !is_failure
}
// --- High-level interface ---
/// Newtype around a raw ICU status code, used as the error type of the high-level functions.
/// http://icu-project.org/apiref/icu4c/utypes_8h.html#a3343c1c8a8377277046774691c98d78c
#[derive(Copy)] pub struct UErrorCode (c_int);
// Debug output shows the name reported by `u_errorName` rather than the bare number,
// e.g. `UErrorCode (U_BUFFER_OVERFLOW_ERROR)`.
impl fmt::Debug for UErrorCode {
  fn fmt (&self, fm: &mut fmt::Formatter) -> Result<(), fmt::Error> {
    let name_ptr = unsafe {u_errorName_52 (self.0)};
    let name = if name_ptr.is_null() {
      "null"
    } else {
      // View the NUL-terminated C string as a `&str` without copying it:
      // build the (pointer, length) pair by hand and reinterpret it as a slice.
      let name_len = unsafe {strlen (name_ptr)} as usize;
      let raw = Slice {data: name_ptr, len: name_len};
      unsafe {from_utf8_unchecked (transmute (raw))}
    };
    write! (fm, "UErrorCode ({})", name)
  }
}
/// The list of supported transformations, from `utrans_openIDs`.
///
/// Walks the enumeration until `uenum_next` returns null, skipping zero-length entries.
/// Returns the ICU status as `Err` if opening the enumeration or any step fails.
/// The enumeration is closed on every path once it has been opened successfully.
#[cfg(test)] pub fn ids() -> Result<Vec<String>, UErrorCode> {
  let mut status = 0;
  let enumeration = unsafe {utrans_openIDs_52 (&mut status)};
  if !u_success (status) {return Err (UErrorCode (status))}

  let mut names = Vec::new();
  let mut failure = None;
  loop {
    let mut name_len = 0;
    status = 0;
    let name = unsafe {uenum_next_52 (enumeration, &mut name_len, &mut status)};
    if !u_success (status) {failure = Some (UErrorCode (status)); break}
    if name.is_null() {break}
    if name_len <= 0 {continue}
    // Keep the pointer in a local so that the slice borrowed from it lives long enough.
    let name_bytes = name as *const u8;
    let bytes = unsafe {from_raw_buf (&name_bytes, name_len as usize)};
    names.push (String::from_utf8_lossy (bytes) .into_owned());
  }

  // Single cleanup point for both the normal and the error exit.
  unsafe {uenum_close_52 (enumeration)};
  match failure {
    Some (code) => Err (code),
    None => Ok (names)
  }
}
pub struct Transliterator (*const UTransliterator);
impl Transliterator {
pub fn run (&self, text: &str) -> Result<String, UErrorCode> {
let mut buf = try! (utf8_to_utf16 (text));
let mut textLength = buf.len() as i32; // Remember the size of the original text.
buf.resize (textLength as usize + 32, 0); // Make some space for the transliterated version in case it's larger than the original.
{let cap = buf.capacity(); if buf.len() < cap {buf.resize (cap, 0);}} // If the vector still has some extra space then use it as well.
let mut limit = textLength;
let mut ec = 0;
unsafe {utrans_transUChars_52 ( // http://icu-project.org/apiref/icu4c/utrans_8h.html#af415d8aa51e79d4494ebb8ef8fc76ae2
match self {&Transliterator (ut) => ut},
buf.as_mut_slice().as_mut_ptr(),
&mut textLength,
buf.len() as i32,
0, // start
&mut limit,
&mut ec)};
if u_success (ec) {
buf.truncate (limit as usize);
Ok (String::from_utf16_lossy (buf.as_slice()))
} else {
Err (UErrorCode (ec))
}}}
// Closes the underlying ICU handle when the wrapper goes out of scope.
impl Drop for Transliterator {
  fn drop (&mut self) {
    let handle = self.0;
    unsafe {utrans_close_52 (handle)}
  }
}
/// Numeric value of the ICU status code `U_BUFFER_OVERFLOW_ERROR`,
/// exposed as a function so it can be compared against raw `c_int` statuses.
pub fn U_BUFFER_OVERFLOW_ERROR() -> c_int {
  let code: c_int = 15;
  code
}
/// Converts a UTF-8 string into a vector of UTF-16 code units using ICU's `u_strFromUTF8`.
///
/// The bytes of `text` are handed to ICU together with their explicit length, so no
/// NUL-terminated copy is made and strings containing interior NUL bytes are accepted.
///
/// The destination starts with `text.len() + 32` units. Should ICU nevertheless report
/// `U_BUFFER_OVERFLOW_ERROR`, it stores the required length in `pDestLength`; the buffer is
/// grown to exactly that length and the conversion is repeated once.
///
/// Returns `Err` with the ICU status code on any other failure.
/// NOTE(review): the length is passed as `i32`, so inputs of 2 GiB or more are not supported.
pub fn utf8_to_utf16 (text: &str) -> Result<Vec<u16>, UErrorCode> {
  let src = text.as_ptr() as *const c_char;
  let src_len = text.len() as i32;
  let mut buf: Vec<u16> = repeat (0) .take (text.len() + 32) .collect();
  let mut retried = false;
  loop {
    let mut ec = 0; let mut len = 0;
    unsafe {u_strFromUTF8_52 (buf.as_mut_slice().as_mut_ptr(), buf.len() as i32, &mut len, src, src_len, &mut ec)};
    if u_success (ec) {buf.truncate (len as usize); return Ok (buf)}
    // Grow to the length ICU asked for. Retrying only once, and only when the request is
    // larger than the current buffer, guarantees that the loop terminates.
    if ec == U_BUFFER_OVERFLOW_ERROR() && !retried && len as usize > buf.len() {
      buf.resize (len as usize, 0);
      retried = true;
      continue
    }
    return Err (UErrorCode (ec))
  }
}
/// Opens an ICU transliterator and wraps it in an owning `Transliterator`.
///
/// * `id` - transliterator id, one of `ids`,
///   or a compound/filtered one (cf. http://userguide.icu-project.org/transforms/general#TOC-Filtered-IDs).
/// * `direction` - UTRANS_FORWARD = 0, UTRANS_REVERSE = 1.
///
/// No custom rules and no parse-error buffer are passed to ICU.
/// On failure the ICU status is returned, and any handle ICU still handed back is closed first.
pub fn open_transliterator (id: &str, direction: i8) -> Result<Transliterator, UErrorCode> {
  let id_utf16 = try! (utf8_to_utf16 (id));
  let mut status = 0;
  let handle = unsafe {utrans_openU_52 (
    id_utf16.as_slice().as_ptr(),
    id_utf16.len() as i32,
    direction as c_int,
    ptr::null(), // rules
    0,           // rulesLength
    ptr::null_mut(), // parseError
    &mut status)};
  if !u_success (status) {
    if !handle.is_null() {unsafe {utrans_close_52 (handle)}}
    return Err (UErrorCode (status))
  }
  Ok (Transliterator (handle))
}
} // end of `pub mod ICU`
// End-to-end check against the real ICU library: the BGN Russian romanization must be
// listed among the available ids and must turn "Проверка." into "Proverka.".
#[test] fn icu_transliteration() {
  use by_db::ICU::*;
  let known_ids = ids().ok().unwrap();
  assert! (known_ids.iter().any (|id| id.as_slice() == "Russian-Latin/BGN"));
  let transliterator = open_transliterator ("Russian-Latin/BGN", 0) .ok().unwrap();
  let latin = transliterator.run ("Проверка.") .ok().unwrap();
  assert_eq! (latin.as_slice(), "Proverka.")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment