Created
January 25, 2015 21:05
-
-
Save ArtemGr/91e88de7e17fbc571926 to your computer and use it in GitHub Desktop.
Partial ICU bindings for Rust
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pub mod ICU { | |
// --- Hand-made ICU bindings --- | |
// $ apt-get install -y libicu-dev | |
// To check the argument types: | |
// $ apt-get install -y libicu52-dbg | |
// $ cd /usr/lib/x86_64-linux-gnu && gdb libicui18n.so | |
// ) print utrans_open_52 | |
// ) ptype UChar | |
// ) ptype UParseError | |
use libc::{c_int, c_char}; | |
use libc::funcs::c95::string::strlen; | |
use std::fmt; | |
use std::ffi::CString; | |
use std::iter::repeat; | |
use std::mem::transmute; | |
use std::ptr; | |
use std::raw::Slice; | |
use std::str::from_utf8_unchecked; | |
#[cfg(test)] use std::slice::from_raw_buf; | |
// http://icu-project.org/apiref/icu4c/structUParseError.html
/// Number of UTF-16 code units in each context buffer of `UParseError`
/// (ICU's `U_PARSE_CONTEXT_LEN`).
const U_PARSE_CONTEXT_LEN: usize = 16;

/// Rust mirror of ICU's `UParseError`, the out-parameter `utrans_openU` fills
/// in when a rule string fails to parse.
///
/// In C the two context members are inline `UChar[U_PARSE_CONTEXT_LEN]`
/// arrays, not single code units. Declaring them as plain `u16` would make
/// the Rust struct far smaller than the C one, so ICU would write past the
/// end of it as soon as a real pointer is passed for `parseError`.
#[repr(C)]
#[derive(Clone, Copy)]
pub struct UParseError {
    /// Line on which the error occurred.
    line: i32,
    /// Character offset of the error.
    offset: i32,
    /// Text preceding the error, NUL-terminated.
    pre_context: [u16; U_PARSE_CONTEXT_LEN],
    /// Text following the error, NUL-terminated.
    post_context: [u16; U_PARSE_CONTEXT_LEN],
}
/// Stand-in for ICU's `UTransliterator`.
///
/// The bindings in this module only ever pass it around behind a raw pointer;
/// none of its contents are described on the Rust side.
/// `Clone` is derived together with `Copy` because `Copy` requires it.
#[repr(C)]
#[derive(Clone, Copy)]
pub struct UTransliterator;
// There's more to it, but we only need the pointer.
/// Stand-in for ICU's `UEnumeration`, as returned by `utrans_openIDs` and
/// consumed by `uenum_next` / `uenum_close`.
///
/// `Clone` is derived together with `Copy` because `Copy` requires it.
// Only referenced from test-only code paths, hence the `dead_code` allowance.
#[allow(dead_code)]
#[repr(C)]
#[derive(Clone, Copy)]
pub struct UEnumeration;
#[link(name="icui18n")] #[allow(dead_code)] extern { | |
/// http://icu-project.org/apiref/icu4c/utrans_8h.html#a489c970cc731dac45c3bb5cced4ccb52 | |
pub fn utrans_openIDs_52 (pErrorCode: *mut c_int) -> *const UEnumeration; | |
/// http://icu-project.org/apiref/icu4c/utrans_8h.html#afaa9d751e1d24617ba1071d8d4c887a3 | |
pub fn utrans_openU_52 ( | |
// Might be a compound transliterator, like "Latin-Katakana; Katakana-Hiragana" or "Any-Upper; NFD; [:Nonspacing Mark:] Remove; NFC". | |
id: *const u16, | |
idLength: i32, | |
dir: c_int, // UTRANS_FORWARD = 0, UTRANS_REVERSE = 1. | |
rules: *const u16, | |
rulesLength: i32, | |
parseError: *mut UParseError, | |
pErrorCode: *mut c_int | |
) -> *const UTransliterator; | |
pub fn utrans_close_52 (ut: *const UTransliterator); | |
/// http://icu-project.org/apiref/icu4c/utrans_8h.html#af415d8aa51e79d4494ebb8ef8fc76ae2 | |
pub fn utrans_transUChars_52 ( | |
trans: *const UTransliterator, | |
text: *mut u16, | |
textLength: *mut i32, | |
textCapacity: i32, | |
start: i32, | |
limit: *mut i32, | |
status: *mut c_int);} | |
#[link(name="icuuc")] #[allow(dead_code)] extern { | |
/// http://icu-project.org/apiref/icu4c/uenum_8h.html#a99298eabaa3874cdfd9793b207848f68 | |
pub fn uenum_next_52 (ids: *const UEnumeration, resultLength: *mut i32, status: *mut i32) -> *const c_char; | |
/// http://icu-project.org/apiref/icu4c/uenum_8h.html#af8bf1abcf3a486f07ee3384c7fce89df | |
pub fn uenum_close_52 (ids: *const UEnumeration); | |
pub fn u_errorName_52 (code: c_int) -> *const c_char; | |
pub fn u_strFromUTF8_52 ( | |
dest: *mut u16, | |
destCapacity: i32, | |
pDestLength: *mut i32, | |
src: *const c_char, | |
srcLength: i32, | |
pErrorCode: *mut c_int | |
) -> *const u16;} | |
// (x)<=U_ZERO_ERROR, http://icu-project.org/apiref/icu4c/utypes_8h.html#a527f2c69e6b2e3b2c53ad8a99fb36711
/// Rust counterpart of ICU's `U_SUCCESS` macro.
///
/// Returns `true` when `error_code` is zero or negative, `false` for any
/// positive code.
pub fn u_success(error_code: c_int) -> bool {
    const U_ZERO_ERROR: c_int = 0;
    !(error_code > U_ZERO_ERROR)
}
// --- High-level interface --- | |
/// Newtype around a raw ICU status code, used as the error type of the
/// high-level functions in this module.
///
/// http://icu-project.org/apiref/icu4c/utypes_8h.html#a3343c1c8a8377277046774691c98d78c
// `Clone` accompanies `Copy` because `Copy` requires it.
#[derive(Clone, Copy)]
pub struct UErrorCode(c_int);
impl fmt::Debug for UErrorCode { | |
fn fmt (&self, fm: &mut fmt::Formatter) -> Result<(), fmt::Error> { | |
let error_message = unsafe {u_errorName_52 (match self {&UErrorCode (i) => i})}; | |
let error_message = if error_message == ptr::null() {"null"} else { | |
let slice = Slice {data: error_message, len: unsafe {strlen (error_message)} as usize}; | |
unsafe {from_utf8_unchecked (transmute (slice))}}; | |
try! (write! (fm, "UErrorCode ({})", error_message)); | |
Ok(())}} | |
/// The list of supported transformations, from `utrans_openIDs`.
///
/// Walks the ICU enumeration until it returns a null id, collecting every
/// non-empty id as an owned `String` (invalid UTF-8 is replaced lossily).
///
/// # Errors
/// Returns the ICU status code if opening the enumeration or advancing it
/// reports a failure; the enumeration is closed before returning in the
/// latter case.
#[cfg(test)]
pub fn ids() -> Result<Vec<String>, UErrorCode> {
    use std::slice;

    let mut ec = 0;
    let ids = unsafe { utrans_openIDs_52(&mut ec) };
    if !u_success(ec) {
        return Err(UErrorCode(ec));
    }

    let mut rvec = Vec::new();
    loop {
        let mut len = 0;
        ec = 0;
        let id = unsafe { uenum_next_52(ids, &mut len, &mut ec) };
        if !u_success(ec) {
            unsafe { uenum_close_52(ids) };
            return Err(UErrorCode(ec));
        }
        if id.is_null() {
            break;
        }
        if len > 0 {
            // SAFETY: `id` is non-null and ICU reported `len` bytes for it;
            // the bytes are copied into an owned String before the next
            // `uenum_next` call.
            let bytes = unsafe { slice::from_raw_parts(id as *const u8, len as usize) };
            rvec.push(String::from_utf8_lossy(bytes).into_owned());
        }
    }
    unsafe { uenum_close_52(ids) };
    Ok(rvec)
}
pub struct Transliterator (*const UTransliterator); | |
impl Transliterator { | |
pub fn run (&self, text: &str) -> Result<String, UErrorCode> { | |
let mut buf = try! (utf8_to_utf16 (text)); | |
let mut textLength = buf.len() as i32; // Remember the size of the original text. | |
buf.resize (textLength as usize + 32, 0); // Make some space for the transliterated version in case it's larger than the original. | |
{let cap = buf.capacity(); if buf.len() < cap {buf.resize (cap, 0);}} // If the vector still has some extra space then use it as well. | |
let mut limit = textLength; | |
let mut ec = 0; | |
unsafe {utrans_transUChars_52 ( // http://icu-project.org/apiref/icu4c/utrans_8h.html#af415d8aa51e79d4494ebb8ef8fc76ae2 | |
match self {&Transliterator (ut) => ut}, | |
buf.as_mut_slice().as_mut_ptr(), | |
&mut textLength, | |
buf.len() as i32, | |
0, // start | |
&mut limit, | |
&mut ec)}; | |
if u_success (ec) { | |
buf.truncate (limit as usize); | |
Ok (String::from_utf16_lossy (buf.as_slice())) | |
} else { | |
Err (UErrorCode (ec)) | |
}}} | |
impl Drop for Transliterator {fn drop (&mut self) {unsafe {utrans_close_52 (match self {&mut Transliterator (ut) => ut})}}} | |
/// Numeric value of the status code this module treats as ICU's
/// `U_BUFFER_OVERFLOW_ERROR` ("destination buffer too small").
pub fn U_BUFFER_OVERFLOW_ERROR() -> c_int {
    const CODE: c_int = 15;
    CODE
}
pub fn utf8_to_utf16 (text: &str) -> Result<Vec<u16>, UErrorCode> { | |
let text = CString::from_slice (text.as_bytes()); | |
let mut buf: Vec<u16> = repeat (0) .take (text.len() + 32) .collect(); | |
loop { | |
let mut ec = 0; let mut len = 0; | |
unsafe {u_strFromUTF8_52 (buf.as_mut_slice().as_mut_ptr(), buf.len() as i32, &mut len, text.as_ptr(), text.len() as i32, &mut ec)}; | |
if u_success (ec) {buf.truncate (len as usize); return Ok (buf)} | |
else if ec == U_BUFFER_OVERFLOW_ERROR() && buf.len() < text.len() * 4 + 32 {buf.resize (text.len(), 0); continue} | |
else {return Err (UErrorCode (ec))}}} | |
/// Opens an ICU transliterator.
///
/// * `id` - transliterator id, one of `ids`,
///   or a compound/filtered one (cf. http://userguide.icu-project.org/transforms/general#TOC-Filtered-IDs).
/// * `direction` - UTRANS_FORWARD = 0, UTRANS_REVERSE = 1.
///
/// No custom rules and no parse-error receiver are passed to ICU.
///
/// # Errors
/// Returns the ICU status code if converting `id` to UTF-16 or opening the
/// transliterator fails. A non-null pointer returned alongside a failure code
/// is closed before returning.
pub fn open_transliterator(id: &str, direction: i8) -> Result<Transliterator, UErrorCode> {
    let id = utf8_to_utf16(id)?;
    let mut ec = 0;
    // SAFETY: `id` stays alive for the duration of the call and its length is
    // passed explicitly; the rules and parse-error arguments are null.
    let ut = unsafe {
        utrans_openU_52(
            id.as_ptr(),
            id.len() as i32,
            direction as c_int,
            ptr::null(),
            0,
            ptr::null_mut(),
            &mut ec,
        )
    };
    if u_success(ec) {
        return Ok(Transliterator(ut));
    }
    if !ut.is_null() {
        unsafe { utrans_close_52(ut) }
    }
    Err(UErrorCode(ec))
}
} // end of `pub mod ICU`
// Smoke test against the installed ICU: the "Russian-Latin/BGN" id must be
// listed by `ids` and must romanise a short Cyrillic sample as expected.
#[test]
fn icu_transliteration() {
    use by_db::ICU::*;

    let known_ids = ids().expect("listing transliterator ids failed");
    assert!(known_ids.iter().any(|id| id.as_str() == "Russian-Latin/BGN"));

    let transliterator =
        open_transliterator("Russian-Latin/BGN", 0).expect("opening Russian-Latin/BGN failed");
    let romanised = transliterator.run("Проверка.").expect("transliteration failed");
    assert_eq!(romanised, "Proverka.");
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment