Skip to content

Instantly share code, notes, and snippets.

@xqyww123
Last active August 29, 2015 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xqyww123/4f34aa48fe74f0215949 to your computer and use it in GitHub Desktop.
Save xqyww123/4f34aa48fe74f0215949 to your computer and use it in GitHub Desktop.
A patch to fix a bug in libmmseg for ruby
#include <fstream>
#include <string>
#include <iostream>
#include <cstdio>
#include <ctype.h>
#include <ruby.h>
/*
 * Compatibility shims for older Ruby headers. Every macro below is guarded
 * by #ifndef, so it is only defined when ruby.h has not already provided it.
 */
/* Ruby 1.7 defines NUM2LL(), LL2NUM() and ULL2NUM() macros */
/* Fallbacks: route the long-long conversions through the plain long ones. */
#ifndef NUM2LL
#define NUM2LL(x) NUM2LONG((x))
#endif
#ifndef LL2NUM
#define LL2NUM(x) INT2NUM((long) (x))
#endif
#ifndef ULL2NUM
#define ULL2NUM(x) UINT2NUM((unsigned long) (x))
#endif
/* Ruby 1.7 doesn't (yet) define NUM2ULL() */
/* Use rb_num2ull() when HAVE_LONG_LONG is set, otherwise NUM2ULONG(). */
#ifndef NUM2ULL
#ifdef HAVE_LONG_LONG
#define NUM2ULL(x) rb_num2ull((x))
#else
#define NUM2ULL(x) NUM2ULONG(x)
#endif
#endif
/* RSTRING_LEN, etc are new in Ruby 1.9, but ->ptr and ->len no longer work */
/* Define these for older versions so we can just write code the new way */
/* Each fallback reads the field directly from the RSTRING/RARRAY struct. */
#ifndef RSTRING_LEN
# define RSTRING_LEN(x) RSTRING(x)->len
#endif
#ifndef RSTRING_PTR
# define RSTRING_PTR(x) RSTRING(x)->ptr
#endif
#ifndef RARRAY_LEN
# define RARRAY_LEN(x) RARRAY(x)->len
#endif
#ifndef RARRAY_PTR
# define RARRAY_PTR(x) RARRAY(x)->ptr
#endif
#include <stdio.h>
#include <stdexcept>
/* calling conventions for Windows */
/*
 * SWIGSTDCALL expands to __stdcall on Windows/Cygwin targets and to nothing
 * everywhere else. It is only defined if not already set by the build.
 * NOTE(review): no use of SWIGSTDCALL is visible in this file - confirm
 * whether it is still needed.
 */
#ifndef SWIGSTDCALL
# if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__)
# define SWIGSTDCALL __stdcall
# else
# define SWIGSTDCALL
# endif
#endif
#include "SegmenterManager.h"
#include "Segmenter.h"
#ifdef __cplusplus
extern "C" {
#endif
// FIXME: should this be torn down when the shared object is unloaded?
// Single segmenter manager shared by every Mmseg object; mmseg_open()
// calls g_mgr.init() on first use and g_mgr.getSegmenter() per object.
css::SegmenterManager g_mgr;
// Non-zero once g_mgr.init() has returned 0 (set in mmseg_open()).
int g_bInited = 0;
/*
 * Free hook attached to every wrapped Mmseg object.
 *
 * Deliberately a no-op: nothing is released here. Its address doubles as
 * the tag that check_mmseg() compares against to recognise Mmseg objects.
 */
static void mmseg_dfree(void *cd)
{
    (void)cd; /* intentionally unused */
}

/* Alias used wherever the free hook is installed or compared. */
#define MMSEG_FREE mmseg_dfree
/*
 * Placeholder for releasing the native payload of an Mmseg object.
 *
 * No cleanup is performed yet; the argument is ignored and nil is
 * always returned.
 */
static VALUE mmseg_free(VALUE cd)
{
    (void)cd; /* nothing to release at the moment */
    return Qnil;
}
/*
 * Verify that obj is a T_DATA object whose free hook is MMSEG_FREE and
 * hand back its DATA_PTR, cast to VALUE.
 *
 * Check_Type raises if obj is not T_DATA; an ArgumentError naming obj's
 * class is raised when the free hook does not match.
 */
static VALUE check_mmseg(VALUE obj)
{
    Check_Type(obj, T_DATA);

    if (RDATA(obj)->dfree == MMSEG_FREE) {
        return (VALUE)DATA_PTR(obj);
    }

    rb_raise(rb_eArgError, "mmseg expected (%s)", rb_class2name(CLASS_OF(obj)));
    return Qnil; /* not reached: rb_raise does not return */
}
/*
 * Allocator registered for the Mmseg class.
 *
 * Produces a data object of class klass with no mark function, MMSEG_FREE
 * as its free hook and a null payload pointer.
 */
static VALUE mmseg_s_allocate(VALUE klass)
{
    VALUE wrapped = Data_Wrap_Struct(klass, 0, MMSEG_FREE, 0);
    return wrapped;
}
/*
 * Mmseg#initialize
 *
 * Validates self through check_mmseg(), passes the current payload to
 * mmseg_free(), clears the payload pointer and zeroes the position
 * counters @start, @end and @ri. Returns self.
 */
static VALUE mmseg_initialize(VALUE self)
{
    static const char *const counters[] = { "@start", "@end", "@ri" };
    VALUE payload = check_mmseg(self);

    mmseg_free(payload);
    DATA_PTR(self) = NULL;

    for (unsigned k = 0; k < sizeof counters / sizeof counters[0]; ++k) {
        rb_iv_set(self, counters[k], INT2NUM(0));
    }
    return self;
}
/*
 * Mmseg#next - advance to the next token.
 *
 * Takes one token from the wrapped css::Segmenter (peekToken + popToken)
 * and updates three instance variables:
 *   @start - set to the previous value of @end
 *   @end   - @start plus the number of characters in the token
 *   @ri    - running byte offset into @str (previous @ri plus the token's
 *            byte length as reported by the segmenter)
 *
 * The segmenter reports lengths in bytes, so the token's bytes are re-read
 * from @str and every byte that is not a UTF-8 continuation byte
 * (bit pattern 10xxxxxx) is counted as one character.
 *
 * Returns self when a token of non-zero length was produced, nil otherwise
 * (including when no segmenter is attached to self).
 * StringValuePtr raises if @str is not a String, e.g. when no text was set.
 */
static VALUE mmseg_next(VALUE self)
{
    u2 tok_len = 0;
    int nPos = 0, ri = 0;
    css::Segmenter* seg = NULL;
    VALUE v_str = rb_iv_get(self, "@str");
    char* str = StringValuePtr(v_str);
    /* Byte length of @str; used below to keep the scan inside the buffer. */
    long str_len = RSTRING_LEN(v_str);

    Data_Get_Struct(self, css::Segmenter, seg);
    if (seg) {
        u2 len = 0, symlen = 0;
        char* tok = (char*)seg->peekToken(len, symlen);
        //FIXME: if the Ruby build does not enable symlen, len and symlen are always the same.
        if (!tok || !*tok || !len)
            tok_len = 0;
        else
            tok_len = len;
        seg->popToken(len);
    }

    /* Current character position: where the previous token ended. */
    VALUE vPos = rb_iv_get(self, "@end");
    if (!NIL_P(vPos)) {
        nPos = FIX2INT(vPos);
    }

    /* Current byte offset into @str; nil is treated as 0. */
    VALUE v_ri = rb_iv_get(self, "@ri");
    if (!NIL_P(v_ri)) {
        ri = FIX2INT(v_ri);
    }
    rb_iv_set(self, "@ri", INT2NUM(ri + tok_len));

    /*
     * Count characters in the token's byte range. The range is clamped to
     * [0, str_len) so that an @ri that does not match @str can never cause
     * a read outside the string's buffer.
     */
    long first = ri < 0 ? 0 : (long)ri;
    long stop = (long)ri + tok_len;
    if (stop > str_len)
        stop = str_len;
    int rlen = 0;
    for (long i = first; i < stop; ++i) {
        if ((str[i] & 0xc0) != 0x80)
            rlen++;
    }

    rb_iv_set(self, "@start", INT2NUM(nPos));
    rb_iv_set(self, "@end", INT2NUM(nPos + rlen));

    if (tok_len)
        return self;
    else
        return Qnil;
}
/* Mmseg#start - returns the current value of the @start instance variable. */
static VALUE mmseg_start(VALUE self)
{
    VALUE start_idx = rb_iv_get(self, "@start");
    return start_idx;
}
/* Mmseg#end - returns the current value of the @end instance variable. */
static VALUE mmseg_end(VALUE self)
{
    VALUE end_idx = rb_iv_get(self, "@end");
    return end_idx;
}
/*
 * Mmseg#setText(str) - replace the text being segmented.
 *
 * str is re-encoded with String#encode("utf-8"); the encoded copy is handed
 * to the segmenter via setBuffer() and stored in @str so mmseg_next() can
 * re-read the token bytes from it. All position state (@start, @end and the
 * byte offset @ri) is reset to 0 so the next call to #next starts from the
 * beginning of the new text.
 *
 * Returns self on success, nil when str is not a String.
 * Raises RuntimeError when self has no segmenter attached (its data pointer
 * is NULL, as left by mmseg_s_allocate()/mmseg_initialize()).
 */
static VALUE mmseg_settext(VALUE self, VALUE str)
{
    if (TYPE(str) != T_STRING)
        return Qnil;

    css::Segmenter* seg = NULL;
    Data_Get_Struct(self, css::Segmenter, seg);
    if (!seg) {
        /* Without this guard the setBuffer() call below dereferences NULL. */
        rb_raise(rb_eRuntimeError,
                 "mmseg: no segmenter attached; create the object with Mmseg.createSeg");
    }

    str = rb_funcall(str, rb_intern("encode"), 1, rb_str_new2("utf-8"));
    int len = (int)RSTRING_LEN(str);
    const char* pstr = StringValuePtr(str);

    seg->setBuffer((u1*)pstr, len);

    rb_iv_set(self, "@str", str);
    rb_iv_set(self, "@start", INT2NUM(0));
    rb_iv_set(self, "@end", INT2NUM(0));
    /* @ri must restart with the new text, otherwise mmseg_next() would scan
     * the new string from the previous text's byte offset. */
    rb_iv_set(self, "@ri", INT2NUM(0));
    return self;
}
/*
 * Mmseg.createSeg(dict_path, str) - build a segmenter object for str.
 *
 * Registered as a singleton method, so `self` is the receiver of
 * createSeg and is used as the class of the wrapped object.
 *
 * str is re-encoded with String#encode("utf-8"). On the first call with a
 * String dict_path the shared manager g_mgr is initialised from it; a
 * non-zero result from g_mgr.init() aborts through rb_fatal(). A segmenter
 * is then obtained from g_mgr, given the encoded bytes via setBuffer() and
 * wrapped in a new data object whose @str holds the encoded string.
 *
 * Returns the new object, or nil when str is not a String, when the manager
 * has not been initialised (e.g. dict_path is not a String on first use),
 * or when no segmenter could be obtained.
 *
 * NOTE(review): the wrapped segmenter is never released - MMSEG_FREE is a
 * no-op. Confirm how css::SegmenterManager expects segmenters to be
 * returned before adding cleanup.
 */
static VALUE mmseg_open(VALUE self, VALUE dict_path, VALUE str)
{
    if (TYPE(str) != T_STRING)
        return Qnil;

    str = rb_funcall(str, rb_intern("encode"), 1, rb_str_new2("utf-8"));
    int len = (int)RSTRING_LEN(str);
    const char* pstr = StringValuePtr(str);

    if (!g_bInited && TYPE(dict_path) == T_STRING) {
        int nRet = g_mgr.init(StringValuePtr(dict_path));
        if (nRet != 0) {
            // should throw an exception
            rb_fatal("Can NOT init the segment library.");
            return Qnil; /* not reached: rb_fatal does not return */
        }
        g_bInited = 1;
    }

    if (!g_bInited)
        return Qnil;

    css::Segmenter* seg = g_mgr.getSegmenter();
    if (!seg) {
        /* Fail the same way as the other error paths instead of
         * dereferencing a null segmenter below. */
        return Qnil;
    }

    seg->setBuffer((u1*)pstr, len);
    self = Data_Wrap_Struct(self, NULL, MMSEG_FREE, (void *)seg);

    /* Stored on the object so mmseg_next() can re-read the token bytes. */
    rb_iv_set(self, "@str", str);
    return self;
}
VALUE cMMseg;
void Init_mmseg() {
cMMseg = rb_define_class("Mmseg", rb_cData);
rb_define_alloc_func(cMMseg, mmseg_s_allocate);
rb_define_singleton_method(cMMseg, "createSeg", RUBY_METHOD_FUNC(mmseg_open), 2);
rb_define_method(cMMseg, "initialize", RUBY_METHOD_FUNC(mmseg_initialize), 0);
rb_define_method(cMMseg, "setText", RUBY_METHOD_FUNC(mmseg_settext), 1);
rb_define_method(cMMseg, "next", RUBY_METHOD_FUNC(mmseg_next), 0);
rb_define_method(cMMseg, "start", RUBY_METHOD_FUNC(mmseg_start), 0);
rb_define_method(cMMseg, "end", RUBY_METHOD_FUNC(mmseg_end), 0);
}
#ifdef __cplusplus
}
#endif
# Fixes a silly bug: token positions were returned as byte offsets into the UTF-8 C string rather than as character indices.
*** rubyapi.cpp 2014-11-20 14:34:45.735467250 +0800
--- rubyapi.cpp 2010-05-03 03:16:17.000000000 +0800
***************
*** 104,127 ****
DATA_PTR(self) = NULL;
rb_iv_set(self, "@start", INT2NUM(0));
rb_iv_set(self, "@end", INT2NUM(0));
- rb_iv_set(self, "@ri", INT2NUM(0));
return self;
}
static VALUE mmseg_next(VALUE self)
{
u2 tok_len = 0;
! int nPos = 0, ri;
css::Segmenter* seg = NULL;
- VALUE v_str = rb_iv_get(self, "@str");
- char* str = StringValuePtr(v_str);
Data_Get_Struct(self, css::Segmenter, seg);
//printf("%d",seg); //check is got it
if(seg){
u2 len = 0, symlen = 0;
char* tok = (char*)seg->peekToken(len,symlen);
//printf("%s\t",tok);
- //printf("xxx %d : %d -- %d\n", len, symlen, tok);
//FIXME: if ruby version do not enbale symlen, the len and symlen always the same.
if(!tok || !*tok || !len)
tok_len = 0;
--- 104,123 ----
DATA_PTR(self) = NULL;
rb_iv_set(self, "@start", INT2NUM(0));
rb_iv_set(self, "@end", INT2NUM(0));
return self;
}
static VALUE mmseg_next(VALUE self)
{
u2 tok_len = 0;
! int nPos = 0;
css::Segmenter* seg = NULL;
Data_Get_Struct(self, css::Segmenter, seg);
//printf("%d",seg); //check is got it
if(seg){
u2 len = 0, symlen = 0;
char* tok = (char*)seg->peekToken(len,symlen);
//printf("%s\t",tok);
//FIXME: if ruby version do not enbale symlen, the len and symlen always the same.
if(!tok || !*tok || !len)
tok_len = 0;
***************
*** 134,149 ****
if(!NIL_P(vPos)){
nPos = FIX2INT(vPos);
}
- VALUE v_ri = rb_iv_get(self, "@ri");
- if (NIL_P(v_ri)) ri = 0;
- else ri = FIX2INT(v_ri);
- rb_iv_set(self, "@ri", INT2NUM(ri + tok_len));
- int rlen = 0, i;
- for (i=ri;i<ri+tok_len;++i)
- if ((str[i] & 0xc0) != 0x80)
- rlen++;
rb_iv_set(self, "@start", INT2NUM(nPos));
! rb_iv_set(self, "@end", INT2NUM(nPos+rlen));
if(tok_len)
return self;
else
--- 130,137 ----
if(!NIL_P(vPos)){
nPos = FIX2INT(vPos);
}
rb_iv_set(self, "@start", INT2NUM(nPos));
! rb_iv_set(self, "@end", INT2NUM(nPos+tok_len));
if(tok_len)
return self;
else
***************
*** 166,174 ****
int len;
const char* pstr;
if (TYPE(str) == T_STRING) {
- str = rb_funcall(str, rb_intern("encode"), 1, rb_str_new2("utf-8"));
len = RSTRING_LEN(str);
! pstr = StringValuePtr(str);
//printf("%d:%s\n",len,pstr);
}else
return Qnil;
--- 154,161 ----
int len;
const char* pstr;
if (TYPE(str) == T_STRING) {
len = RSTRING_LEN(str);
! pstr = STR2CSTR(str);
//printf("%d:%s\n",len,pstr);
}else
return Qnil;
***************
*** 177,183 ****
Data_Get_Struct(self, css::Segmenter, seg);
//printf("%s",pstr);
seg->setBuffer((u1*)pstr,len);
- rb_iv_set(self, "@str", str);
rb_iv_set(self, "@start", INT2NUM(0));
rb_iv_set(self, "@end", INT2NUM(0));
return self;
--- 164,169 ----
***************
*** 191,205 ****
int len;
const char* pstr;
if (TYPE(str) == T_STRING) {
- str = rb_funcall(str, rb_intern("encode"), 1, rb_str_new2("utf-8"));
len = RSTRING_LEN(str);
! pstr = StringValuePtr(str);
//printf("%d:%s\n",len,pstr);
}else
return Qnil;
if (!g_bInited && TYPE(dict_path) == T_STRING) {
! int nRet = g_mgr.init(StringValuePtr(dict_path));
if(nRet != 0) {
// should throw an exception
rb_fatal("Can NOT init the segment library.");
--- 177,190 ----
int len;
const char* pstr;
if (TYPE(str) == T_STRING) {
len = RSTRING_LEN(str);
! pstr = STR2CSTR(str);
//printf("%d:%s\n",len,pstr);
}else
return Qnil;
if (!g_bInited && TYPE(dict_path) == T_STRING) {
! int nRet = g_mgr.init(STR2CSTR(dict_path));
if(nRet != 0) {
// should throw an exception
rb_fatal("Can NOT init the segment library.");
***************
*** 217,223 ****
}else
return Qnil;
- rb_iv_set(self, "@str", str);
return self;
}
--- 202,207 ----
***************
*** 236,239 ****
#ifdef __cplusplus
}
! #endif
--- 220,223 ----
#ifdef __cplusplus
}
! #endif
\ 文件尾没有 newline 字符
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment