Skip to content

Instantly share code, notes, and snippets.

Created December 4, 2014 11:35
Show Gist options
  • Save lengshuiyulangcn/b65e51d4a7e068b9cc62 to your computer and use it in GitHub Desktop.
Save lengshuiyulangcn/b65e51d4a7e068b9cc62 to your computer and use it in GitHub Desktop.
swig and head file for nlpir
* NLPIR/ICTCLAS Lexical Analysis System Copyright (c) 2000-2014
* Dr. Kevin Zhang (Hua-Ping Zhang)
* All rights reserved.
* This file is the confidential and proprietary property of
* Kevin Zhang and the possession or use of this file requires
* a written license from the author.
* Filename:
* Abstract:
* NLPIR.h: definition of the NLPIR lexical analysis system API
* Author: Kevin Zhang
* Email:
* Weibo:
* Homepage:
* Date: 2013-12-19
* Notes:
// NOTE(review): no "#define __NLPIR_ICTCLAS_2014_H_INCLUDED__" follows this guard test in the visible text — confirm against the original header.
#if !defined(__NLPIR_ICTCLAS_2014_H_INCLUDED__)
#define OS_LINUX linux
#ifdef OS_LINUX
// NOTE(review): the four NLPIR_API definitions below have no #else/#elif/#endif between them,
// so as written each one redefines the previous; the separating conditionals appear to be missing — confirm.
#define NLPIR_API extern "C"
#define NLPIR_API extern "C" __declspec(dllexport)
#define NLPIR_API extern "C" __declspec(dllimport)
#define NLPIR_API
/*
 * Name aliases: every ICTCLAS_* identifier below expands to the NLPIR_*
 * function with the same suffix, so code written against the ICTCLAS_*
 * names resolves to the NLPIR API declared in this header.
 */

/* Start-up and shutdown */
#define ICTCLAS_Init                          NLPIR_Init
#define ICTCLAS_Exit                          NLPIR_Exit

/* File and paragraph processing */
#define ICTCLAS_FileProcess                   NLPIR_FileProcess
#define ICTCLAS_ParagraphProcess              NLPIR_ParagraphProcess
#define ICTCLAS_ParagraphProcessA             NLPIR_ParagraphProcessA
#define ICTCLAS_GetParagraphProcessAWordCount NLPIR_GetParagraphProcessAWordCount
#define ICTCLAS_ParagraphProcessAW            NLPIR_ParagraphProcessAW

/* User dictionary maintenance */
#define ICTCLAS_ImportUserDict                NLPIR_ImportUserDict
#define ICTCLAS_AddUserWord                   NLPIR_AddUserWord
#define ICTCLAS_SaveTheUsrDic                 NLPIR_SaveTheUsrDic
#define ICTCLAS_DelUsrWord                    NLPIR_DelUsrWord

/* Word queries and POS map selection */
#define ICTCLAS_GetUniProb                    NLPIR_GetUniProb
#define ICTCLAS_IsWord                        NLPIR_IsWord
#define ICTCLAS_SetPOSmap                     NLPIR_SetPOSmap

/* Instance lookup */
#define GetActiveICTCLAS                      GetActiveInstance
// POS map identifiers (see NLPIR_SetPOSmap) and the POS buffer size.
#define POS_MAP_NUMBER 4 //added by qp 2008.11.25
#define ICT_POS_MAP_FIRST 1 //ICT (Institute of Computing Technology) first-level POS tag set
#define ICT_POS_MAP_SECOND 0 //ICT (Institute of Computing Technology) second-level POS tag set
#define PKU_POS_MAP_SECOND 2 //PKU (Peking University) second-level POS tag set
#define PKU_POS_MAP_FIRST 3 //PKU (Peking University) first-level POS tag set
// Size of the sPOS character buffer in result_t.
#define POS_SIZE 40
// Element type of the result vector returned by NLPIR_ParagraphProcessA; the fields describe one word.
struct result_t{
int start; //start position of the word in the input sentence
int length; //length of the word
char sPOS[POS_SIZE];//word type: POS ID value, allows quick lookup in the POS table
int iPOS;//index number of the POS tag
int word_ID; //internal ID of the word; set to 0 or -1 for an out-of-vocabulary word
int word_type; //user-dictionary flag: 1 = word from the user dictionary; 0 = not from the user dictionary
int weight;//word weight,read weight
/*
 * Character-encoding selectors for the `encode` argument of NLPIR_Init.
 * The derived values are parenthesised so that they keep their meaning when
 * used inside a larger expression (e.g. `2 * BIG5_CODE` or `10 - UTF8_CODE`).
 */
#define GBK_CODE 0                    /* GBK encoding; supported by default */
#define UTF8_CODE (GBK_CODE + 1)      /* UTF-8 encoding */
#define BIG5_CODE (GBK_CODE + 2)      /* BIG5 encoding */
#define GBK_FANTI_CODE (GBK_CODE + 3) /* GBK encoding that also contains traditional Chinese characters */
* Func Name : Init
* Description: Init NLPIR
* The function must be invoked before any of the operations listed below
* Parameters : const char * sDataPath=NULL
* sDataPath: Path where the Data directory is stored.
* the default value is NULL, which indicates that the initial directory is the current working directory
* encode: encoding code (GBK_CODE, UTF8_CODE, BIG5_CODE or GBK_FANTI_CODE); default is GBK_CODE
* sLicenceCode: license code for unlimited usage; ordinary users can ignore it
* Returns : success or fail
* Author : Kevin Zhang
* History :
* 1.create 2013-6-8
NLPIR_API int NLPIR_Init(const char * sDataPath=0,int encode=GBK_CODE,const char*sLicenceCode=0);
* Func Name : NLPIR_Exit
* Description: Exit NLPIR and free related buffer
* Exit the program and free memory
* The function must be invoked when you no longer need any lexical analysis
* Parameters : None
* Returns : success or fail
* Author : Kevin Zhang
* History :
* 1.create 2002-8-6
NLPIR_API bool NLPIR_Exit();
* Func Name : ParagraphProcessing
* Description: Process a paragraph
* Parameters : sParagraph: The source paragraph
* bPOStagged:Judge whether need POS tagging, 0 for no tag;default:1
* i.e. 张华平于1978年3月9日出生于江西省波阳县。
* Result: 张华平/nr 于/p 1978年/t 3月/t 9日/t 出生于/v 江西省/ns 波阳县/ns 。/w
* Returns : the result buffer pointer
* Author : Kevin Zhang
* History :
* 1.create 2003-12-22
NLPIR_API const char * NLPIR_ParagraphProcess(const char *sParagraph,int bPOStagged=1);
* Func Name : NLPIR_ReleaseBuf
* Description: free buffer
* Parameters : sBuf: return buffer
* Returns : None
* Author : Kevin Zhang
* History :
* 1.create 2013-11-6
//NLPIR_API void NLPIR_ReleaseBuf(const char *sBuf);
* Func Name : ParagraphProcessingA
* Description: Process a paragraph
* Parameters : sParagraph: The source paragraph
* pResultCount: pointer to result vector size
* Returns : the pointer of result vector, it is managed by system,user cannot alloc and free it
* Author : Kevin Zhang
* History :
* 1.create 2006-10-26
NLPIR_API const result_t * NLPIR_ParagraphProcessA(const char *sParagraph,int *pResultCount,bool bUserDict=true);
* Func Name : NLPIR_GetParagraphProcessAWordCount
* Description: Get ProcessAWordCount, API for C#
* Get word count and it helps us prepare the proper size buffer for result_t vector
* Parameters : sParagraph: The source paragraph
* Returns : result vector size
* Author : Kevin Zhang
* History :
* 1.create 2007-3-15
NLPIR_API int NLPIR_GetParagraphProcessAWordCount(const char *sParagraph);
* Func Name : NLPIR_ParagraphProcessAW
* Description: Process a paragraph, API for C#
* Parameters : nCount: number of elements in the result vector (presumably the value returned by NLPIR_GetParagraphProcessAWordCount — confirm)
* result_t * result: pointer to the result vector; it is allocated by the invoker
* Returns : None
* Author :
* History :
* 1.create 2007-3-15
NLPIR_API void NLPIR_ParagraphProcessAW(int nCount,result_t * result);
* Func Name : NLPIR_FileProcess
* Description: Process a text file
* Parameters : sSourceFilename: The source file name
* sResultFilename: The result file name
* bPOStagged:Judge whether need POS tagging, 0 for no tag;default:1
* i.e. FileProcess("E:\\Sample\\Corpus_NewPOS\\199802_Org.txt","E:\\Sample\\Corpus_NewPOS\\199802_Org_cla.txt");
* Returns : success:
* fail:
* Author : Kevin Zhang
* History :
* 1.create 2005-11-22
NLPIR_API double NLPIR_FileProcess(const char *sSourceFilename,const char *sResultFilename,int bPOStagged=1);
* Func Name : ImportUserDict
* Description: Import User-defined dictionary
* Parameters : Text filename for user dictionary
* Returns : The number of lexical entry imported successfully
* Author : Kevin Zhang
* History :
* 1.create 2003-11-28
NLPIR_API unsigned int NLPIR_ImportUserDict(const char *sFilename);
* Func Name : NLPIR_AddUserWord
* Description: add a word to the user dictionary ,example:你好
* i3s n
* Parameters : sWord: the word to add to the user dictionary
* Returns : 1,true ; 0,false
* Author :
* History :
* 1.create 11:10:2008
NLPIR_API int NLPIR_AddUserWord(const char *sWord);//add by qp 2008.11.10
* Func Name : Save
* Description: Save dictionary to file
* Parameters :
* Returns : 1,true; 2,false (NOTE(review): NLPIR_AddUserWord documents 0 for false — confirm which value is correct)
* Author :
* History :
* 1.create 11:10:2008
NLPIR_API int NLPIR_SaveTheUsrDic();
* Func Name : NLPIR_DelUsrWord
* Description: delete a word from the user dictionary
* Parameters : sWord: the word to delete from the user dictionary
* Returns : -1, the word not exist in the user dictionary; else, the handle of the word deleted
* Author :
* History :
* 1.create 11:10:2008
NLPIR_API int NLPIR_DelUsrWord(const char *sWord);
* Func Name : NLPIR_GetUniProb
* Description: Get Unigram Probability
* Parameters : sWord: the word whose unigram probability is requested
* Returns : the unigram probability of sWord (double); the value returned on failure is not documented here
* Author : Kevin Zhang
* History :
* 1.create 2005-11-22
NLPIR_API double NLPIR_GetUniProb(const char *sWord);
* Func Name : NLPIR_IsWord
* Description: Check whether sWord is a word (NOTE(review): presumably "known to the lexicon" — confirm)
* Parameters : sWord: the word to check
* Returns : bool; presumably true when sWord is a word and false otherwise — confirm
* Author : Kevin Zhang
* History :
* 1.create 2005-11-22
NLPIR_API bool NLPIR_IsWord(const char *sWord);
* Func Name : NLPIR_GetKeyWords
* Description: Extract keyword from sLine
* Parameters : sLine, the input paragraph
bWeightOut, whether the keyword weight is output
nMaxKeyLimit: maximum number of keywords, up to 50
* Returns : keywords list like:
* "科学发展观 宏观经济 " or
"科学发展观 23.80 宏观经济 12.20" with weight
* Author :
* History :
* 1.create 2012/11/12
NLPIR_API const char * NLPIR_GetKeyWords(const char *sLine,int nMaxKeyLimit=50,bool bWeightOut=false);
* Func Name : NLPIR_GetFileKeyWords
* Description: Extract keyword from a text file
* Parameters : sFilename, the input text file name
bWeightOut, whether the keyword weight is output
nMaxKeyLimit: maximum number of keywords, up to 50
* Returns : keywords list like:
* "科学发展观 宏观经济 " or
"科学发展观 23.80 宏观经济 12.20" with weight
* Author :
* History :
* 1.create 2012/11/12
NLPIR_API const char * NLPIR_GetFileKeyWords(const char *sFilename,int nMaxKeyLimit=50,bool bWeightOut=false);
* Func Name : NLPIR_GetNewWords
* Description: Extract New words from sLine
* Parameters : sLine, the input paragraph
bWeightOut, whether the new-word weight is output
nMaxKeyLimit: maximum number of new words, up to 50
* Returns : new words list like:
* "科学发展观 屌丝 "or
"科学发展观 23.80 屌丝 12.20" with weight
* Author :
* History :
* 1.create 2012/11/12
NLPIR_API const char * NLPIR_GetNewWords(const char *sLine,int nMaxKeyLimit=50,bool bWeightOut=false);
* Func Name : NLPIR_GetFileNewWords
* Description: Extract new words from a text file
* Parameters : sFilename, the input text file name
bWeightOut, whether the new-word weight is output
nMaxKeyLimit: maximum number of new words, up to 50
* Returns : new words list like:
* "科学发展观 宏观经济 " or
"科学发展观 23.80 宏观经济 12.20" with weight
* Author :
* History :
* 1.create 2012/11/12
NLPIR_API const char * NLPIR_GetFileNewWords(const char *sFilename,int nMaxKeyLimit=50,bool bWeightOut=false);
* Func Name : NLPIR_FingerPrint
* Description: Extract a finger print from the paragraph
* Parameters : sLine: the content to fingerprint
* Returns : 0, failed; else, the finger print of the content
* Author :
* History :
* 1.create 11:10:2008
NLPIR_API unsigned long NLPIR_FingerPrint(const char *sLine);
* Func Name : NLPIR_SetPOSmap
* Description: select which pos map will use
* Parameters : nPOSmap: the POS map to use, e.g. ICT_POS_MAP_FIRST (ICT first-level tag set); see the *_POS_MAP_* constants
* Returns : 0, failed; else, success
* Author :
* History :
* 1.create 11:10:2008
NLPIR_API int NLPIR_SetPOSmap(int nPOSmap);
* class CNLPIR
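* Description:
* The NLPIR class. NLPIR_Init() must be called before use, and NLPIR_Exit() must be called on exit.
* Multiple CNLPIR instances may be used at the same time, which supports multi-threaded word segmentation.
* Each thread first calls GetActiveInstance() to obtain an instance, then calls SetAvailable(false) to claim it for that thread;
* when processing is finished, it calls SetAvailable(true) to release the instance.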
* History :
* 1.create 2005-11-10
// NOTE(review): two class heads follow this #ifdef with no #else/#endif between them, and no
// access specifiers or closing braces are visible; these appear to be missing — confirm against the original header.
#ifdef OS_LINUX
class CNLPIR {
class __declspec(dllexport) CNLPIR {
double FileProcess(const char *sSourceFilename,const char *sResultFilename,int bPOStagged=1);
//Process a file; counterpart of the C function NLPIR_FileProcess
const char * ParagraphProcess(const char *sLine,int bPOStagged=1);
//Process a paragraph; counterpart of the C function NLPIR_ParagraphProcess
const result_t * ParagraphProcessA(const char *sParagraph,int *pResultCount,bool bUserDict=true);
//Process a paragraph; counterpart of the C function NLPIR_ParagraphProcessA
void ParagraphProcessAW(int nCount,result_t * result);
int GetParagraphProcessAWordCount(const char *sParagraph);
const char * GetKeyWords(const char *sLine,int nMaxKeyLimit,bool bWeightOut);
const char * GetFileKeyWords(const char *sFilename,int nMaxKeyLimit,bool bWeightOut);
const char * GetNewWords(const char *sFilename,int nMaxKeyLimit,bool bWeightOut);
const char * GetFileNewWords(const char *sFilename,int nMaxKeyLimit,bool bWeightOut);
bool SetAvailable(bool bAvailable=true);//the current thread releases this instance so that the next thread can use it
bool IsAvailable();//check whether this segmenter is currently occupied by a thread
//Returns the handle value stored in m_nHandle.
unsigned int GetHandle()
return m_nHandle;
unsigned int m_nHandle;//handle value of this instance; assigned automatically by the system, must not be modified by the user
bool m_bAvailable;//parameter used for multi-thread sharing control; assigned automatically by the system, must not be modified by the user
* Func Name : GetActiveInstance
* Description: Get an available CNLPIR instance; intended for multi-threaded development: first obtain an available CNLPIR, then call its methods
* Parameters : None
* Returns : CNLPIR*
* Author : Kevin Zhang
* History :
* 1.create 1:10:2012
NLPIR_API CNLPIR* GetActiveInstance();
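* The following functions were added in the 2013 version specifically for the new-word discovery process; it is generally recommended to run them offline rather than online.
* Once new-word identification is finished, the results can be imported automatically into the segmentation system.
* These functions are prefixed with NLPIR_NWI (New Word Identification).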
* Func Name : NLPIR_NWI_Start
* Description: Start new-word identification
* Parameters : None
* Returns : bool, true:success, false:fail
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
NLPIR_API bool NLPIR_NWI_Start();//New Word Identification Start
* Func Name : NLPIR_NWI_AddFile
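* Description: Add a text file to the new-word identification system for new-word detection
* Only valid after NLPIR_NWI_Start() has been called
* Parameters : const char *sFilename: file name
* Returns : documented as true:success, false:fail, but the declared return type is int — TODO confirm the actual values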
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
NLPIR_API int NLPIR_NWI_AddFile(const char *sFilename);
* Func Name : NLPIR_NWI_AddMem
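* Description: Add an in-memory block of text to the new-word identification system for new-word detection
* Only valid after NLPIR_NWI_Start() has been called
* Parameters : const char *sText: the text to add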
* Returns : bool, true:success, false:fail
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
NLPIR_API bool NLPIR_NWI_AddMem(const char *sText);
* Func Name : NLPIR_NWI_Complete
* Description: Finish adding content for new-word identification
* Only valid after NLPIR_NWI_Start() has been called
* Parameters : None
* Returns : bool, true:success, false:fail
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
NLPIR_API bool NLPIR_NWI_Complete();//New Word Identification: input complete
* Func Name : NLPIR_NWI_GetResult
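* Description: Get the results of new-word identification
* Only valid after NLPIR_NWI_Complete() has been called
* Parameters : bWeightOut: whether to output the weight of each new word
* Returns : output in the format
* [new word 1] [weight 1] [new word 2] [weight 2] ...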
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
NLPIR_API const char * NLPIR_NWI_GetResult(bool bWeightOut=false);//Output the new-word identification result
* Func Name : NLPIR_NWI_Result2UserDict
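* Description: Import the new-word identification results into the user dictionary
* Only valid after NLPIR_NWI_Complete() has been called
* To save the new words permanently, it is recommended to call NLPIR_SaveTheUsrDic afterwards
* Parameters : None
* Returns : the number of new words imported (unsigned int)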
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
NLPIR_API unsigned int NLPIR_NWI_Result2UserDict();//Convert the new-word identification results into the user dictionary; returns the number of new words
#endif//#define __NLPIR_ICTCLAS_2014_H_INCLUDED__
%module NLPIR
#include "NLPIR.h"
/* Constants repeated from NLPIR.h for this interface file. */
#define POS_MAP_NUMBER 4
#define POS_SIZE 40
#define GBK_CODE 0
/* Derived encoding codes are parenthesised so they keep their value inside larger expressions. */
#define UTF8_CODE (GBK_CODE+1)
#define BIG5_CODE (GBK_CODE+2)
/*
 * Function declarations for the NLPIR module. Each one repeats the signature of the
 * NLPIR_API declaration with the same name in NLPIR.h, without the NLPIR_API prefix.
 * NOTE(review): result_t and CNLPIR are used below but are not declared in the visible part of
 * this interface file; SWIG would presumably treat them as opaque pointer types — confirm that is intended.
 */
int NLPIR_Init(const char * sDataPath=0,int encode=GBK_CODE,const char*sLicenceCode=0);
bool NLPIR_Exit();
const char * NLPIR_ParagraphProcess(const char *sParagraph,int bPOStagged=1);
const result_t * NLPIR_ParagraphProcessA(const char *sParagraph,int *pResultCount,bool bUserDict=true);
int NLPIR_GetParagraphProcessAWordCount(const char *sParagraph);
void NLPIR_ParagraphProcessAW(int nCount,result_t * result);
double NLPIR_FileProcess(const char *sSourceFilename,const char *sResultFilename,int bPOStagged=1);
unsigned int NLPIR_ImportUserDict(const char *sFilename);
int NLPIR_AddUserWord(const char *sWord);
int NLPIR_SaveTheUsrDic();
int NLPIR_DelUsrWord(const char *sWord);
double NLPIR_GetUniProb(const char *sWord);
bool NLPIR_IsWord(const char *sWord);
const char * NLPIR_GetKeyWords(const char *sLine,int nMaxKeyLimit=50,bool bWeightOut=false);
const char * NLPIR_GetFileKeyWords(const char *sFilename,int nMaxKeyLimit=50,bool bWeightOut=false);
const char * NLPIR_GetNewWords(const char *sLine,int nMaxKeyLimit=50,bool bWeightOut=false);
const char * NLPIR_GetFileNewWords(const char *sFilename,int nMaxKeyLimit=50,bool bWeightOut=false);
unsigned long NLPIR_FingerPrint(const char *sLine);
int NLPIR_SetPOSmap(int nPOSmap);
CNLPIR* GetActiveInstance();
bool NLPIR_NWI_Start();
int NLPIR_NWI_AddFile(const char *sFilename);
bool NLPIR_NWI_AddMem(const char *sText);
bool NLPIR_NWI_Complete();
const char * NLPIR_NWI_GetResult(bool bWeightOut=false);
unsigned int NLPIR_NWI_Result2UserDict();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment