Skip to content

Instantly share code, notes, and snippets.

@LinZap
Last active February 6, 2017 07:20
Show Gist options
  • Save LinZap/c2acb94c601cc1e7cb91c6173a4eeb44 to your computer and use it in GitHub Desktop.
Save LinZap/c2acb94c601cc1e7cb91c6173a4eeb44 to your computer and use it in GitHub Desktop.
Search Engine

Search Engine

for I3S Schema Search Engine

Table

Token  

字段表

create table Token
(
	words nvarchar(100) not null,
	tid int identity(1,1) not null,
	interfreq int not null default(0),
	intrafreq int not null default(1),
	constraint PK_Token primary key clustered (words),
	constraint UQ_Token_TID unique(tid)
)

InvIndex  

反索引表

create table InvIndex
(
	tid int not null,
	oid int not null,
	[weight] int not null default(1),
	[field] binary(8),
	constraint PK_InvIndex primary key clustered (tid,oid),
	constraint FK_InvIndex_TID foreign key(tid) references Token(tid)
)

RawIndex  

正索引表  

create table RawIndex
(
	oid int not null,
	tid int not null,
	constraint PK_RawIndex primary key clustered (oid,tid),
	constraint FK_RawIndex_TID foreign key(tid) references Token(tid)
)

Trigger  

Trg_Index_insert    

新增到反索引表的觸發,會自動新增到正索引表,並自動計算 weightinterfreq

create trigger Trg_Index_insert on InvIndex INSTEAD OF INSERT 
as 
	declare @oid int
	declare @tid int
	declare @field int

	declare index_cursor cursor for select oid,tid,field from inserted
	open index_cursor fetch next from index_cursor into @oid,@tid,@field
	while @@FETCH_STATUS=0
	begin
		if exists(select * from InvIndex where tid=@tid and oid=@oid)
			update InvIndex set [weight]=[weight]+1,field=field|@field
			where tid = @tid and oid = @oid
		else 
		begin
			insert into InvIndex values(@tid,@oid,1,@field)
			insert into RawIndex values(@oid,@tid)
			update Token set interfreq=interfreq+1 where tid = @tid
		end
		fetch next from index_cursor into @oid,@tid,@field
	end
	close index_cursor
	deallocate index_cursor

Function

fn_toInt  

將二進位表示的字串轉成數字,方便 field 使用

create function fn_toInt (@binary nvarchar(31)) returns int
as 
begin
	declare @res int = 0
	declare @i int=len(@binary)
	declare @pow int = 0
	declare @c char(1)
	while(@i>0)
	begin
		set @c = substring(@binary,@i,1)
		if @c = '1'
			set @res += power(2, @pow)	
		set @pow += 1
		set @i -= 1
	end
	return(@res)
end

fn_fulltoken  

進行 sliding window 並以 table 的形式回傳,這裡的 ngram 寫死是 5

create function fn_fulltoken(@text nvarchar(4000))
returns @token table(
	words nvarchar(4000)
)
as 
begin 	
	declare @i int = 1
	declare @len int = len(@text)
	declare @gram int
	declare @gram int = 5	
	while(@gram>0)
	begin
		set @i = 1
		while((@i+@gram-1)<=@len)
		begin 
			insert into @token values(SUBSTRING(@text,@i,@gram))
			set @i=@i+1
		end
		set @gram = @gram-1
	end
	return;
end

Stored Procedure

xp_insertToken  

將字段寫入 Token 表中,並自動建立其索引 InvIndex  

create procedure xp_insertToken
@words nvarchar(4000),
@oid int,
@field nvarchar(31)
as 
begin
	declare @tid int
	declare @tids table(tid int)
	select @tid=tid from token where words=@words
	if @tid is not null
		update token set intrafreq = intrafreq+1 where words=@words
	else 
	begin
		insert into token(words) output inserted.tid into @tids values(@words)
		select @tid=tid from @tids
	end
	insert into InvIndex(tid,oid,field) values(@tid,@oid,dbo.fn_toInt(@field))
end

xp_insertFullToken

將字段先進行 sliding window (fn_fulltoken),再藉由 xp_insertToken 寫入 Token

create procedure xp_insertFullToken
@segment nvarchar(4000),
@oid int,
@field nvarchar(31)
as
begin
	declare @tid int
	declare @words nvarchar(4000)
	declare token_cursor cursor for select words from dbo.fn_fulltoken(@segment)
	open token_cursor fetch next from token_cursor into @words 
	
	while @@FETCH_STATUS = 0  
	begin
		exec dbo.xp_insertToken @words,@oid,@field
		fetch next from token_cursor into @words  
	end
	close token_cursor
	deallocate token_cursor
end

xp_tokenize    

進行索引處理,將字串去除特殊符號後,做中、英數分離。   中文部分做 slideing window後逐一進入索引,
英數部分直接進入索引

create procedure xp_tokenize 
@oid int,
@text nvarchar(4000),
@field nvarchar(31)
as 
begin
	declare @len int = len(@text)
	declare @i int = 1
	declare @mode int
	declare @prevmode int
	declare @tmpChar nvarchar(2)
	declare @tmpStr nvarchar(4000)

	while(@i<=@len)
		begin
			set @tmpChar=lower(substring(@text,@i,1))
			set @i+=1
			-- mode:1 英數 
			if @tmpChar like '%[-a-Z0-9_'']%'  set @mode=1
			-- mode:2 漢字 
			else if unicode(@tmpChar) between 19968 and 40917  set @mode=2
			-- mode:0 其他
			else  set @mode=0

			-- 跟上一個字一樣 mode
			if @prevmode = @mode 
				set @tmpStr+=@tmpChar
			else
			-- 跟上一個字不同 mode
			begin
				-- 暫存的是 mode 1 or 2 才是要的
				if @prevmode=1
					exec dbo.xp_insertToken @tmpStr,@oid,@field
				if @prevmode=2
					exec dbo.xp_insertFullToken @tmpStr,@oid,@field		
				set @tmpStr = @tmpChar
				set @prevmode = @mode
			end		
		end

		-- 暫存裡面清出
		if len(@tmpStr)>0
		begin
			if @prevmode=1
				exec dbo.xp_insertToken @tmpStr,@oid,@field
			if @prevmode=2
				exec dbo.xp_insertFullToken @tmpStr,@oid,@field
		end
		return;
end

Usage

xp_tokenize 作為起點

  exec xp_tokenize @oid,@text,@field
  -- e.g. exec xp_tokenize 1,'好棒棒好棒棒我好怕','00001'

如此便能全自動建立索引

@LinZap
Copy link
Author

LinZap commented Feb 6, 2017

Unindexed

xp_unindexed

移除某一個 oid 的索引

create procedure xp_unindexed
@oid int
as
begin
	update Token set intrafreq=intrafreq-d.[weight],interfreq=interfreq-1
	from InvIndex d where d.tid=Token.tid and d.oid=@oid
	delete from InvIndex from RawIndex r where 
	r.tid=InvIndex.tid and r.oid=InvIndex.oid and r.oid=@oid
	delete from RawIndex where oid=@oid
	delete from Token where intrafreq=0
end

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment