Created
September 19, 2022 16:41
-
-
Save yssymmt/9139c0452e41b3f486ad9b1b2dec65f1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*CLOBでのテーブル*/ | |
drop table jumbo.aud04_mecab; | |
create multiset table jumbo.aud04_mecab ( | |
docid integer, | |
cat varchar(10) character set unicode, | |
docdesc clob character set unicode | |
) primary index (docid) | |
; | |
/*最大文字数の確認: 32000以上であれば分ける必要*/ | |
select | |
max(char_length(docdesc)) as 最大文字数 | |
from jumbo.aud04_mecab | |
; | |
/*いったん30000文字単位にしたテーブルを持つ*/ | |
drop table jumbo.aud04_mecab_temp; | |
create table jumbo.aud04_mecab_temp ( | |
superdocid integer, | |
docdesc varchar(30000) character set unicode | |
) primary index(superdocid) | |
; | |
insert into jumbo.aud04_mecab_temp | |
select | |
docid*10 + subdocid as superdocid, | |
docdesc | |
from ( | |
/*3万文字単位で縦に分割、仮に8万文字なら3つをUnion*/ | |
select | |
docid, | |
cast(1 as integer) as subdocid, | |
cast(substring(docdesc from 1 for 30000) as varchar(30000)) as docdesc | |
from jumbo.aud04_mecab | |
union | |
select | |
docid, | |
cast(2 as integer) as subdocid, | |
cast(substring(docdesc from 30001 for 30000) as varchar(30000)) as docdesc | |
from jumbo.aud04_mecab | |
) a0 | |
where docdesc <>'' | |
; | |
/*縦持ちと横持ちの分解*/ | |
drop table jumbo.aud06_mecab_vertical; | |
create table jumbo.aud06_mecab_vertical ( | |
docid integer, | |
seqno integer, | |
word varchar(10) character set unicode, | |
pos varchar(10) character set unicode, | |
norm varchar(10) character set unicode | |
) primary index(docid) | |
; | |
insert into jumbo.aud06_mecab_vertical | |
with src as ( | |
/*改行コードで別行にする*/ | |
select | |
outkey as superdocid, | |
token_nbr as seqno, | |
result_string | |
from table ( | |
regexp_split_to_table | |
(jumbo.aud04_mecab_temp.superdocid, jumbo.aud04_mecab_temp.docdesc, '\n','c') | |
returns (outkey integer, token_nbr integer, result_string varchar(30000) character set unicode) | |
) a1 | |
) | |
, joint as ( | |
/* select max(char_length(result_string)) as aaa from src */ | |
select | |
cast(cast(docid as float)/10 as integer) as docid, | |
cast(row_number() over(partition by docid order by rnthruout) as integer) as seqno, | |
case when subdocid>1 and seqno=1 then cast(aaa||result_string as varchar(300)) else cast(result_string as varchar(300)) end as result_string | |
from ( | |
select | |
docid, | |
subdocid, | |
seqno, | |
rndesc, | |
rnthruout, | |
maxsubdocid, | |
lag(result_string,1,null) over(partition by docid order by rnthruout) as aaa, | |
result_string | |
from ( | |
/*結合用にもろもろの順序を決定*/ | |
select | |
trunc(superdocid, -1) as docid, | |
mod(superdocid, 10) as subdocid, | |
seqno, | |
row_number() over(partition by docid, subdocid order by seqno desc) as rndesc, | |
row_number() over(partition by docid order by subdocid, seqno) as rnthruout, | |
max(subdocid) over(partition by docid) as maxsubdocid, | |
cast(result_string as varchar(150)) as result_string | |
from src | |
) a2 | |
) a3 | |
where not (maxsubdocid>subdocid and rndesc=1) | |
) | |
/*タブとカンマで列分解する*/ | |
select | |
docid, | |
seqno, | |
strtok(result_string, ' ',1) as word, | |
strtok((strtok(result_string, ' ', 2)), ',', 1) as pos, | |
/*strtok(result_string, ',', 3) as n3, */ | |
/*strtok(result_string, ',', 4) as n4, */ | |
/*strtok(result_string, ',', 5) as n5, */ | |
/*strtok(result_string, ',', 6) as n6, */ | |
strtok(result_string, ',', 7) as norm | |
/*strtok(result_string, ',', 8) as n8, */ | |
/*strtok(result_string, ',', 9) as n9, */ | |
from joint | |
where result_string<>'EOS' | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment