Skip to content

Instantly share code, notes, and snippets.

@yssymmt
Created September 19, 2022 16:41
Show Gist options
  • Save yssymmt/9139c0452e41b3f486ad9b1b2dec65f1 to your computer and use it in GitHub Desktop.
Save yssymmt/9139c0452e41b3f486ad9b1b2dec65f1 to your computer and use it in GitHub Desktop.
/*CLOBでのテーブル*/
drop table jumbo.aud04_mecab;
create multiset table jumbo.aud04_mecab (
docid integer,
cat varchar(10) character set unicode,
docdesc clob character set unicode
) primary index (docid)
;
/*最大文字数の確認: 32000以上であれば分ける必要*/
select
max(char_length(docdesc)) as 最大文字数
from jumbo.aud04_mecab
;
/*いったん30000文字単位にしたテーブルを持つ*/
drop table jumbo.aud04_mecab_temp;
create table jumbo.aud04_mecab_temp (
superdocid integer,
docdesc varchar(30000) character set unicode
) primary index(superdocid)
;
insert into jumbo.aud04_mecab_temp
select
docid*10 + subdocid as superdocid,
docdesc
from (
/*3万文字単位で縦に分割、仮に8万文字なら3つをUnion*/
select
docid,
cast(1 as integer) as subdocid,
cast(substring(docdesc from 1 for 30000) as varchar(30000)) as docdesc
from jumbo.aud04_mecab
union
select
docid,
cast(2 as integer) as subdocid,
cast(substring(docdesc from 30001 for 30000) as varchar(30000)) as docdesc
from jumbo.aud04_mecab
) a0
where docdesc <>''
;
/*縦持ちと横持ちの分解*/
drop table jumbo.aud06_mecab_vertical;
create table jumbo.aud06_mecab_vertical (
docid integer,
seqno integer,
word varchar(10) character set unicode,
pos varchar(10) character set unicode,
norm varchar(10) character set unicode
) primary index(docid)
;
insert into jumbo.aud06_mecab_vertical
with src as (
/*改行コードで別行にする*/
select
outkey as superdocid,
token_nbr as seqno,
result_string
from table (
regexp_split_to_table
(jumbo.aud04_mecab_temp.superdocid, jumbo.aud04_mecab_temp.docdesc, '\n','c')
returns (outkey integer, token_nbr integer, result_string varchar(30000) character set unicode)
) a1
)
, joint as (
/* select max(char_length(result_string)) as aaa from src */
select
cast(cast(docid as float)/10 as integer) as docid,
cast(row_number() over(partition by docid order by rnthruout) as integer) as seqno,
case when subdocid>1 and seqno=1 then cast(aaa||result_string as varchar(300)) else cast(result_string as varchar(300)) end as result_string
from (
select
docid,
subdocid,
seqno,
rndesc,
rnthruout,
maxsubdocid,
lag(result_string,1,null) over(partition by docid order by rnthruout) as aaa,
result_string
from (
/*結合用にもろもろの順序を決定*/
select
trunc(superdocid, -1) as docid,
mod(superdocid, 10) as subdocid,
seqno,
row_number() over(partition by docid, subdocid order by seqno desc) as rndesc,
row_number() over(partition by docid order by subdocid, seqno) as rnthruout,
max(subdocid) over(partition by docid) as maxsubdocid,
cast(result_string as varchar(150)) as result_string
from src
) a2
) a3
where not (maxsubdocid>subdocid and rndesc=1)
)
/*タブとカンマで列分解する*/
select
docid,
seqno,
strtok(result_string, ' ',1) as word,
strtok((strtok(result_string, ' ', 2)), ',', 1) as pos,
/*strtok(result_string, ',', 3) as n3, */
/*strtok(result_string, ',', 4) as n4, */
/*strtok(result_string, ',', 5) as n5, */
/*strtok(result_string, ',', 6) as n6, */
strtok(result_string, ',', 7) as norm
/*strtok(result_string, ',', 8) as n8, */
/*strtok(result_string, ',', 9) as n9, */
from joint
where result_string<>'EOS'
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment