Created
July 25, 2014 08:21
-
-
Save xtender/129c0b9c1998841d2896 to your computer and use it in GitHub Desktop.
some string splitting techniques
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- 1. xmltable-string-tokenize: | |
with v as ( | |
select 1 id, 'Hello Brendan How are you today?' str from dual | |
union all | |
select 2 id, 'It is a hot and sunny day in Ireland.' from dual | |
union all | |
select 3 id, '$$$It is a hot and sunny day in #Ireland.' from dual | |
union all | |
select 4 id, '#####' from dual | |
) | |
select * | |
from v | |
,xmltable('ora:tokenize(.,$delim)[.]' | |
passing v.str, '\W+' as "delim" | |
columns | |
n for ordinality | |
,token varchar2(4000) path '.' | |
)(+)-- remove "(+)" - if you don't want to return strings without tokens. | |
/ | |
-- 2. table/lateral + pivot_query(connect by/model, etc.) | |
with v as ( | |
select 1 id, 'Hello Brendan How are you today?' str from dual | |
union all | |
select 2 id, 'It is a hot and sunny day in Ireland.' from dual | |
union all | |
select 3 id, '$$$It is a hot and sunny day in #Ireland.' from dual | |
union all | |
select 4 id, '#####' from dual | |
) | |
select v.id,v.str | |
,v_lateral.n | |
,regexp_substr(v.str,'\w+',1,v_lateral.n) token | |
from v | |
left join (select level n from dual connect by level<=2000) v_lateral | |
on v_lateral.n <= regexp_count(v.str,'\w+') | |
order by id,n | |
/ | |
-- 3.1 model: | |
with v as ( | |
select 1 id, 'Hello Brendan How are you today?' str from dual | |
union all | |
select 2 id, 'It is a hot and sunny day in Ireland.' from dual | |
union all | |
select 3 id, '$$$It is a hot and sunny day in #Ireland.' from dual | |
union all | |
select 4 id, '#####' from dual | |
) | |
select id,str,n,token | |
from v | |
model | |
partition by (id) | |
dimension by (1 n) | |
measures (str, cast(null as varchar2(4000)) token, nvl(regexp_count(str,'\w+'),0) tokens ) | |
rules | |
( | |
token[for n from 1 to tokens[1] increment 1] = regexp_substr(str[1],'\w+',1,cv(n)) | |
,str [for n from 1 to tokens[1] increment 1] = str[1] | |
) | |
order by id,n | |
/ | |
-- 3.2 another model: | |
with v as ( | |
select 1 id, 'Hello Brendan How are you today?' str from dual | |
union all | |
select 2 id, 'It is a hot and sunny day in Ireland.' from dual | |
union all | |
select 3 id, '$$$It is a hot and sunny day in #Ireland.' from dual | |
union all | |
select 4 id, '#####' from dual | |
) | |
select id,str,n,token | |
from v | |
model | |
partition by (id) | |
dimension by (1 n) | |
measures (str, cast(null as varchar2(4000)) token, nvl(regexp_count(str,'\w+'),0) tokens) | |
rules iterate(2000) until (iteration_number+1 >= tokens[1]) | |
( | |
token[iteration_number+1] = regexp_substr(str[1],'\w+',1,cv(n)) | |
,str [iteration_number+1] = str[1] | |
) | |
order by id,n | |
/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment