Created
December 20, 2017 16:12
-
-
Save statgeek/eb8228cdf07cb3f3ee5f1494e4c3b10f to your computer and use it in GitHub Desktop.
SAS - take a sentence, split it into individual words, and find all two-word combinations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Create sample data: one free-text sentence per observation.          */
/* Output: WORK.RANDOM_SENTENCES with a single variable, sentence $256. */
data random_sentences;
    /* TRUNCOVER lets the $256. informat read lines shorter than 256 */
    /* characters without flowing over into the next data line.      */
    infile cards truncover;
    informat sentence $256.;
    input sentence $256.;
    /* Data lines start in column 1 and hold only the sentence text. */
    cards;
This is a random sentence
This is another random sentence
Happy Birthday
My job sucks.
This is a good idea, not.
This is an awesome idea!
How are you today?
Does this make sense?
Have a great day!
;
/* Partition each sentence into words: one output row per word.        */
/* Input : RANDOM_SENTENCES (variable sentence).                        */
/* Output: F1 with sentence, id, nwords, nchar, word_order, word.       */
data f1;
    set random_sentences;

    /* Explicit delimiter list shared by COUNTW and SCAN so both agree. */
    /* It is the ASCII default list extended with the question mark,    */
    /* colon and double quote, so trailing punctuation such as a        */
    /* question mark is not kept as part of the word.                   */
    retain _delims ' !$%&()*+,-./;<^|?:"';
    drop _delims;

    /* Observation number identifies the sentence each word came from. */
    id = _n_;
    nwords = countw(sentence, _delims);
    /* Character count of the sentence after COMPRESS removes blanks. */
    nchar = length(compress(sentence));

    /* Emit one row per word; word_order is the 1-based word position. */
    do word_order = 1 to nwords;
        word = scan(sentence, word_order, _delims);
        output;
    end;
run;
/* Build every two-word combination within each sentence.                 */
/* Input : F1 (one row per word, with id and word_order).                 */
/* Output: WORDS2 with sentence, word1, word2 (both lower-cased).         */
/* word1 is the word that appears later in the sentence and word2 the     */
/* earlier one, so each unordered pair of positions is produced once.     */
proc sql;
    create table words2 as
    select t1.sentence,
           lowcase(t1.word) as word1,
           lowcase(t2.word) as word2
    from f1 as t1
         inner join f1 as t2
         /* Match on the per-observation id rather than the sentence   */
         /* text, so identical sentences in different observations do  */
         /* not pair their words with each other.                      */
         on t1.id = t2.id
    where t1.word_order > t2.word_order
    /* t2.word_order is only a tie-breaker to make row order stable. */
    order by t1.sentence, t1.word_order, t2.word_order;
quit;
/* Count how often each (word1, word2) pair occurs across all sentences. */
/* Input : WORDS2.                                                        */
/* Output: WANT, the OUT= dataset of the word1*word2 table request.       */
/* NOPRINT suppresses the displayed tables; only WANT is produced.        */
/* NOTE(review): ORDER=FREQ presumably orders each variable by its own    */
/* frequency rather than by the pair count - confirm, and sort WANT by    */
/* descending COUNT if a ranked list of pairs is needed.                  */
proc freq data=words2 noprint order=freq;
    tables word1*word2 / list out=want;
run;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment