Skip to content

Instantly share code, notes, and snippets.

@timvw
Created August 14, 2011 04:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timvw/1144594 to your computer and use it in GitHub Desktop.
Save timvw/1144594 to your computer and use it in GitHub Desktop.
WITH
[Nums1] AS (SELECT 1 AS [Value] UNION SELECT 2 AS [Value])
, [Nums2] AS (SELECT A.* FROM [Nums1] AS A, [Nums1] AS B, [Nums1] AS C)
, [Nums3] AS (SELECT A.* FROM [Nums2] AS A, [Nums2] AS B, [Nums2] AS C)
, [Nums4] AS (SELECT A.* FROM [Nums3] AS A, [Nums3] AS B)
, [Nums] AS (SELECT TOP(1000) ROW_NUMBER() OVER(ORDER BY (SELECT 0)) AS [n] FROM[Nums4])
, [Data] AS (SELECT 1 AS [id], N'The upcoming release of SQL Server is codenamed Denali. It is also called SQL11 which refers to version 11 and peple misunderstand it to be SQL Server 2011.' AS [comment] UNION SELECT 2 AS [id], N'SQL Server Denali CTP 3 introduced a number of TSQL enhancements.' AS [comment])
, [Sentences] AS (
SELECT [id]
, [comment]
, LTRIM(SUBSTRING([comment], MIN([n]), MAX([n]) - MIN([n]))) AS [data]
, ROW_NUMBER() OVER(PARTITION BY [id], [comment] ORDER BY MIN([n])) AS [sentence]
FROM [Data]
CROSS JOIN [Nums]
WHERE [n] <= LEN([comment])
GROUP BY [id], [comment], CHARINDEX('.', [comment], [n])
)
, [Words] AS (
SELECT [id]
, [comment]
, [sentence]
, [data]
, MIN([n]) AS [begin]
, MAX([n]) AS [end]
, SUBSTRING([data]
, MIN([n]), MAX([n]) - MIN([n])) AS [part]
, ROW_NUMBER() OVER(PARTITION BY [id], [comment], [sentence], [data] ORDER BY MIN([n])) AS [WordNumber]
FROM [Sentences]
CROSS JOIN [Nums]
WHERE [n] <= LEN([data]) + 1
GROUP BY [id], [comment], [sentence], [data], CHARINDEX(' ', [data], [n])
)
, [WordGroups] AS (
SELECT [id]
, [comment]
, [sentence]
, [data]
, MIN([begin]) AS [begin]
, MAX([end]) AS [end]
, SUBSTRING([data], MIN([begin]), MAX([end]) - MIN([begin])) AS [text]
FROM [Words]
GROUP BY [id], [comment], [sentence], [data], ([WordNumber] - 1) / 5
)
, [NumberedWordGroups] AS (
SELECT [id]
, [sentence]
, RANK() OVER(PARTITION BY [id], [sentence], [data] ORDER BY [begin]) AS [group]
, [text]
FROM [WordGroups]
)
SELECT *
FROM [NumberedWordGroups]
ORDER BY [id], [sentence];
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment