Skip to content

Instantly share code, notes, and snippets.

@arvindshmicrosoft
Last active September 20, 2017 10:33
Show Gist options
  • Save arvindshmicrosoft/ae25112b146ce276f375c8908d780fe8 to your computer and use it in GitHub Desktop.
Save arvindshmicrosoft/ae25112b146ce276f375c8908d780fe8 to your computer and use it in GitHub Desktop.
MillionSongDatasetinSQLServer
CREATE DATABASE [MillionSongDataset]
GO
ALTER DATABASE [MillionSongDataset] SET RECOVERY SIMPLE
GO
USE [MillionSongDataset]
GO
CREATE TABLE [dbo].[train_triplets](
[UserId] [varchar](80) NULL,
[SongId] [varchar](36) NULL,
[ListenCount] [bigint] NULL,
INDEX CCI_train_triplets CLUSTERED COLUMNSTORE
)
GO
CREATE TABLE [dbo].[unique_tracks](
[TrackId] [varchar](50) NULL,
[SongId] [varchar](50) NULL,
[ArtistName] [nvarchar](500) NULL,
[SongTitle] [nvarchar](500) NULL,
INDEX CCI_unique_tracks CLUSTERED COLUMNSTORE
)
GO
/*
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017
Citation:
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere.
The Million Song Dataset. In Proceedings of the 12th International Society
for Music Information Retrieval Conference (ISMIR 2011), 2011.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
This sample code is not supported under any Microsoft standard support program or service.
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you.
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits,
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages.
*/
USE MillionSongDataset
GO
-- Reference: https://labrosa.ee.columbia.edu/millionsong/pages/getting-dataset
-- Data file: http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/unique_tracks.txt
-- An interesting thing to note here is the usage of code page 65001, which is UTF-8. The unique_tracks.txt file is in UTF-8 format.
-- See https://support.microsoft.com/en-us/help/3136780/utf-8-encoding-support-for-the-bcp-utility-and-bulk-insert-transact-sql-command-in-sql-server-2014-sp2 for a description of UTF-8 support in SQL
INSERT dbo.unique_tracks
SELECT TrackId, SongId, ArtistName, SongTitle
FROM OPENROWSET(BULK 'C:\MSD\unique_tracks.txt',
FORMATFILE = 'C:\MSD\echonesttracks.xml',
CODEPAGE = '65001') AS RawData
GO
-- Reference: https://labrosa.ee.columbia.edu/millionsong/tasteprofile
-- Data file (unzip manually please!): http://labrosa.ee.columbia.edu/millionsong/sites/default/files/challenge/train_triplets.txt.zip
INSERT dbo.[train_triplets]
SELECT UserId, SongId, ListenCount
FROM OPENROWSET(BULK 'C:\MSD\train_triplets.txt',
FORMATFILE = 'C:\MSD\train_triplets.xml') AS RawData
GO
-- Fix up errors in the Echo Nest data
-- References:
-- https://labrosa.ee.columbia.edu/millionsong/blog/12-1-2-matching-errors-taste-profile-and-msd
-- https://labrosa.ee.columbia.edu/millionsong/blog/12-2-12-fixing-matching-errors
-- Data file:
-- http://labrosa.ee.columbia.edu/millionsong/sites/default/files/tasteprofile/sid_mismatches.txt
UPDATE orig
SET orig.SongTitle = final.ActualSong,
orig.ArtistName = final.ActualArtist
FROM
unique_tracks orig
JOIN (
SELECT Error, SongId, TrackId, ActualArtist, ActualSong, OriginalArtist, OriginalSong
FROM
OPENROWSET(BULK 'C:\MSD\sid_mismatches.txt',
FORMATFILE = 'C:\MSD\sid_mismatches.xml', CODEPAGE = '65001') AS MismatchedSongs
) AS final
ON orig.SongId = final.SongId
WHERE ActualArtist IS NOT NULL
AND ActualSong IS NOT NULL
GO
/*
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017
Citation:
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere.
The Million Song Dataset. In Proceedings of the 12th International Society
for Music Information Retrieval Conference (ISMIR 2011), 2011.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
This sample code is not supported under any Microsoft standard support program or service.
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you.
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits,
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages.
*/
USE MillionSongDataset
GO
-- This (node) table holds the main details of the song
CREATE TABLE UniqueSong
(
SongId VARCHAR(50),
SongTitle VARCHAR(500),
ArtistName NVARCHAR(500)
)
AS Node
GO
-- This (node) table holds the user IDs. We do not have (nor do we need, for this demo) any other details about the user
CREATE TABLE UniqueUser
(
UserId VARCHAR(80)
)
AS node
GO
-- This edge table stores the relationships - see the 'ConvertData' script on how the 'from' (user) and 'to' (song) are specified,
-- and an attribute (ListenCount) on the edge which stores the number of times the user listened to this song.
CREATE TABLE Likes
(
ListenCount BIGINT
)
AS Edge
GO
/*
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017
Citation:
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere.
The Million Song Dataset. In Proceedings of the 12th International Society
for Music Information Retrieval Conference (ISMIR 2011), 2011.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
This sample code is not supported under any Microsoft standard support program or service.
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you.
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits,
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages.
*/
Use MillionSongDataset;
GO
-- This step inserts the songs into the Node table (UniqueSong), which internally generates a unique $node_id pseudo-column per row
INSERT UniqueSong (SongId, SongTitle, ArtistName)
SELECT DISTINCT
SongId,
SongTitle,
ArtistName
FROM unique_tracks
GO
-- This step inserts the users into the Node table (UniqueUser), which internally generates a unique $node_id pseudo-column per row
INSERT UniqueUser (UserId)
SELECT DISTINCT UserId
FROM dbo.train_triplets
GO
-- This step inserts the from and to relationships between users and songs. To do this, it uses the previously generated $node_id values
-- for each user and each song, and then inserts those along with the ListenCount for that relationship into the Likes table
-- Note the use of $from_id and $to_id. These are pseudo-columns which allow the edge table to track relationships correctly.
INSERT Likes
($from_id, $to_id, ListenCount)
SELECT
U.$node_id,
S.$node_id,
T.ListenCount
FROM
dbo.train_triplets T
JOIN UniqueUser U ON U.UserId = T.UserId
JOIN UniqueSong S ON T.SongId = S.SongId
GO
-- let's sample the data now!
-- Note the $node_id column (represented as a JSON string) in the output
SELECT TOP (10) *
FROM UniqueUser;
-- Note the $node_id column (represented as a JSON string) in the output
SELECT TOP (10) *
FROM UniqueSong;
-- Note the $edge_id, $from_id, $to_id pseudo-columns (each represented as a JSON string) in the output
SELECT TOP (10) *
FROM Likes;
/*
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017
Citation:
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere.
The Million Song Dataset. In Proceedings of the 12th International Society
for Music Information Retrieval Conference (ISMIR 2011), 2011.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
This sample code is not supported under any Microsoft standard support program or service.
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you.
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits,
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages.
*/
USE MillionSongDataset
GO
SET STATISTICS io ON
GO
SET STATISTICS time ON
GO
-- Find songs which are similar to 'Just Dance' by Lady Gaga!
-- This query takes around 3 seconds to complete on a laptop with an i7 processor
-- For tuning this query performance, create clustered columnstore indexes! - see query 5_ImprovePerf.sql
SELECT
TOP 10
SimilarSong.SongTitle,
COUNT(*)
FROM
UniqueSong AS MySong,
UniqueUser AS U,
Likes AS LikesOther,
Likes AS LikesThis,
UniqueSong AS SimilarSong
WHERE MySong.SongTitle LIKE 'Just Dance'
AND MATCH(SimilarSong<-(LikesOther)-U-(LikesThis)->MySong)
GROUP BY SimilarSong.SongTitle
ORDER BY COUNT(*) DESC
SET STATISTICS time OFF
GO
SET STATISTICS IO OFF
GO
/*
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017
Citation:
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere.
The Million Song Dataset. In Proceedings of the 12th International Society
for Music Information Retrieval Conference (ISMIR 2011), 2011.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
This sample code is not supported under any Microsoft standard support program or service.
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you.
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits,
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages.
*/
USE MillionSongDataset
GO
CREATE CLUSTERED columnstore INDEX cci_songs
ON UniqueSong;
GO
CREATE CLUSTERED columnstore INDEX cci_users
ON UniqueUser;
GO
CREATE CLUSTERED columnstore INDEX cci_likes
ON Likes;
GO
/*
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017
Citation:
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere.
The Million Song Dataset. In Proceedings of the 12th International Society
for Music Information Retrieval Conference (ISMIR 2011), 2011.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
This sample code is not supported under any Microsoft standard support program or service.
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you.
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits,
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages.
*/
Use MillionSongDataset
GO
-- Here, we plot a 'relationship graph' for users who heard the song 'Kryptonite'.
-- The resultant PNG file shows a graph with users who heard this song, and also plots other songs they listened to.
-- To do this we are using SQL Server R Services and the 'igraph' package in R to render the graph as a PNG file
-- As a pre-requisite, you must have installed the 'igraph' package in R.exe
-- The documentation at https://docs.microsoft.com/en-us/sql/advanced-analytics/r/install-additional-r-packages-on-sql-server explains how to do this
-- Note: for SQL Server 2017, you must modify the library path to use MSSQL14 instead of the MSSQL13 specified in the documentation
exec sp_execute_external_script @language = N'R',
@script = N'
require(igraph)
g <- graph.data.frame(graphdf)
V(g)$label.cex <- 2
png(filename = "c:\\MSD\\plot1.png", height = 6000, width = 6000, res = 100);
plot(g, vertex.label.family = "sans", vertex.size = 5)
dev.off()
',
@input_data_1 = N'select distinct LEFT(UserId, 5) as UserId, LEFT(REPLACE(ArtistName, '' '', ''''), 15) as ArtistName
from
(
select TOP 500 U.UserId, SimilarSong.ArtistName, ROW_NUMBER() OVER(PARTITION BY UserId ORDER BY LikesOther.ListenCount desc) as RowNum
from UniqueSong as MySong,
UniqueUser as U,
Likes as LikesOther,
Likes as LikesThis,
UniqueSong as SimilarSong
where MySong.SongTitle = ''Kryptonite''
and MATCH(SimilarSong<-(LikesOther)-U-(LikesThis)->MySong)
) as InnerTable
where RowNum <= 20
order by UserId',
@input_data_1_name = N'graphdf'
GO
/*
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017
Citation:
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere.
The Million Song Dataset. In Proceedings of the 12th International Society
for Music Information Retrieval Conference (ISMIR 2011), 2011.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
This sample code is not supported under any Microsoft standard support program or service.
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you.
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits,
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages.
*/
<?xml version="1.0"?>
<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<RECORD>
<FIELD ID="1" xsi:type="CharTerm" TERMINATOR="&lt;SEP&gt;" MAX_LENGTH="100"/>
<FIELD ID="2" xsi:type="CharTerm" TERMINATOR="&lt;SEP&gt;" MAX_LENGTH="100"/>
<FIELD ID="3" xsi:type="CharTerm" TERMINATOR="&lt;SEP&gt;" MAX_LENGTH="500"/>
<FIELD ID="4" xsi:type="CharTerm" TERMINATOR="\n" MAX_LENGTH="500"/>
</RECORD>
<ROW>
<COLUMN SOURCE="1" NAME="TrackId" xsi:type="SQLVARYCHAR"/>
<COLUMN SOURCE="2" NAME="SongId" xsi:type="SQLVARYCHAR"/>
<COLUMN SOURCE="3" NAME="ArtistName" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="4" NAME="SongTitle" xsi:type="SQLNVARCHAR"/>
</ROW>
</BCPFORMAT>
<?xml version="1.0"?>
<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<RECORD>
<FIELD ID="1" xsi:type="CharTerm" TERMINATOR=": &lt;" MAX_LENGTH="5"/>
<FIELD ID="2" xsi:type="CharTerm" TERMINATOR=" " MAX_LENGTH="100"/>
<FIELD ID="3" xsi:type="CharTerm" TERMINATOR="&gt; " MAX_LENGTH="100"/>
<FIELD ID="4" xsi:type="CharTerm" TERMINATOR=" - " MAX_LENGTH="500"/>
<FIELD ID="5" xsi:type="CharTerm" TERMINATOR=" != " MAX_LENGTH="500"/>
<FIELD ID="6" xsi:type="CharTerm" TERMINATOR=" - " MAX_LENGTH="500"/>
<FIELD ID="7" xsi:type="CharTerm" TERMINATOR="\n" MAX_LENGTH="500"/>
</RECORD>
<ROW>
<COLUMN SOURCE="1" NAME="Error" xsi:type="SQLVARYCHAR"/>
<COLUMN SOURCE="2" NAME="SongId" xsi:type="SQLVARYCHAR"/>
<COLUMN SOURCE="3" NAME="TrackId" xsi:type="SQLVARYCHAR"/>
<COLUMN SOURCE="4" NAME="ActualArtist" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="5" NAME="ActualSong" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="6" NAME="OriginalArtist" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="7" NAME="OriginalSong" xsi:type="SQLNVARCHAR"/>
</ROW>
</BCPFORMAT>
<?xml version="1.0"?>
<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<RECORD>
<FIELD ID="1" xsi:type="CharTerm" TERMINATOR="\t" MAX_LENGTH="100"/>
<FIELD ID="2" xsi:type="CharTerm" TERMINATOR="\t" MAX_LENGTH="100"/>
<FIELD ID="3" xsi:type="CharTerm" TERMINATOR="\n" MAX_LENGTH="100"/>
</RECORD>
<ROW>
<COLUMN SOURCE="1" NAME="UserId" xsi:type="SQLVARYCHAR"/>
<COLUMN SOURCE="2" NAME="SongId" xsi:type="SQLVARYCHAR"/>
<COLUMN SOURCE="3" NAME="ListenCount" xsi:type="SQLBIGINT"/>
</ROW>
</BCPFORMAT>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment