Last active
September 20, 2017 10:33
-
-
Save arvindshmicrosoft/ae25112b146ce276f375c8908d780fe8 to your computer and use it in GitHub Desktop.
MillionSongDatasetinSQLServer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE DATABASE [MillionSongDataset] | |
GO | |
ALTER DATABASE [MillionSongDataset] SET RECOVERY SIMPLE | |
GO | |
USE [MillionSongDataset] | |
GO | |
CREATE TABLE [dbo].[train_triplets]( | |
[UserId] [varchar](80) NULL, | |
[SongId] [varchar](36) NULL, | |
[ListenCount] [bigint] NULL, | |
INDEX CCI_train_triplets CLUSTERED COLUMNSTORE | |
) | |
GO | |
CREATE TABLE [dbo].[unique_tracks]( | |
[TrackId] [varchar](50) NULL, | |
[SongId] [varchar](50) NULL, | |
[ArtistName] [nvarchar](500) NULL, | |
[SongTitle] [nvarchar](500) NULL, | |
INDEX CCI_unique_tracks CLUSTERED COLUMNSTORE | |
) | |
GO | |
/* | |
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017 | |
Citation: | |
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere. | |
The Million Song Dataset. In Proceedings of the 12th International Society | |
for Music Information Retrieval Conference (ISMIR 2011), 2011. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
This sample code is not supported under any Microsoft standard support program or service. | |
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you. | |
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts | |
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits, | |
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability | |
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages. | |
*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
USE MillionSongDataset | |
GO | |
-- Reference: https://labrosa.ee.columbia.edu/millionsong/pages/getting-dataset | |
-- Data file: http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/unique_tracks.txt | |
-- An interesting thing to note here is the usage of code page 65001, which is UTF-8. The unique_tracks.txt file is in UTF-8 format. | |
-- See https://support.microsoft.com/en-us/help/3136780/utf-8-encoding-support-for-the-bcp-utility-and-bulk-insert-transact-sql-command-in-sql-server-2014-sp2 for a description of UTF-8 support in SQL | |
INSERT dbo.unique_tracks | |
SELECT TrackId, SongId, ArtistName, SongTitle | |
FROM OPENROWSET(BULK 'C:\MSD\unique_tracks.txt', | |
FORMATFILE = 'C:\MSD\echonesttracks.xml', | |
CODEPAGE = '65001') AS RawData | |
GO | |
-- Reference: https://labrosa.ee.columbia.edu/millionsong/tasteprofile | |
-- Data file (unzip manually please!): http://labrosa.ee.columbia.edu/millionsong/sites/default/files/challenge/train_triplets.txt.zip | |
INSERT dbo.[train_triplets] | |
SELECT UserId, SongId, ListenCount | |
FROM OPENROWSET(BULK 'C:\MSD\train_triplets.txt', | |
FORMATFILE = 'C:\MSD\train_triplets.xml') AS RawData | |
GO | |
-- Fix up errors in the Echo Nest data | |
-- References: | |
-- https://labrosa.ee.columbia.edu/millionsong/blog/12-1-2-matching-errors-taste-profile-and-msd | |
-- https://labrosa.ee.columbia.edu/millionsong/blog/12-2-12-fixing-matching-errors | |
-- Data file: | |
-- http://labrosa.ee.columbia.edu/millionsong/sites/default/files/tasteprofile/sid_mismatches.txt | |
UPDATE orig | |
SET orig.SongTitle = final.ActualSong, | |
orig.ArtistName = final.ActualArtist | |
FROM | |
unique_tracks orig | |
JOIN ( | |
SELECT Error, SongId, TrackId, ActualArtist, ActualSong, OriginalArtist, OriginalSong | |
FROM | |
OPENROWSET(BULK 'C:\MSD\sid_mismatches.txt', | |
FORMATFILE = 'C:\MSD\sid_mismatches.xml', CODEPAGE = '65001') AS MismatchedSongs | |
) AS final | |
ON orig.SongId = final.SongId | |
WHERE ActualArtist IS NOT NULL | |
AND ActualSong IS NOT NULL | |
GO | |
/* | |
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017 | |
Citation: | |
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere. | |
The Million Song Dataset. In Proceedings of the 12th International Society | |
for Music Information Retrieval Conference (ISMIR 2011), 2011. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
This sample code is not supported under any Microsoft standard support program or service. | |
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you. | |
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts | |
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits, | |
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability | |
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages. | |
*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
USE MillionSongDataset | |
GO | |
-- This (node) table holds the main details of the song | |
CREATE TABLE UniqueSong | |
( | |
SongId VARCHAR(50), | |
SongTitle VARCHAR(500), | |
ArtistName NVARCHAR(500) | |
) | |
AS Node | |
GO | |
-- This (node) table holds the user IDs. We do not have (nor do we need, for this demo) any other details about the user | |
CREATE TABLE UniqueUser | |
( | |
UserId VARCHAR(80) | |
) | |
AS node | |
GO | |
-- This edge table stores the relationships - see the 'ConvertData' script on how the 'from' (user) and 'to' (song) are specified, | |
-- and an attribute (ListenCount) on the edge which stores the number of times the user listened to this song. | |
CREATE TABLE Likes | |
( | |
ListenCount BIGINT | |
) | |
AS Edge | |
GO | |
/* | |
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017 | |
Citation: | |
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere. | |
The Million Song Dataset. In Proceedings of the 12th International Society | |
for Music Information Retrieval Conference (ISMIR 2011), 2011. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
This sample code is not supported under any Microsoft standard support program or service. | |
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you. | |
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts | |
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits, | |
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability | |
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages. | |
*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Use MillionSongDataset; | |
GO | |
-- This step inserts the songs into the Node table (UniqueSong), which internally generates a unique $node_id pseudo-column per row | |
INSERT UniqueSong (SongId, SongTitle, ArtistName) | |
SELECT DISTINCT | |
SongId, | |
SongTitle, | |
ArtistName | |
FROM unique_tracks | |
GO | |
-- This step inserts the users into the Node table (UniqueUser), which internally generates a unique $node_id pseudo-column per row | |
INSERT UniqueUser (UserId) | |
SELECT DISTINCT UserId | |
FROM dbo.train_triplets | |
GO | |
-- This step inserts the from and to relationships between users and songs. To do this, it uses the previously generated $node_id values | |
-- for each user and each song, and then inserts those along with the ListenCount for that relationship into the Likes table | |
-- Note the use of $from_id and $to_id. These are pseudo-columns which allow the edge table to track relationships correctly. | |
INSERT Likes | |
($from_id, $to_id, ListenCount) | |
SELECT | |
U.$node_id, | |
S.$node_id, | |
T.ListenCount | |
FROM | |
dbo.train_triplets T | |
JOIN UniqueUser U ON U.UserId = T.UserId | |
JOIN UniqueSong S ON T.SongId = S.SongId | |
GO | |
-- let's sample the data now! | |
-- Note the $node_id column (represented as a JSON string) in the output | |
SELECT TOP (10) * | |
FROM UniqueUser; | |
-- Note the $node_id column (represented as a JSON string) in the output | |
SELECT TOP (10) * | |
FROM UniqueSong; | |
-- Note the $edge_id, $from_id, $to_id pseudo-columns (each represented as a JSON string) in the output | |
SELECT TOP (10) * | |
FROM Likes; | |
/* | |
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017 | |
Citation: | |
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere. | |
The Million Song Dataset. In Proceedings of the 12th International Society | |
for Music Information Retrieval Conference (ISMIR 2011), 2011. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
This sample code is not supported under any Microsoft standard support program or service. | |
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you. | |
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts | |
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits, | |
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability | |
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages. | |
*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
USE MillionSongDataset | |
GO | |
CREATE CLUSTERED columnstore INDEX cci_songs | |
ON UniqueSong; | |
GO | |
CREATE CLUSTERED columnstore INDEX cci_users | |
ON UniqueUser; | |
GO | |
CREATE CLUSTERED columnstore INDEX cci_likes | |
ON Likes; | |
GO | |
/* | |
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017 | |
Citation: | |
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere. | |
The Million Song Dataset. In Proceedings of the 12th International Society | |
for Music Information Retrieval Conference (ISMIR 2011), 2011. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
This sample code is not supported under any Microsoft standard support program or service. | |
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you. | |
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts | |
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits, | |
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability | |
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages. | |
*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Use MillionSongDataset | |
GO | |
-- Here, we plot a 'relationship graph' for users who heard the song 'Kryptonite'. | |
-- The resultant PNG file shows a graph with users who heard this song, and also plots other songs they listened to. | |
-- To do this we are using SQL Server R Services and the 'igraph' package in R to render the graph as a PNG file | |
-- As a pre-requisite, you must have installed the 'igraph' package in R.exe | |
-- The documentation at https://docs.microsoft.com/en-us/sql/advanced-analytics/r/install-additional-r-packages-on-sql-server explains how to do this | |
-- Note: for SQL Server 2017, you must modify the library path to use MSSQL14 instead of the MSSQL13 specified in the documentation | |
exec sp_execute_external_script @language = N'R', | |
@script = N' | |
require(igraph) | |
g <- graph.data.frame(graphdf) | |
V(g)$label.cex <- 2 | |
png(filename = "c:\\MSD\\plot1.png", height = 6000, width = 6000, res = 100); | |
plot(g, vertex.label.family = "sans", vertex.size = 5) | |
dev.off() | |
', | |
@input_data_1 = N'select distinct LEFT(UserId, 5) as UserId, LEFT(REPLACE(ArtistName, '' '', ''''), 15) as ArtistName | |
from | |
( | |
select TOP 500 U.UserId, SimilarSong.ArtistName, ROW_NUMBER() OVER(PARTITION BY UserId ORDER BY LikesOther.ListenCount desc) as RowNum | |
from UniqueSong as MySong, | |
UniqueUser as U, | |
Likes as LikesOther, | |
Likes as LikesThis, | |
UniqueSong as SimilarSong | |
where MySong.SongTitle = ''Kryptonite'' | |
and MATCH(SimilarSong<-(LikesOther)-U-(LikesThis)->MySong) | |
) as InnerTable | |
where RowNum <= 20 | |
order by UserId', | |
@input_data_1_name = N'graphdf' | |
GO | |
/* | |
Importing and using the Million Song Dataset (https://labrosa.ee.columbia.edu/millionsong/) in SQL Server 2017 | |
Citation: | |
Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere. | |
The Million Song Dataset. In Proceedings of the 12th International Society | |
for Music Information Retrieval Conference (ISMIR 2011), 2011. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
This sample code is not supported under any Microsoft standard support program or service. | |
The entire risk arising out of the use or performance of the sample scripts and documentation remains with you. | |
In no event shall Microsoft, its authors, or anyone else involved in the creation, production, or delivery of the scripts | |
be liable for any damages whatsoever (including, without limitation, damages for loss of business profits, | |
business interruption, loss of business information, or other pecuniary loss) arising out of the use of or inability | |
to use the sample scripts or documentation, even if Microsoft has been advised of the possibility of such damages. | |
*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0"?> | |
<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> | |
<RECORD> | |
<FIELD ID="1" xsi:type="CharTerm" TERMINATOR="<SEP>" MAX_LENGTH="100"/> | |
<FIELD ID="2" xsi:type="CharTerm" TERMINATOR="<SEP>" MAX_LENGTH="100"/> | |
<FIELD ID="3" xsi:type="CharTerm" TERMINATOR="<SEP>" MAX_LENGTH="500"/> | |
<FIELD ID="4" xsi:type="CharTerm" TERMINATOR="\n" MAX_LENGTH="500"/> | |
</RECORD> | |
<ROW> | |
<COLUMN SOURCE="1" NAME="TrackId" xsi:type="SQLVARYCHAR"/> | |
<COLUMN SOURCE="2" NAME="SongId" xsi:type="SQLVARYCHAR"/> | |
<COLUMN SOURCE="3" NAME="ArtistName" xsi:type="SQLNVARCHAR"/> | |
<COLUMN SOURCE="4" NAME="SongTitle" xsi:type="SQLNVARCHAR"/> | |
</ROW> | |
</BCPFORMAT> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0"?> | |
<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> | |
<RECORD> | |
<FIELD ID="1" xsi:type="CharTerm" TERMINATOR=": <" MAX_LENGTH="5"/> | |
<FIELD ID="2" xsi:type="CharTerm" TERMINATOR=" " MAX_LENGTH="100"/> | |
<FIELD ID="3" xsi:type="CharTerm" TERMINATOR="> " MAX_LENGTH="100"/> | |
<FIELD ID="4" xsi:type="CharTerm" TERMINATOR=" - " MAX_LENGTH="500"/> | |
<FIELD ID="5" xsi:type="CharTerm" TERMINATOR=" != " MAX_LENGTH="500"/> | |
<FIELD ID="6" xsi:type="CharTerm" TERMINATOR=" - " MAX_LENGTH="500"/> | |
<FIELD ID="7" xsi:type="CharTerm" TERMINATOR="\n" MAX_LENGTH="500"/> | |
</RECORD> | |
<ROW> | |
<COLUMN SOURCE="1" NAME="Error" xsi:type="SQLVARYCHAR"/> | |
<COLUMN SOURCE="2" NAME="SongId" xsi:type="SQLVARYCHAR"/> | |
<COLUMN SOURCE="3" NAME="TrackId" xsi:type="SQLVARYCHAR"/> | |
<COLUMN SOURCE="4" NAME="ActualArtist" xsi:type="SQLNVARCHAR"/> | |
<COLUMN SOURCE="5" NAME="ActualSong" xsi:type="SQLNVARCHAR"/> | |
<COLUMN SOURCE="6" NAME="OriginalArtist" xsi:type="SQLNVARCHAR"/> | |
<COLUMN SOURCE="7" NAME="OriginalSong" xsi:type="SQLNVARCHAR"/> | |
</ROW> | |
</BCPFORMAT> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0"?> | |
<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> | |
<RECORD> | |
<FIELD ID="1" xsi:type="CharTerm" TERMINATOR="\t" MAX_LENGTH="100"/> | |
<FIELD ID="2" xsi:type="CharTerm" TERMINATOR="\t" MAX_LENGTH="100"/> | |
<FIELD ID="3" xsi:type="CharTerm" TERMINATOR="\n" MAX_LENGTH="100"/> | |
</RECORD> | |
<ROW> | |
<COLUMN SOURCE="1" NAME="UserId" xsi:type="SQLVARYCHAR"/> | |
<COLUMN SOURCE="2" NAME="SongId" xsi:type="SQLVARYCHAR"/> | |
<COLUMN SOURCE="3" NAME="ListenCount" xsi:type="SQLBIGINT"/> | |
</ROW> | |
</BCPFORMAT> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment