Skip to content

Instantly share code, notes, and snippets.

@swasheck
Last active January 3, 2016 17:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save swasheck/8495710 to your computer and use it in GitHub Desktop.
Save swasheck/8495710 to your computer and use it in GitHub Desktop.
Inspired by Brent Ozar's post, here's a Gist to parse SO data. Relationships, keys, and indexes not included. Oh yeah, pay attention to the xml source location as you may need to change it.http://www.brentozar.com/archive/2014/01/how-to-query-the-stackexchange-databases/
-- DOWNLOAD YOUR DATA
/*
http://meta.stackoverflow.com/questions/198915/is-there-a-direct-download-link-with-a-raw-data-dump-of-stack-overflow-not-a-t/199303#199303
*/
USE [stack_raw]
GO
/****** Object: Table [dbo].[Badge] Script Date: 12/20/2014 3:08:08 PM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
SET ANSI_PADDING ON
GO
CREATE TABLE [dbo].[Badge](
site_name varchar(50) not null,
[user_id] [bigint] NULL,
[badge_name] [nvarchar](255) NULL,
[badge_date] [datetime2](7) NULL
) ON [PRIMARY]
GO
SET ANSI_PADDING OFF
GO
USE [stack_raw]
GO
/****** Object: Table [dbo].[Comment] Script Date: 12/20/2014 3:08:11 PM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
SET ANSI_PADDING ON
GO
CREATE TABLE [dbo].[Comment](
site_name varchar(50) not null,
[comment_id] [bigint] NULL,
[comment_post_id] [bigint] NULL,
[comment_text] [nvarchar](max) NULL,
[comment_creation_date] [datetime2](7) NULL,
[comment_user_id] [bigint] NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO
SET ANSI_PADDING OFF
GO
USE [stack_raw]
GO
/****** Object: Table [dbo].[Post] Script Date: 12/20/2014 3:08:28 PM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
SET ANSI_PADDING ON
GO
CREATE TABLE [dbo].[Post](
site_name varchar(50) not null,
[post_id] [bigint] NULL,
[post_type_id] [bigint] NULL,
[post_creation_date] [datetime2](7) NULL,
[post_score] [bigint] NULL,
[post_view_count] [bigint] NULL,
[post_body] [nvarchar](max) NULL,
[post_owner_user_id] [bigint] NULL,
[post_last_activity_date] [datetime2](7) NULL,
[post_title] [nvarchar](4000) NULL,
[post_answer_count] [int] NULL,
[post_comment_count] [int] NULL,
[post_favorite_count] [int] NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO
SET ANSI_PADDING OFF
GO
USE [stack_raw]
GO
/****** Object: Table [dbo].[PostHistory] Script Date: 12/20/2014 3:08:34 PM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
SET ANSI_PADDING ON
GO
CREATE TABLE [dbo].[PostHistory](
site_name varchar(50) not null,
[posthistory_id] [bigint] NULL,
[posthistory_type_id] [bigint] NULL,
[posthistory_post_id] [bigint] NULL,
[posthistory_revision_guid] [uniqueidentifier] NULL,
[posthistory_creation_date] [datetime2](7) NULL,
[posthistory_user_id] [bigint] NULL,
[posthistory_text] [nvarchar](max) NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO
SET ANSI_PADDING OFF
GO
USE [stack_raw]
GO
/****** Object: Table [dbo].[User] Script Date: 12/20/2014 3:08:40 PM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
SET ANSI_PADDING ON
GO
CREATE TABLE [dbo].[User](
site_name varchar(50) not null,
[user_id] [bigint] NULL,
[user_reputation] [bigint] NULL,
[user_creation_date] [datetime2](7) NULL,
[user_display_name] [nvarchar](255) NULL,
[user_last_access_date] [datetime2](7) NULL,
[user_location] [nvarchar](255) NULL,
[user_about_me] [nvarchar](max) NULL,
[user_views] [bigint] NULL,
[user_upvotes] [bigint] NULL,
[user_downvotes] [bigint] NULL,
[user_email_hash] [nvarchar](64) NULL,
[user_website] [nvarchar](255) NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO
SET ANSI_PADDING OFF
GO
USE [stack_raw]
GO
/****** Object: Table [dbo].[Vote] Script Date: 12/20/2014 3:08:45 PM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
SET ANSI_PADDING ON
GO
CREATE TABLE [dbo].[Vote](
site_name varchar(50) not null,
[vote_id] [bigint] NULL,
[vote_post_id] [bigint] NULL,
[vote_type_id] [bigint] NULL,
[vote_creation_date] [datetime2](7) NULL
) ON [PRIMARY]
GO
SET ANSI_PADDING OFF
GO
use stack_raw
SELECT
'Badges' as component,
CAST(bulkcolumn as xml) xdata
INTO dbo.badges_stage
FROM OPENROWSET(
BULK 'C:\Users\user\Downloads\stack\serverfault.com\Badges.xml',
SINGLE_BLOB) AS x;
SELECT
'Posts' as component,
CAST(bulkcolumn as xml) xdata
into dbo.posts_stage
FROM OPENROWSET(
BULK 'C:\Users\user\Downloads\stack\serverfault.com\Posts.xml',
SINGLE_BLOB) AS x;
SELECT
'PostHistory' as component,
CAST(bulkcolumn as xml) xdata
into dbo.posthistory_stage
FROM OPENROWSET(
BULK 'C:\Users\user\Downloads\stack\serverfault.com\PostHistory.xml',
SINGLE_BLOB) AS x;
SELECT
'Users' as component,
CAST(bulkcolumn as xml) xdata
into dbo.users_stage
FROM OPENROWSET(
BULK 'C:\Users\user\Downloads\stack\serverfault.com\Users.xml',
SINGLE_BLOB) AS x;
SELECT
'Votes' as component,
CAST(bulkcolumn as xml) xdata
into dbo.votes_stage
FROM OPENROWSET(
BULK 'C:\Users\user\Downloads\stack\serverfault.com\Votes.xml',
SINGLE_BLOB) AS x;
SELECT
'Comments' as component,
CAST(bulkcolumn as xml) xdata
into dbo.comments_stage
FROM OPENROWSET(
BULK 'C:\Users\user\Downloads\stack\serverfault.com\Comments.xml',
SINGLE_BLOB) AS x;
-- shred it
insert INTO [User]
select
'serverfault' as site_name,
x.n.value('(@Id)','bigint') user_id,
x.n.value('(@Reputation)','bigint') user_reputation,
x.n.value('(@CreationDate)','datetime2') user_creation_date,
x.n.value('(@DisplayName)','nvarchar(255)') user_display_name,
x.n.value('(@LastAccessDate)','datetime2') user_last_access_date,
x.n.value('(@Location)','nvarchar(255)') user_location,
x.n.value('(@AboutMe)','nvarchar(max)') user_about_me,
x.n.value('(@Views)','bigint') user_views,
x.n.value('(@UpVotes)','bigint') user_upvotes,
x.n.value('(@DownVotes)','bigint') user_downvotes,
x.n.value('(@EmailHash)','nvarchar(64)') user_email_hash,
x.n.value('(@WebsiteUrl)','nvarchar(255)') user_website
from users_stage u
cross apply u.xdata.nodes('/users/*') x(n);
insert INTO Badge
select
'serverfault' as site_name,
x.n.value('(@UserId)','bigint') user_id,
x.n.value('(@Name)','nvarchar(255)') badge_name,
x.n.value('(@Date)','datetime2') badge_date
from badges_stage u
cross apply u.xdata.nodes('/badges/*') x(n);
insert into comment
select
'serverfault' as site_name,
x.n.value('(@Id)','bigint') comment_id,
x.n.value('(@PostId)','bigint') comment_post_id,
x.n.value('(@Text)','nvarchar(max)') comment_text,
x.n.value('(@CreationDate)','datetime2') comment_creation_date,
x.n.value('(@UserId)','bigint') comment_user_id
from comments_stage u
cross apply u.xdata.nodes('/comments/*') x(n);
insert INTO Vote
select
'serverfault' as site_name,
x.n.value('(@Id)','bigint') vote_id,
x.n.value('(@PostId)','bigint') vote_post_id,
x.n.value('(@VoteTypeId)','bigint') vote_type_id,
x.n.value('(@CreationDate)','datetime2') vote_creation_date
--x.n.query('.') b
from votes_stage u
cross apply u.xdata.nodes('/votes/*') x(n);
insert INTO Post
select
'serverfault' as site_name,
x.n.value('(@Id)','bigint') post_id,
--x.n.value('(@PostId)','bigint') votes_post_id,
x.n.value('(@PostTypeId)','bigint') post_type_id,
x.n.value('(@CreationDate)','datetime2') post_creation_date,
x.n.value('(@Score)','bigint') post_score,
x.n.value('(@ViewCount)','bigint') post_view_count,
x.n.value('(@Body)','nvarchar(max)') post_body,
x.n.value('(@OwnerUserId)','bigint') post_owner_user_id,
x.n.value('(@LastActivityDate)','datetime2') post_last_activity_date,
x.n.value('(@Title)','nvarchar(4000)') post_title,
x.n.value('(@AnswerCount)','int') post_answer_count,
x.n.value('(@CommentCount)','int') post_comment_count,
x.n.value('(@FavoriteCount)','int') post_favorite_count
--x.n.query('.') b
from posts_stage u
cross apply u.xdata.nodes('/posts/*') x(n);
insert INTO PostHistory
select
'serverfault' as site_name,
x.n.value('(@Id)','bigint') posthistory_id,
x.n.value('(@PostHistoryTypeId)','bigint') posthistory_type_id,
x.n.value('(@PostId)','bigint') posthistory_post_id,
x.n.value('(@RevisionGUID)','uniqueidentifier') posthistory_revision_guid,
x.n.value('(@CreationDate)','datetime2') posthistory_creation_date,
x.n.value('(@UserId)','bigint') posthistory_user_id,
x.n.value('(@Text)','nvarchar(max)') posthistory_text
--x.n.query('.') b
from posthistory_stage u
cross apply u.xdata.nodes('/posthistory/*') x(n);
DROP TABLE [badges_stage];
DROP TABLE [comments_stage];
DROP TABLE [posthistory_stage];
DROP TABLE [posts_stage];
DROP TABLE [users_stage];
DROP TABLE [votes_stage];
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment