Last active
January 3, 2016 17:29
-
-
Save swasheck/8495710 to your computer and use it in GitHub Desktop.
Inspired by Brent Ozar's post, here's a Gist to parse SO data. Relationships, keys, and indexes not included. Oh yeah, pay attention to the xml source location as you may need to change it.http://www.brentozar.com/archive/2014/01/how-to-query-the-stackexchange-databases/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- DOWNLOAD YOUR DATA | |
/* | |
http://meta.stackoverflow.com/questions/198915/is-there-a-direct-download-link-with-a-raw-data-dump-of-stack-overflow-not-a-t/199303#199303 | |
*/ | |
USE [stack_raw] | |
GO | |
/****** Object: Table [dbo].[Badge] Script Date: 12/20/2014 3:08:08 PM ******/ | |
SET ANSI_NULLS ON | |
GO | |
SET QUOTED_IDENTIFIER ON | |
GO | |
SET ANSI_PADDING ON | |
GO | |
CREATE TABLE [dbo].[Badge]( | |
site_name varchar(50) not null, | |
[user_id] [bigint] NULL, | |
[badge_name] [nvarchar](255) NULL, | |
[badge_date] [datetime2](7) NULL | |
) ON [PRIMARY] | |
GO | |
SET ANSI_PADDING OFF | |
GO | |
USE [stack_raw] | |
GO | |
/****** Object: Table [dbo].[Comment] Script Date: 12/20/2014 3:08:11 PM ******/ | |
SET ANSI_NULLS ON | |
GO | |
SET QUOTED_IDENTIFIER ON | |
GO | |
SET ANSI_PADDING ON | |
GO | |
CREATE TABLE [dbo].[Comment]( | |
site_name varchar(50) not null, | |
[comment_id] [bigint] NULL, | |
[comment_post_id] [bigint] NULL, | |
[comment_text] [nvarchar](max) NULL, | |
[comment_creation_date] [datetime2](7) NULL, | |
[comment_user_id] [bigint] NULL | |
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY] | |
GO | |
SET ANSI_PADDING OFF | |
GO | |
USE [stack_raw] | |
GO | |
/****** Object: Table [dbo].[Post] Script Date: 12/20/2014 3:08:28 PM ******/ | |
SET ANSI_NULLS ON | |
GO | |
SET QUOTED_IDENTIFIER ON | |
GO | |
SET ANSI_PADDING ON | |
GO | |
CREATE TABLE [dbo].[Post]( | |
site_name varchar(50) not null, | |
[post_id] [bigint] NULL, | |
[post_type_id] [bigint] NULL, | |
[post_creation_date] [datetime2](7) NULL, | |
[post_score] [bigint] NULL, | |
[post_view_count] [bigint] NULL, | |
[post_body] [nvarchar](max) NULL, | |
[post_owner_user_id] [bigint] NULL, | |
[post_last_activity_date] [datetime2](7) NULL, | |
[post_title] [nvarchar](4000) NULL, | |
[post_answer_count] [int] NULL, | |
[post_comment_count] [int] NULL, | |
[post_favorite_count] [int] NULL | |
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY] | |
GO | |
SET ANSI_PADDING OFF | |
GO | |
USE [stack_raw] | |
GO | |
/****** Object: Table [dbo].[PostHistory] Script Date: 12/20/2014 3:08:34 PM ******/ | |
SET ANSI_NULLS ON | |
GO | |
SET QUOTED_IDENTIFIER ON | |
GO | |
SET ANSI_PADDING ON | |
GO | |
CREATE TABLE [dbo].[PostHistory]( | |
site_name varchar(50) not null, | |
[posthistory_id] [bigint] NULL, | |
[posthistory_type_id] [bigint] NULL, | |
[posthistory_post_id] [bigint] NULL, | |
[posthistory_revision_guid] [uniqueidentifier] NULL, | |
[posthistory_creation_date] [datetime2](7) NULL, | |
[posthistory_user_id] [bigint] NULL, | |
[posthistory_text] [nvarchar](max) NULL | |
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY] | |
GO | |
SET ANSI_PADDING OFF | |
GO | |
USE [stack_raw] | |
GO | |
/****** Object: Table [dbo].[User] Script Date: 12/20/2014 3:08:40 PM ******/ | |
SET ANSI_NULLS ON | |
GO | |
SET QUOTED_IDENTIFIER ON | |
GO | |
SET ANSI_PADDING ON | |
GO | |
CREATE TABLE [dbo].[User]( | |
site_name varchar(50) not null, | |
[user_id] [bigint] NULL, | |
[user_reputation] [bigint] NULL, | |
[user_creation_date] [datetime2](7) NULL, | |
[user_display_name] [nvarchar](255) NULL, | |
[user_last_access_date] [datetime2](7) NULL, | |
[user_location] [nvarchar](255) NULL, | |
[user_about_me] [nvarchar](max) NULL, | |
[user_views] [bigint] NULL, | |
[user_upvotes] [bigint] NULL, | |
[user_downvotes] [bigint] NULL, | |
[user_email_hash] [nvarchar](64) NULL, | |
[user_website] [nvarchar](255) NULL | |
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY] | |
GO | |
SET ANSI_PADDING OFF | |
GO | |
USE [stack_raw] | |
GO | |
/****** Object: Table [dbo].[Vote] Script Date: 12/20/2014 3:08:45 PM ******/ | |
SET ANSI_NULLS ON | |
GO | |
SET QUOTED_IDENTIFIER ON | |
GO | |
SET ANSI_PADDING ON | |
GO | |
CREATE TABLE [dbo].[Vote]( | |
site_name varchar(50) not null, | |
[vote_id] [bigint] NULL, | |
[vote_post_id] [bigint] NULL, | |
[vote_type_id] [bigint] NULL, | |
[vote_creation_date] [datetime2](7) NULL | |
) ON [PRIMARY] | |
GO | |
SET ANSI_PADDING OFF | |
GO | |
use stack_raw | |
SELECT | |
'Badges' as component, | |
CAST(bulkcolumn as xml) xdata | |
INTO dbo.badges_stage | |
FROM OPENROWSET( | |
BULK 'C:\Users\user\Downloads\stack\serverfault.com\Badges.xml', | |
SINGLE_BLOB) AS x; | |
SELECT | |
'Posts' as component, | |
CAST(bulkcolumn as xml) xdata | |
into dbo.posts_stage | |
FROM OPENROWSET( | |
BULK 'C:\Users\user\Downloads\stack\serverfault.com\Posts.xml', | |
SINGLE_BLOB) AS x; | |
SELECT | |
'PostHistory' as component, | |
CAST(bulkcolumn as xml) xdata | |
into dbo.posthistory_stage | |
FROM OPENROWSET( | |
BULK 'C:\Users\user\Downloads\stack\serverfault.com\PostHistory.xml', | |
SINGLE_BLOB) AS x; | |
SELECT | |
'Users' as component, | |
CAST(bulkcolumn as xml) xdata | |
into dbo.users_stage | |
FROM OPENROWSET( | |
BULK 'C:\Users\user\Downloads\stack\serverfault.com\Users.xml', | |
SINGLE_BLOB) AS x; | |
SELECT | |
'Votes' as component, | |
CAST(bulkcolumn as xml) xdata | |
into dbo.votes_stage | |
FROM OPENROWSET( | |
BULK 'C:\Users\user\Downloads\stack\serverfault.com\Votes.xml', | |
SINGLE_BLOB) AS x; | |
SELECT | |
'Comments' as component, | |
CAST(bulkcolumn as xml) xdata | |
into dbo.comments_stage | |
FROM OPENROWSET( | |
BULK 'C:\Users\user\Downloads\stack\serverfault.com\Comments.xml', | |
SINGLE_BLOB) AS x; | |
-- shred it | |
insert INTO [User] | |
select | |
'serverfault' as site_name, | |
x.n.value('(@Id)','bigint') user_id, | |
x.n.value('(@Reputation)','bigint') user_reputation, | |
x.n.value('(@CreationDate)','datetime2') user_creation_date, | |
x.n.value('(@DisplayName)','nvarchar(255)') user_display_name, | |
x.n.value('(@LastAccessDate)','datetime2') user_last_access_date, | |
x.n.value('(@Location)','nvarchar(255)') user_location, | |
x.n.value('(@AboutMe)','nvarchar(max)') user_about_me, | |
x.n.value('(@Views)','bigint') user_views, | |
x.n.value('(@UpVotes)','bigint') user_upvotes, | |
x.n.value('(@DownVotes)','bigint') user_downvotes, | |
x.n.value('(@EmailHash)','nvarchar(64)') user_email_hash, | |
x.n.value('(@WebsiteUrl)','nvarchar(255)') user_website | |
from users_stage u | |
cross apply u.xdata.nodes('/users/*') x(n); | |
insert INTO Badge | |
select | |
'serverfault' as site_name, | |
x.n.value('(@UserId)','bigint') user_id, | |
x.n.value('(@Name)','nvarchar(255)') badge_name, | |
x.n.value('(@Date)','datetime2') badge_date | |
from badges_stage u | |
cross apply u.xdata.nodes('/badges/*') x(n); | |
insert into comment | |
select | |
'serverfault' as site_name, | |
x.n.value('(@Id)','bigint') comment_id, | |
x.n.value('(@PostId)','bigint') comment_post_id, | |
x.n.value('(@Text)','nvarchar(max)') comment_text, | |
x.n.value('(@CreationDate)','datetime2') comment_creation_date, | |
x.n.value('(@UserId)','bigint') comment_user_id | |
from comments_stage u | |
cross apply u.xdata.nodes('/comments/*') x(n); | |
insert INTO Vote | |
select | |
'serverfault' as site_name, | |
x.n.value('(@Id)','bigint') vote_id, | |
x.n.value('(@PostId)','bigint') vote_post_id, | |
x.n.value('(@VoteTypeId)','bigint') vote_type_id, | |
x.n.value('(@CreationDate)','datetime2') vote_creation_date | |
--x.n.query('.') b | |
from votes_stage u | |
cross apply u.xdata.nodes('/votes/*') x(n); | |
insert INTO Post | |
select | |
'serverfault' as site_name, | |
x.n.value('(@Id)','bigint') post_id, | |
--x.n.value('(@PostId)','bigint') votes_post_id, | |
x.n.value('(@PostTypeId)','bigint') post_type_id, | |
x.n.value('(@CreationDate)','datetime2') post_creation_date, | |
x.n.value('(@Score)','bigint') post_score, | |
x.n.value('(@ViewCount)','bigint') post_view_count, | |
x.n.value('(@Body)','nvarchar(max)') post_body, | |
x.n.value('(@OwnerUserId)','bigint') post_owner_user_id, | |
x.n.value('(@LastActivityDate)','datetime2') post_last_activity_date, | |
x.n.value('(@Title)','nvarchar(4000)') post_title, | |
x.n.value('(@AnswerCount)','int') post_answer_count, | |
x.n.value('(@CommentCount)','int') post_comment_count, | |
x.n.value('(@FavoriteCount)','int') post_favorite_count | |
--x.n.query('.') b | |
from posts_stage u | |
cross apply u.xdata.nodes('/posts/*') x(n); | |
insert INTO PostHistory | |
select | |
'serverfault' as site_name, | |
x.n.value('(@Id)','bigint') posthistory_id, | |
x.n.value('(@PostHistoryTypeId)','bigint') posthistory_type_id, | |
x.n.value('(@PostId)','bigint') posthistory_post_id, | |
x.n.value('(@RevisionGUID)','uniqueidentifier') posthistory_revision_guid, | |
x.n.value('(@CreationDate)','datetime2') posthistory_creation_date, | |
x.n.value('(@UserId)','bigint') posthistory_user_id, | |
x.n.value('(@Text)','nvarchar(max)') posthistory_text | |
--x.n.query('.') b | |
from posthistory_stage u | |
cross apply u.xdata.nodes('/posthistory/*') x(n); | |
DROP TABLE [badges_stage]; | |
DROP TABLE [comments_stage]; | |
DROP TABLE [posthistory_stage]; | |
DROP TABLE [posts_stage]; | |
DROP TABLE [users_stage]; | |
DROP TABLE [votes_stage]; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment