Skip to content

Instantly share code, notes, and snippets.

@hotzenklotz
Forked from gousiosg/stackoverflow.sql
Last active August 29, 2015 14:10
Show Gist options
  • Save hotzenklotz/20c423b1d8f227ddc6df to your computer and use it in GitHub Desktop.
Save hotzenklotz/20c423b1d8f227ddc6df to your computer and use it in GitHub Desktop.
# Copyright (c) 2013 Georgios Gousios
# MIT-licensed
create database stackoverflow DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_general_ci;
use stackoverflow;
create table badges (
Id INT NOT NULL PRIMARY KEY,
UserId INT,
Name VARCHAR(50),
CreationDate DATETIME
);
CREATE TABLE comments (
Id INT NOT NULL PRIMARY KEY,
PostId INT NOT NULL,
Score INT NOT NULL DEFAULT 0,
Text TEXT,
CreationDate DATETIME,
UserId INT
);
CREATE TABLE post_history (
Id INT NOT NULL PRIMARY KEY,
PostHistoryTypeId SMALLINT NOT NULL,
PostId INT NOT NULL,
RevisionGUID VARCHAR(36),
CreationDate DATETIME,
UserId INT,
Text MEDIUMTEXT
);
CREATE TABLE posts (
Id INT NOT NULL PRIMARY KEY,
PostTypeId SMALLINT,
AcceptedAnswerId INT,
ParentId INT,
Score INT NULL,
ViewCount INT NULL,
Body MEDIUMTEXT NULL,
OwnerUserId INT,
LastEditorUserId INT,
LastEditDate DATETIME,
LastActivityDate DATETIME,
Title varchar(256),
Tags VARCHAR(256),
AnswerCount INT DEFAULT 0,
CommentCount INT NOT NULL DEFAULT 0,
FavoriteCount INT DEFAULT 0,
CreationDate DATETIME
);
CREATE TABLE users (
Id INT NOT NULL PRIMARY KEY,
Reputation INT NOT NULL,
CreationDate DATETIME,
DisplayName VARCHAR(50) NULL,
LastAccessDate DATETIME,
Views INT DEFAULT 0,
WebsiteUrl VARCHAR(256) NULL,
Location VARCHAR(256) NULL,
AboutMe TEXT NULL,
Age INT,
UpVotes INT,
DownVotes INT,
EmailHash VARCHAR(32)
);
CREATE TABLE votes (
Id INT NOT NULL PRIMARY KEY,
PostId INT NOT NULL,
VoteTypeId SMALLINT,
CreationDate DATETIME
);
load xml infile 'ABSOLUTE_PATH_TO_XML_FILE/Badges.xml'
into table badges
rows identified by '<row>';
load xml infile 'ABSOLUTE_PATH_TO_XML_FILE/Comments.xml'
into table comments
rows identified by '<row>';
load xml infile 'ABSOLUTE_PATH_TO_XML_FILE/PostHistory.xml'
into table post_history
rows identified by '<row>';
load xml infile 'ABSOLUTE_PATH_TO_XML_FILE/Posts.xml'
into table posts
rows identified by '<row>';
load xml infile 'ABSOLUTE_PATH_TO_XML_FILE/Users.xml'
into table users
rows identified by '<row>';
load xml infile 'ABSOLUTE_PATH_TO_XML_FILE/Votes.xml'
into table votes
rows identified by '<row>';
create index badges_idx_1 on badges(UserId);
create index comments_idx_1 on comments(PostId);
create index comments_idx_2 on comments(UserId);
create index post_history_idx_1 on post_history(PostId);
create index post_history_idx_2 on post_history(UserId);
create index posts_idx_1 on posts(AcceptedAnswerId);
create index posts_idx_2 on posts(ParentId);
create index posts_idx_3 on posts(OwnerUserId);
create index posts_idx_4 on posts(LastEditorUserId);
create index votes_idx_1 on votes(PostId);
#!/usr/bin/env ruby
# Copyright (c) 2013 Georgios Gousios
# MIT-licensed
require 'sequel'
require 'mysql2'
require 'cgi'
Sequel::Database.extension :pagination
DB = Sequel.connect("mysql2://#{ARGV[1]}:#{ARGV[2]}@#{ARGV[0]}/#{ARGV[3]}")
table = ARGV[4].to_sym
field = ARGV[5].to_sym
lines = 0
lines_with_escapes = 0
DB.from(table.to_sym).select(:Id, field).each_page(1000) do |page|
page.each do |row|
lines += 1
text = row[field]
next if text.nil?
text_unescaped = CGI.unescapeHTML(text)
if text_unescaped != text
lines_with_escapes += 1
DB[table].filter(:Id => row[:Id]).update(field => text_unescaped)
end
print "\r #{lines} lines, #{lines_with_escapes} with escapes"
end
end

MySQL script to import stackexchange data dumps (as retrieved from here).

The accompanying Ruby script will un-escape various XML-escaped characters that the MySQL import process fails to do.

Make sure to change the ABSOLUTE_PATH_TO_XML_FILE in SQL script to point to your copy of the data dump.

@tmbo
Copy link

tmbo commented Dec 16, 2014

Votes Should be:

CREATE TABLE votes (
    Id INT NOT NULL PRIMARY KEY,
    PostId INT NOT NULL,
    VoteTypeId SMALLINT,
    UserId INT NULL,
    CreationDate DATETIME,
    BountyAmount INT NULL
);

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment