Skip to content

Instantly share code, notes, and snippets.

@emanoelbarreiros
Forked from megansquire/stackoverflow.sql
Last active November 12, 2022 16:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save emanoelbarreiros/c164a60e98a7482cde22 to your computer and use it in GitHub Desktop.
Save emanoelbarreiros/c164a60e98a7482cde22 to your computer and use it in GitHub Desktop.
# Copyright (c) 2013 Georgios Gousios
# MIT-licensed
# Edited to handle two new tables: postLinks and tags that are present in the 2014 dataset
create database stackoverflow DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_general_ci;
use stackoverflow;
create table badges (
Id INT NOT NULL PRIMARY KEY,
UserId INT,
Name VARCHAR(50),
Date DATETIME
);
CREATE TABLE comments (
Id INT NOT NULL PRIMARY KEY,
PostId INT NOT NULL,
Score INT NOT NULL DEFAULT 0,
Text TEXT,
CreationDate DATETIME,
UserId INT NOT NULL
);
CREATE TABLE post_history (
Id INT NOT NULL PRIMARY KEY,
PostHistoryTypeId SMALLINT NOT NULL,
PostId INT NOT NULL,
RevisionGUID VARCHAR(36),
CreationDate DATETIME,
UserId INT NOT NULL,
Text TEXT
);
CREATE TABLE post_links (
Id INT NOT NULL PRIMARY KEY,
CreationDate DATETIME DEFAULT NULL,
PostId INT NOT NULL,
RelatedPostId INT NOT NULL,
LinkTypeId INT DEFAULT NULL
);
CREATE TABLE posts (
Id INT NOT NULL PRIMARY KEY,
PostTypeId SMALLINT,
AcceptedAnswerId INT,
ParentId INT,
Score INT NULL,
ViewCount INT NULL,
Body text NULL,
OwnerUserId INT NOT NULL,
LastEditorUserId INT,
LastEditDate DATETIME,
LastActivityDate DATETIME,
Title varchar(256) NOT NULL,
Tags VARCHAR(256),
AnswerCount INT NOT NULL DEFAULT 0,
CommentCount INT NOT NULL DEFAULT 0,
FavoriteCount INT NOT NULL DEFAULT 0,
CreationDate DATETIME
);
CREATE TABLE tags (
Id INT NOT NULL PRIMARY KEY,
TagName VARCHAR(50) CHARACTER SET latin1 DEFAULT NULL,
Count INT DEFAULT NULL,
ExcerptPostId INT DEFAULT NULL,
WikiPostId INT DEFAULT NULL
);
CREATE TABLE users (
Id INT NOT NULL PRIMARY KEY,
Reputation INT NOT NULL,
CreationDate DATETIME,
DisplayName VARCHAR(50) NULL,
LastAccessDate DATETIME,
Views INT DEFAULT 0,
WebsiteUrl VARCHAR(256) NULL,
Location VARCHAR(256) NULL,
AboutMe TEXT NULL,
Age INT,
UpVotes INT,
DownVotes INT,
EmailHash VARCHAR(32)
);
CREATE TABLE votes (
Id INT NOT NULL PRIMARY KEY,
PostId INT NOT NULL,
VoteTypeId SMALLINT,
CreationDate DATETIME
);
load data local infile 'Badges.xml'
into table badges
rows identified by '<row>';
load data local infile 'Comments.xml'
into table comments
rows identified by '<row>';
load data local infile 'PostHistory.xml'
into table post_history
rows identified by '<row>';
load data local infile 'PostLinks.xml'
into table post_links
rows identified BY '<row>';
load data local infile 'Posts.xml'
into table posts
rows identified by '<row>';
load data local infile 'Tags.xml'
into table tags
rows identified BY '<row>';
load data local infile 'Users.xml'
into table users
rows identified by '<row>';
load data local infile 'Votes.xml'
into table votes
rows identified by '<row>';
create index badges_idx_1 on badges(UserId);
create index comments_idx_1 on comments(PostId);
create index comments_idx_2 on comments(UserId);
create index post_history_idx_1 on post_history(PostId);
create index post_history_idx_2 on post_history(UserId);
create index posts_idx_1 on posts(AcceptedAnswerId);
create index posts_idx_2 on posts(ParentId);
create index posts_idx_3 on posts(OwnerUserId);
create index posts_idx_4 on posts(LastEditorUserId);
create index votes_idx_1 on votes(PostId);
#!/usr/bin/env ruby
# Copyright (c) 2013 gousiousg and megansquire
# MIT-licensed
require 'sequel'
require 'mysql2'
require 'cgi'
Sequel::Database.extension :pagination
DB = Sequel.connect("mysql2://#{ARGV[1]}:#{ARGV[2]}@#{ARGV[0]}/#{ARGV[3]}")
table = ARGV[4].to_sym
field = ARGV[5].to_sym
lines = 0
lines_with_escapes = 0
DB.from(table.to_sym).select(:Id, field).each_page(1000) do |page|
page.each do |row|
lines += 1
text = row[field]
next if text.nil?
text_unescaped = CGI.unescapeHTML(text)
if text_unescaped != text
lines_with_escapes += 1
DB[table].filter(:Id => row[:Id]).update(field => text_unescaped)
end
print "\r #{lines} lines, #{lines_with_escapes} with escapes"
end
end

MySQL script to import stackexchange data dumps (as retrieved from here).

The 2015 data is available on Archive.org here: https://archive.org/details/stackexchange

The accompanying Ruby script will un-escape various XML-escaped characters that the MySQL import process fails to do.

@yakzan
Copy link

yakzan commented Aug 31, 2016

Thanks for the scripts!

I have two suggestions:

  1. 'load data local infile' should be replaced by 'load xml local infile'; otherwise the load procedure does not succeed: it either runs forever or adds only one row after 15 minutes.
  2. The original gist by megansquire has a OwnerDisplayName field in the posts table, which is missing here; the xml files in the latest dumps still contain this field.

@enricorotundo
Copy link

Hi, see my fork to import the dataset version Sept. 12, 2016 from https://archive.org/details/stackexchange.

I added some columns and changed some NOT NULLs.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment