Skip to content

Instantly share code, notes, and snippets.

@enricorotundo
Forked from gousiosg/stackoverflow.sql
Last active November 12, 2022 16:57
  • Star 7 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save enricorotundo/1e074af39d90629252a7df3fc1066397 to your computer and use it in GitHub Desktop.
Script to import the stackexchange dumps into MySQL
# Copyright (c) 2013 Georgios Gousios
# MIT-licensed
# [tundo91]
# * Edited to import local XML files
# * Updated creat table procedures to support Sept.-12-2016 dataset dump version
create database cooking_stackexchange_com DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_general_ci;
use cooking_stackexchange_com;
create table Badges (
Id INT NOT NULL PRIMARY KEY,
UserId INT,
Name VARCHAR(50),
Date DATETIME,
Class SMALLINT,
TagBased BIT(64)
);
CREATE TABLE Comments (
Id INT NOT NULL PRIMARY KEY,
PostId INT NOT NULL,
Score INT NOT NULL DEFAULT 0,
Text TEXT,
CreationDate DATETIME,
UserDisplayName VARCHAR(30),
UserId INT
);
CREATE TABLE PostHistory (
Id INT NOT NULL PRIMARY KEY,
PostHistoryTypeId SMALLINT NOT NULL,
PostId INT NOT NULL,
RevisionGUID VARCHAR(36) NOT NULL,
CreationDate DATETIME NOT NULL,
UserId INT,
UserDisplayName VARCHAR(40),
Comment VARCHAR(400),
Text TEXT
);
CREATE TABLE PostLinks (
Id INT NOT NULL PRIMARY KEY,
CreationDate DATETIME DEFAULT NULL,
PostId INT NOT NULL,
RelatedPostId INT NOT NULL,
LinkTypeId SMALLINT NOT NULL
);
CREATE TABLE Posts (
Id INT NOT NULL PRIMARY KEY,
PostTypeId TINYINT NOT NULL ,
AcceptedAnswerId INT,
ParentId INT,
CreationDate DATETIME NOT NULL,
DeletionDate DATETIME,
Score INT NULL,
ViewCount INT NULL,
Body text NULL,
OwnerUserId INT,
OwnerDisplayName varchar(256),
LastEditorUserId INT,
LastEditorDisplayName VARCHAR(40),
LastEditDate DATETIME,
LastActivityDate DATETIME,
Title varchar(256),
Tags VARCHAR(256),
AnswerCount INT DEFAULT 0,
CommentCount INT DEFAULT 0,
FavoriteCount INT DEFAULT 0,
ClosedDate DATETIME,
CommunityOwnedDate DATETIME
);
CREATE TABLE Tags (
Id INT NOT NULL PRIMARY KEY,
TagName VARCHAR(50) CHARACTER SET latin1 DEFAULT NULL,
Count INT DEFAULT NULL,
ExcerptPostId INT DEFAULT NULL,
WikiPostId INT DEFAULT NULL
);
CREATE TABLE Users (
Id INT NOT NULL PRIMARY KEY,
Reputation INT NOT NULL,
CreationDate DATETIME,
DisplayName VARCHAR(40) NULL,
LastAccessDate DATETIME NOT NULL,
WebsiteUrl VARCHAR(256) NULL,
Location VARCHAR(256) NULL,
AboutMe TEXT NULL,
Views INT DEFAULT 0,
UpVotes INT,
DownVotes INT,
ProfileImageUrl VARCHAR(200) NULL,
EmailHash VARCHAR(32),
Age INT NULL,
AccountId INT NULL
);
CREATE TABLE Votes (
Id INT NOT NULL PRIMARY KEY,
PostId INT NOT NULL,
VoteTypeId SMALLINT NOT NULL,
UserId INT,
CreationDate DATETIME,
BountyAmount INT
);
load xml infile '/Users/erotundo/PycharmProjects/DatasetAnnotator/data_source/stackexchange_v12Sept2016/cooking.stackexchange.com/Badges.xml'
into table Badges
rows identified by '<row>';
load xml infile '/Users/erotundo/PycharmProjects/DatasetAnnotator/data_source/stackexchange_v12Sept2016/cooking.stackexchange.com/Comments.xml'
into table Comments
rows identified by '<row>';
load xml infile '/Users/erotundo/PycharmProjects/DatasetAnnotator/data_source/stackexchange_v12Sept2016/cooking.stackexchange.com/PostHistory.xml'
into table PostHistory
rows identified by '<row>';
load xml infile '/Users/erotundo/PycharmProjects/DatasetAnnotator/data_source/stackexchange_v12Sept2016/cooking.stackexchange.com/PostLinks.xml'
INTO TABLE PostLinks
ROWS IDENTIFIED BY '<row>';
load xml infile '/Users/erotundo/PycharmProjects/DatasetAnnotator/data_source/stackexchange_v12Sept2016/cooking.stackexchange.com/Posts.xml'
into table Posts
rows identified by '<row>';
load xml infile '/Users/erotundo/PycharmProjects/DatasetAnnotator/data_source/stackexchange_v12Sept2016/cooking.stackexchange.com/Tags.xml'
INTO TABLE Tags
ROWS IDENTIFIED BY '<row>';
load xml infile '/Users/erotundo/PycharmProjects/DatasetAnnotator/data_source/stackexchange_v12Sept2016/cooking.stackexchange.com/Users.xml'
into table Users
rows identified by '<row>';
load xml infile '/Users/erotundo/PycharmProjects/DatasetAnnotator/data_source/stackexchange_v12Sept2016/cooking.stackexchange.com/Votes.xml'
into table Votes
rows identified by '<row>';
create index Badges_idx_1 on Badges(UserId);
create index Comments_idx_1 on Comments(PostId);
create index Comments_idx_2 on Comments(UserId);
create index Post_history_idx_1 on PostHistory(PostId);
create index Post_history_idx_2 on PostHistory(UserId);
create index Posts_idx_1 on Posts(AcceptedAnswerId);
create index Posts_idx_2 on Posts(ParentId);
create index Posts_idx_3 on Posts(OwnerUserId);
create index Posts_idx_4 on Posts(LastEditorUserId);
create index Votes_idx_1 on Votes(PostId);
@fkenjikamei
Copy link

fkenjikamei commented Feb 10, 2017

Do you know another link to download? Because the archive.org is too slow....

@muldon
Copy link

muldon commented Feb 17, 2017

very good job ! Thanks !

@fkenjikamei
Copy link

Anyone use Mac OS? I had a problem:
#13 - Can't get stat of '/Users/kenji/Documents/stackoverflow2016/Badges.xml' (Errcode: 13 "Permission denied")

@RoelVdP
Copy link

RoelVdP commented Aug 8, 2019

Suggest using Body MEDIUMTEXT NULL instead of just TEXT. Only one more byte overhead, but much more space in case question or answer is longer (16 MiB instead of 64 KiB possible). LONGTEXT (4 GiB) is likely overdoing it :) even though only one more additional byte.

@RoelVdP
Copy link

RoelVdP commented Aug 8, 2019

Two comments if you want to use this;

  1. Make sure to add --secure_file_priv= to the mysqld startup options (or secure_file_priv= in my.cnf under [mysqld])
  2. Add this to the top of the SQL file; SET @@SESSION.SQL_MODE=''; this is especially important for late 5.7 and 8.0 versions as they have the STRICT_TRANS_TABLES SQL_MODE turned on by default (and this results in 'ERROR 1263 (22004): Column set to default value; NULL supplied to NOT NULL column 'id' at row 1'. For more info, see https://dev.mysql.com/doc/refman/5.7/en/sql-mode.html#sqlmode_strict_trans_tables

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment