Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@ayende
Last active August 18, 2022 05:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ayende/be1a5a2704c2e9dfd3a8a581a29c09e0 to your computer and use it in GitHub Desktop.
Save ayende/be1a5a2704c2e9dfd3a8a581a29c09e0 to your computer and use it in GitHub Desktop.
using System.Xml;
using System;
using Newtonsoft.Json;
using System.Xml.Serialization;
using SharpCompress.Archives;
using System.IO;
using Raven.Client.Documents;
using System.Threading.Tasks;
using System.Collections.Generic;
using System.Diagnostics;
using Raven.Client.Documents.Session;
using System.Linq;
using var store = new DocumentStore
{
Urls = new[] { "http://127.0.0.2:8080" },
Database = "StackOverflow",
Conventions =
{
MaxNumberOfRequestsPerSession = int.MaxValue
}
}.Initialize();
var files = new[]
{
@"C:\Users\Oren\Downloads\stackoverflow.com-Posts.7z",
@"C:\Users\Oren\Downloads\stackoverflow.com-Comments.7z",
@"C:\Users\Oren\Downloads\stackoverflow.com-Users.7z",
@"C:\Users\Oren\Downloads\stackoverflow.com-Badges.7z",
};
var sp = Stopwatch.StartNew();
var answerToQuestion = new Dictionary<int, int>();
var state = new State { Session = store.OpenAsyncSession() };
foreach (var file in files)
{
foreach (var user in ImportFrom<xUser>("Users", file))
{
await Import(user, store, answerToQuestion, state);
}
foreach (var post in ImportFrom<xPost>("Posts", file))
{
if (post.PostTypeId == 2)
{
answerToQuestion[post.Id] = post.ParentId;
}
if (post.PostTypeId is 1 or 2)
{
await Import(post, store, answerToQuestion, state);
}
}
foreach (var comment in ImportFrom<xComment>("Comments", file))
{
await Import(comment, store, answerToQuestion, state);
}
foreach (var badge in ImportFrom<xBadge>("Badges", file))
{
await Import(badge, store, answerToQuestion, state);
}
}
await state.Session.SaveChangesAsync();
Console.WriteLine(sp.Elapsed);
async Task Import(object item, IDocumentStore store, Dictionary<int, int> answerToQuestion, State state)
{
if (state.Writes > 1024)
{
await state.Session.SaveChangesAsync();
state.Session.Dispose();
state.Session = store.OpenAsyncSession();
state.Writes = 0;
}
state.Writes++;
switch (item)
{
case xPost post:
{
switch (post.PostTypeId)
{
case 1: // question
{
var q = new Question
{
Id = "questions/" + post.Id,
AcceptedAnswerId = post.AcceptedAnswerId,
Answers = new(),
Body = post.Body,
Comments = new(),
CreationDate = post.CreationDate,
FavoriteCount = post.FavoriteCount,
LastEditDate = post.LastEditDate == DateTime.MinValue ? null : post.LastEditDate,
LastEditor = post.LastEditorUserId == 0 ? null : "users/" + post.LastEditorUserId,
Owner = "users/" + post.OwnerUserId,
Score = post.Score,
Tags = post.Tags.Split(new[]{'<','>'}, StringSplitOptions.RemoveEmptyEntries),
Title = post.Title,
ViewCount = post.ViewCount
};
if (state.OutOfOrders.Remove(post.Id, out var ooo))
{
q.Answers = ooo;
}
await state.Session.StoreAsync(q, "questions/" + post.Id);
break;
}
case 2:// answer
{
var a = new Answer
{
Body = post.Body,
Comments = new(),
CreationDate = post.CreationDate,
Id = post.Id,
LastEditDate = post.LastEditDate == DateTime.MinValue ? null : post.LastEditDate,
LastEditor = post.LastEditorUserId == 0 ? null : "users/" + post.LastEditorUserId,
Owner = "users/" + post.OwnerUserId,
Score = post.Score,
ViewCount = post.ViewCount
};
answerToQuestion[post.Id] = post.ParentId;
var q = await state.Session.LoadAsync<Question>("questions/" + post.ParentId);
if (q == null)
{
if (state.OutOfOrders.TryGetValue(post.ParentId, out var ooo) == false)
{
state.OutOfOrders[post.ParentId] = ooo = new List<Answer>();
}
ooo.Add(a);
return;
}
q.Answers.Add(a);
break;
}
}
break;
}
case xComment comment:
{
var c = new Comment
{
CreationDate = comment.CreationDate,
Id = comment.Id,
Score = comment.Score,
Text = comment.Text,
User = "users/" + comment.UserId,
};
if (answerToQuestion.TryGetValue(comment.PostId, out var qId))
{
var q = await state.Session.LoadAsync<Question>("questions/" + qId);
if (q == null)
{
Console.WriteLine("Missing question's answer: " + qId + " -> " + comment.PostId);
return;
}
try
{
q.Answers.Single(a => a.Id == comment.PostId).Comments.Add(c);
}
catch (Exception e)
{
Console.WriteLine(e);
Console.WriteLine(comment.PostId + " -> " + qId);
}
}
else
{
var q = await state.Session.LoadAsync<Question>("questions/" + comment.PostId);
if (q == null)
{
Console.WriteLine("Missing question: " + comment.PostId);
return;
}
q.Comments.Add(c);
}
break;
}
case xUser user:
{
var u = new User
{
CreationDate = user.CreationDate,
Id = "users/" + user.Id,
AboutMe = user.AboutMe,
DisplayName = user.DisplayName,
DownVotes = user.DownVotes,
LastAccessDate = user.LastAccessDate,
Reputation = user.Reputation,
UpVotes = user.UpVotes,
Views = user.UpVotes,
Badges = new()
};
await state.Session.StoreAsync(u, "users/" + user.Id);
break;
}
case xBadge badge:
{
var u = await state.Session.LoadAsync<User>("users/" + badge.UserId);
if (u == null) return;
u.Badges.Add(new Badge
{
Date = badge.Date,
Name = badge.Name,
Rank = badge.Class switch
{
1 => "Gold",
2 => "Silver",
3 => "Bronze",
_ => null
},
TagBased = bool.Parse(badge.TagBased)
});
break;
}
}
}
IEnumerable<T> ImportFrom<T>(string source, string file)
{
Console.WriteLine("Importing " + source + " from " + file);
using var archive = ArchiveFactory.Open(file);
using var bulk = store.BulkInsert();
foreach (var entry in archive.Entries)
{
if (Path.GetFileNameWithoutExtension(entry.Key) != source)
continue;
XmlReader reader = GetReader(entry);
XmlSerializer serializer = new XmlSerializer(typeof(T));
while (reader.Read())
{
if (reader.NodeType != XmlNodeType.Element)
continue;
yield return (T)serializer.Deserialize(reader);
}
}
}
static XmlReader GetReader(IArchiveEntry entry)
{
var stream = entry.OpenEntryStream();
var reader = XmlReader.Create(stream);
reader.MoveToContent();
return reader;
}
public class State
{
public IAsyncDocumentSession Session;
public int Writes;
public Dictionary<int, List<Answer>> OutOfOrders = new();
}
[XmlRoot(ElementName = "row")]
public class xUser
{
[JsonIgnore]
[XmlAttribute(AttributeName = "Id")]
public int Id { get; set; }
[XmlAttribute(AttributeName = "Reputation")]
public int Reputation { get; set; }
[XmlAttribute(AttributeName = "CreationDate")]
public DateTime CreationDate { get; set; }
[XmlAttribute(AttributeName = "DisplayName")]
public string DisplayName { get; set; }
[XmlAttribute(AttributeName = "LastAccessDate")]
public DateTime LastAccessDate { get; set; }
[XmlAttribute(AttributeName = "AboutMe")]
public string AboutMe { get; set; }
[XmlAttribute(AttributeName = "Views")]
public int Views { get; set; }
[XmlAttribute(AttributeName = "UpVotes")]
public int UpVotes { get; set; }
[XmlAttribute(AttributeName = "DownVotes")]
public int DownVotes { get; set; }
}
[XmlRoot(ElementName = "row")]
public class xTag
{
[XmlAttribute(AttributeName = "Id")]
public int Id { get; set; }
[XmlAttribute(AttributeName = "TagName")]
public string TagName { get; set; }
[XmlAttribute(AttributeName = "Count")]
public int Count { get; set; }
[XmlAttribute(AttributeName = "ExcerptPostId")]
public int ExcerptPostId { get; set; }
[XmlAttribute(AttributeName = "WikiPostId")]
public int WikiPostId { get; set; }
}
[XmlRoot(ElementName = "row")]
public class xBadge
{
[XmlAttribute(AttributeName = "Id")]
public int Id { get; set; }
[XmlAttribute(AttributeName = "UserId")]
public int UserId { get; set; }
[XmlAttribute(AttributeName = "Name")]
public string Name { get; set; }
[XmlAttribute(AttributeName = "Date")]
public DateTime Date { get; set; }
[XmlAttribute(AttributeName = "Class")]
public int Class { get; set; }
[XmlAttribute(AttributeName = "TagBased")]
public string TagBased { get; set; }
}
[XmlRoot(ElementName = "row")]
public class xComment
{
[XmlAttribute(AttributeName = "Id")]
public int Id { get; set; }
[XmlAttribute(AttributeName = "PostId")]
public int PostId { get; set; }
[XmlAttribute(AttributeName = "Score")]
public int Score { get; set; }
[XmlAttribute(AttributeName = "Text")]
public string Text { get; set; }
[XmlAttribute(AttributeName = "CreationDate")]
public DateTime CreationDate { get; set; }
[XmlAttribute(AttributeName = "UserId")]
public int UserId { get; set; }
[XmlAttribute(AttributeName = "ContentLicense")]
public string ContentLicense { get; set; }
}
[XmlRoot(ElementName = "row")]
public class xPostHistory
{
[XmlAttribute(AttributeName = "Id")]
public int Id { get; set; }
[XmlAttribute(AttributeName = "PostHistoryTypeId")]
public int PostHistoryTypeId { get; set; }
[XmlAttribute(AttributeName = "PostId")]
public int PostId { get; set; }
[XmlAttribute(AttributeName = "RevisionGUID")]
public string RevisionGUID { get; set; }
[XmlAttribute(AttributeName = "CreationDate")]
public DateTime CreationDate { get; set; }
[XmlAttribute(AttributeName = "UserId")]
public int UserId { get; set; }
[XmlAttribute(AttributeName = "Text")]
public string Text { get; set; }
[XmlAttribute(AttributeName = "ContentLicense")]
public string ContentLicense { get; set; }
}
[XmlRoot(ElementName = "row")]
public class xPostLink
{
[XmlAttribute(AttributeName = "Id")]
public int Id { get; set; }
[XmlAttribute(AttributeName = "CreationDate")]
public DateTime CreationDate { get; set; }
[XmlAttribute(AttributeName = "PostId")]
public int PostId { get; set; }
[XmlAttribute(AttributeName = "RelatedPostId")]
public int RelatedPostId { get; set; }
[XmlAttribute(AttributeName = "LinkTypeId")]
public int LinkTypeId { get; set; }
}
[XmlRoot(ElementName = "row")]
public class xPost
{
[XmlAttribute(AttributeName = "Id")]
public int Id { get; set; }
[XmlAttribute(AttributeName = "PostTypeId")]
public int PostTypeId { get; set; }
[XmlAttribute(AttributeName = "ParentId")]
public int ParentId { get; set; }
[XmlAttribute(AttributeName = "AcceptedAnswerId")]
public int AcceptedAnswerId { get; set; }
[XmlAttribute(AttributeName = "CreationDate")]
public DateTime CreationDate { get; set; }
[XmlAttribute(AttributeName = "Score")]
public int Score { get; set; }
[XmlAttribute(AttributeName = "ViewCount")]
public int ViewCount { get; set; }
[XmlAttribute(AttributeName = "Body")]
public string Body { get; set; }
[XmlAttribute(AttributeName = "OwnerUserId")]
public int OwnerUserId { get; set; }
[XmlAttribute(AttributeName = "LastEditorUserId")]
public int LastEditorUserId { get; set; }
[XmlAttribute(AttributeName = "LastEditDate")]
public DateTime LastEditDate { get; set; }
[XmlAttribute(AttributeName = "LastActivityDate")]
public DateTime LastActivityDate { get; set; }
[XmlAttribute(AttributeName = "Title")]
public string Title { get; set; }
[XmlAttribute(AttributeName = "Tags")]
public string Tags { get; set; }
[XmlAttribute(AttributeName = "AnswerCount")]
public int AnswerCount { get; set; }
[XmlAttribute(AttributeName = "CommentCount")]
public int CommentCount { get; set; }
[XmlAttribute(AttributeName = "FavoriteCount")]
public int FavoriteCount { get; set; }
[XmlAttribute(AttributeName = "ContentLicense")]
public string ContentLicense { get; set; }
}
[XmlRoot(ElementName = "row")]
public class xVote
{
[XmlAttribute(AttributeName = "Id")]
public int Id { get; set; }
[XmlAttribute(AttributeName = "PostId")]
public int PostId { get; set; }
[XmlAttribute(AttributeName = "VoteTypeId")]
public int VoteTypeId { get; set; }
[XmlAttribute(AttributeName = "CreationDate")]
public DateTime CreationDate { get; set; }
}
public class Question
{
public string Id;
public int AcceptedAnswerId;
public DateTime CreationDate;
public int Score;
public int ViewCount;
public string Body;
public string Owner;
public string LastEditor;
public DateTime? LastEditDate;
public string Title;
public string[] Tags;
public int FavoriteCount;
public List<Answer> Answers;
public List<Comment> Comments;
}
public class Answer
{
public int Id;
public DateTime CreationDate;
public int Score;
public int ViewCount;
public string Body;
public string Owner;
public string LastEditor;
public DateTime? LastEditDate;
public List<Comment> Comments;
}
public class Comment
{
public int Id;
public int Score;
public string Text;
public DateTime CreationDate;
public string User;
}
public class User
{
public string Id;
public int Reputation;
public DateTime CreationDate;
public string DisplayName;
public DateTime LastAccessDate;
public string AboutMe;
public int Views;
public int UpVotes;
public int DownVotes;
public List<Badge> Badges;
}
public class Badge
{
public string Name;
public DateTime Date;
public bool TagBased;
public string Rank;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment