Skip to content

Instantly share code, notes, and snippets.

@jesuslpm
Last active September 30, 2016 07:21
Show Gist options
  • Save jesuslpm/07fb8121b1747bd69fb43581c4a576d9 to your computer and use it in GitHub Desktop.
Save jesuslpm/07fb8121b1747bd69fb43581c4a576d9 to your computer and use it in GitHub Desktop.
StackOverflow THAT
<?xml version="1.0" encoding="utf-8" ?>
<rows>
<row Id="4"
PostTypeId="1"
Body="This is the body of question 4"
Title="When setting a form's opacity should I use a decimal or double"
AnswerCount="5"
/>
<row Id="6"
PostTypeId="1"
Body="This is the body of question 6"
Title="Percentage width child element in absolutely positioned parent on Internet Explorer 7"
AnswerCount="5"
/>
<row Id="7"
PostTypeId="1"
ParentId="4"
Body="This is the body of answer 7 of question 4"
/>
</rows>
using System;
using System.Collections.Generic;
using System.IO;
using System.Xml;
using System.Text;
using System.Xml.Linq;
using System.IO.Compression;
namespace StackOverflowThat
{
class Program
{
/// <summary>
/// Program entry point. Runs the preparation phase and then the output phase.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Phase 1: turn the rows of Posts.xml into per-question files on disk.
    PreparationPhase();

    // Phase 2: pack the generated files into the compressed output file.
    OutputPhase();
}
/// <summary>
/// Streams the rows of Posts.xml (expected in the application base directory)
/// and passes each one to <c>ProcessRow</c>.
/// </summary>
static void PreparationPhase()
{
    var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
    var inputPath = Path.Combine(baseDirectory, "Posts.xml");

    using (var input = new StreamReader(inputPath, Encoding.UTF8))
    {
        using (var xml = new XmlTextReader(input))
        {
            // Rows are produced lazily, so only one row is materialized at a time.
            foreach (var post in EnumerateRows(xml))
            {
                ProcessRow(post);
            }
        }
    }
}
/// <summary>
/// Lazily yields each child element of the document's root element.
/// </summary>
/// <remarks>
/// <c>XNode.ReadFrom</c> consumes the whole element and leaves the reader on the
/// node that follows it. The loop therefore only calls <c>Read</c> when the
/// current node is NOT an element; advancing after every element would skip
/// every other row whenever two rows are not separated by whitespace.
/// </remarks>
/// <param name="reader">Reader positioned at or before the root element.</param>
/// <returns>The root's child elements, in document order, materialized one at a time.</returns>
static IEnumerable<XElement> EnumerateRows(XmlReader reader)
{
    reader.MoveToContent();

    // Step from the root element onto its first child node (or to EOF for an empty root).
    reader.Read();

    while (!reader.EOF)
    {
        if (reader.NodeType == XmlNodeType.Element)
        {
            // ReadFrom already advances the reader past this element.
            yield return (XElement)XNode.ReadFrom(reader);
        }
        else
        {
            // Whitespace, comments, the root's end tag, ...: just move on.
            reader.Read();
        }
    }
}
/// <summary>
/// Persists a single post row. A row without a ParentId attribute is written to
/// its own file as a "question" element; a row with a ParentId is appended, as an
/// "answer" element, to the file of the question it refers to.
/// </summary>
/// <remarks>
/// An answer is expected to appear after its question in the input: the question
/// file is opened for reading, which throws if it does not exist yet.
/// Files are written as UTF-8 WITHOUT a byte order mark, because the output phase
/// concatenates them byte-for-byte and a BOM per file would end up in the middle
/// of the combined stream.
/// </remarks>
/// <param name="row">The post element. It is renamed in place to "question" or "answer".</param>
static void ProcessRow(XElement row)
{
    // Encoding.UTF8 would prepend a BOM to every file it writes.
    var utf8NoBom = new UTF8Encoding(false);

    var parentId = (string)row.Attribute("ParentId");
    if (parentId == null)
    {
        // GetFilePath returns a path that starts with a separator, so it is appended
        // by plain concatenation; Path.Combine would treat it as rooted and drop the
        // base directory.
        var filePath = AppDomain.CurrentDomain.BaseDirectory + GetFilePath((string)row.Attribute("Id"));

        // CreateDirectory does nothing when the folder already exists.
        Directory.CreateDirectory(Path.GetDirectoryName(filePath));

        row.Name = "question";
        File.WriteAllText(filePath, row.ToString(), utf8NoBom);
    }
    else
    {
        var filePath = AppDomain.CurrentDomain.BaseDirectory + GetFilePath(parentId);

        // Load the existing question, append this answer, and write the file back.
        XElement question;
        using (var reader = new StreamReader(filePath, Encoding.UTF8))
        {
            question = XElement.Load(reader);
        }
        row.Name = "answer";
        question.Add(row);
        File.WriteAllText(filePath, question.ToString(), utf8NoBom);
    }
}
/// <summary>
/// Builds the relative file path for a row id.
/// The id is cut into groups of up to three characters, starting from the right;
/// every group is prefixed with a backslash and ".xml" is appended, so the last
/// group is the file name and the preceding groups are folders.
/// This keeps each folder at no more than 1000 files; longer ids give deeper paths.
/// For example: id = 1234567, path = \1\234\567.xml
/// </summary>
/// <param name="id">The row id.</param>
/// <returns>The path, starting with a backslash (just ".xml" for an empty id).</returns>
static string GetFilePath(string id)
{
    var builder = new StringBuilder(".xml");

    // Walk the id from its end towards its start, three characters at a time.
    for (var end = id.Length; end > 0; end -= 3)
    {
        var start = Math.Max(0, end - 3);
        builder.Insert(0, id.Substring(start, end - start));
        builder.Insert(0, "\\");
    }

    return builder.ToString();
}
/// <summary>
/// Concatenates the contents of all generated question files into a single
/// gzip-compressed file, "Ouput.gzip", in the application base directory.
/// </summary>
/// <remarks>
/// Question files with ids of up to three characters are stored directly in the
/// base directory, which is also where the Posts.xml input lives. The top level is
/// therefore scanned here with Posts.xml explicitly skipped; otherwise the whole
/// input file would be copied into the output as well.
/// </remarks>
static void OutputPhase()
{
    var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;

    // NOTE(review): "Ouput" looks like a typo for "Output"; kept because the file
    // name is externally visible.
    var outputFilePath = Path.Combine(baseDirectory, "Ouput.gzip");

    using (var stream = File.Open(outputFilePath, FileMode.Create, FileAccess.Write))
    using (var gzip = new GZipStream(stream, CompressionMode.Compress))
    {
        // Top-level question files, excluding the input document itself.
        foreach (var filePath in Directory.EnumerateFiles(baseDirectory, "*.xml"))
        {
            if (string.Equals(Path.GetFileName(filePath), "Posts.xml", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }
            using (var file = File.OpenRead(filePath))
            {
                file.CopyTo(gzip);
            }
        }

        // Everything below the base directory was generated by the preparation phase.
        foreach (var subDirectory in Directory.EnumerateDirectories(baseDirectory))
        {
            AddXmlFilesToStream(subDirectory, gzip);
        }
    }
}
/// <summary>
/// Copies the raw bytes of every "*.xml" file found in a folder, and then in each
/// of its sub-folders (depth-first), to the given stream.
/// </summary>
/// <param name="folderPath">Folder at which the traversal starts.</param>
/// <param name="output">Stream that receives the file contents; it is not closed here.</param>
static void AddXmlFilesToStream(string folderPath, Stream output)
{
    // Explicit stack instead of recursion; the visiting order is the same pre-order walk.
    var pending = new Stack<string>();
    pending.Push(folderPath);

    while (pending.Count > 0)
    {
        var current = pending.Pop();

        foreach (var xmlPath in Directory.EnumerateFiles(current, "*.xml"))
        {
            using (var source = File.OpenRead(xmlPath))
            {
                source.CopyTo(output);
            }
        }

        // Push children in reverse so that the first sub-folder is popped (visited) first.
        var children = new List<string>(Directory.EnumerateDirectories(current));
        for (var i = children.Count - 1; i >= 0; i--)
        {
            pending.Push(children[i]);
        }
    }
}
}
}
using System;
using System.Collections.Generic;
using System.IO;
using System.Xml;
using System.Text;
using System.Xml.Linq;
using System.IO.Compression;
namespace StackOverflowThat
{
class Program
{
/// <summary>
/// Program entry point. Runs the preparation phase and then the output phase.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Phase 1: build the per-question files on disk from Posts.xml.
    PreparationPhase();

    // Phase 2: pack the generated files into the compressed output file.
    OutputPhase();
}
/// <summary>
/// Builds the per-question files in two passes over the input: the first pass
/// writes one file per question, the second appends every answer to the file of
/// its question. Two passes mean answers need not come after their question.
/// </summary>
static void PreparationPhase()
{
    // Pass 1: every question file must exist before any answer is attached.
    WriteQuestionFiles();

    // Pass 2: attach the answers.
    AddAnswersToQuestions();
}
/// <summary>
/// First pass over the input: writes every row that has no ParentId attribute to
/// its own file as a "question" element. Rows with a ParentId are ignored here.
/// </summary>
/// <remarks>
/// Files are written as UTF-8 WITHOUT a byte order mark, because the output phase
/// concatenates them byte-for-byte and a BOM per file would end up in the middle
/// of the combined stream.
/// </remarks>
static void WriteQuestionFiles()
{
    // Encoding.UTF8 would prepend a BOM to every file it writes.
    var utf8NoBom = new UTF8Encoding(false);

    foreach (var row in EnumerateRows())
    {
        var parentId = (string)row.Attribute("ParentId");
        if (parentId != null)
        {
            // Has a parent, so it is not a question; skipped in this pass.
            continue;
        }

        // GetFilePath returns a path that starts with a separator, so it is appended
        // by plain concatenation; Path.Combine would treat it as rooted and drop the
        // base directory.
        var filePath = AppDomain.CurrentDomain.BaseDirectory + GetFilePath((string)row.Attribute("Id"));

        // CreateDirectory does nothing when the folder already exists.
        Directory.CreateDirectory(Path.GetDirectoryName(filePath));

        row.Name = "question";
        File.WriteAllText(filePath, row.ToString(), utf8NoBom);
    }
}
/// <summary>
/// Second pass over the input: every row that has a ParentId attribute is
/// appended, as an "answer" element, to the file of the question it refers to.
/// Rows without a ParentId are ignored here.
/// </summary>
/// <remarks>
/// The question file must already exist; opening it throws otherwise.
/// Files are rewritten as UTF-8 WITHOUT a byte order mark, because the output
/// phase concatenates them byte-for-byte and a BOM per file would end up in the
/// middle of the combined stream.
/// </remarks>
static void AddAnswersToQuestions()
{
    // Encoding.UTF8 would prepend a BOM to every file it writes.
    var utf8NoBom = new UTF8Encoding(false);

    foreach (var row in EnumerateRows())
    {
        var parentId = (string)row.Attribute("ParentId");
        if (parentId == null)
        {
            // No parent, so it is a question; nothing to attach.
            continue;
        }

        var filePath = AppDomain.CurrentDomain.BaseDirectory + GetFilePath(parentId);

        // Load the existing question, append this answer, and write the file back.
        XElement question;
        using (var reader = new StreamReader(filePath, Encoding.UTF8))
        {
            question = XElement.Load(reader);
        }
        row.Name = "answer";
        question.Add(row);
        File.WriteAllText(filePath, question.ToString(), utf8NoBom);
    }
}
/// <summary>
/// Opens Posts.xml from the application base directory and lazily yields each
/// child element of its root element. The file stays open while the sequence is
/// being enumerated and is closed when enumeration finishes or is disposed.
/// </summary>
/// <remarks>
/// <c>XNode.ReadFrom</c> consumes the whole element and leaves the reader on the
/// node that follows it. The loop therefore only calls <c>Read</c> when the
/// current node is NOT an element; advancing after every element would skip
/// every other row whenever two rows are not separated by whitespace.
/// </remarks>
/// <returns>The root's child elements, in document order, materialized one at a time.</returns>
static IEnumerable<XElement> EnumerateRows()
{
    var postsFilePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Posts.xml");
    using (var textReader = new StreamReader(postsFilePath, Encoding.UTF8))
    using (var reader = new XmlTextReader(textReader))
    {
        reader.MoveToContent();

        // Step from the root element onto its first child node (or to EOF for an empty root).
        reader.Read();

        while (!reader.EOF)
        {
            if (reader.NodeType == XmlNodeType.Element)
            {
                // ReadFrom already advances the reader past this element.
                yield return (XElement)XNode.ReadFrom(reader);
            }
            else
            {
                // Whitespace, comments, the root's end tag, ...: just move on.
                reader.Read();
            }
        }
    }
}
/// <summary>
/// Builds the relative file path for a row id.
/// The id is cut into groups of up to three characters, starting from the right;
/// every group is prefixed with a backslash and ".xml" is appended, so the last
/// group is the file name and the preceding groups are folders.
/// This keeps each folder at no more than 1000 files; longer ids give deeper paths.
/// For example: id = 1234567, path = \1\234\567.xml
/// </summary>
/// <param name="id">The row id.</param>
/// <returns>The path, starting with a backslash (just ".xml" for an empty id).</returns>
static string GetFilePath(string id)
{
    var segments = new List<string>();

    // Collect the groups right-to-left, inserting at the front to keep them in id order.
    for (var end = id.Length; end > 0; end -= 3)
    {
        var start = end > 3 ? end - 3 : 0;
        segments.Insert(0, "\\" + id.Substring(start, end - start));
    }

    return string.Concat(segments) + ".xml";
}
/// <summary>
/// Concatenates the contents of all generated question files into a single
/// gzip-compressed file, "Ouput.gzip", in the application base directory.
/// </summary>
/// <remarks>
/// Question files with ids of up to three characters are stored directly in the
/// base directory, which is also where the Posts.xml input lives. The top level is
/// therefore scanned here with Posts.xml explicitly skipped; otherwise the whole
/// input file would be copied into the output as well.
/// </remarks>
static void OutputPhase()
{
    var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;

    // NOTE(review): "Ouput" looks like a typo for "Output"; kept because the file
    // name is externally visible.
    var outputFilePath = Path.Combine(baseDirectory, "Ouput.gzip");

    using (var stream = File.Open(outputFilePath, FileMode.Create, FileAccess.Write))
    using (var gzip = new GZipStream(stream, CompressionMode.Compress))
    {
        // Top-level question files, excluding the input document itself.
        foreach (var filePath in Directory.EnumerateFiles(baseDirectory, "*.xml"))
        {
            if (string.Equals(Path.GetFileName(filePath), "Posts.xml", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }
            using (var file = File.OpenRead(filePath))
            {
                file.CopyTo(gzip);
            }
        }

        // Everything below the base directory was generated by the preparation phase.
        foreach (var subDirectory in Directory.EnumerateDirectories(baseDirectory))
        {
            AddXmlFilesToStream(subDirectory, gzip);
        }
    }
}
/// <summary>
/// Copies the raw bytes of every "*.xml" file in a folder to the given stream,
/// then does the same for each sub-folder, recursively.
/// </summary>
/// <param name="folderPath">Folder to scan.</param>
/// <param name="output">Stream that receives the file contents; it is not closed here.</param>
static void AddXmlFilesToStream(string folderPath, Stream output)
{
    // Files of this folder first ...
    var xmlFiles = Directory.EnumerateFiles(folderPath, "*.xml");
    foreach (var xmlPath in xmlFiles)
    {
        using (var source = File.OpenRead(xmlPath))
        {
            source.CopyTo(output);
        }
    }

    // ... then descend into each child folder.
    var childFolders = Directory.EnumerateDirectories(folderPath);
    foreach (var childFolder in childFolders)
    {
        AddXmlFilesToStream(childFolder, output);
    }
}
}
}
@jesuslpm
Copy link
Author

This is my response to Ayende's interview question: https://ayende.com/blog/175617/interview-question-stackoverflow-that

@jesuslpm
Copy link
Author

jesuslpm commented Sep 30, 2016

Program.cs assumes that answers come later than their questions in Posts.xml. Program2.cs does not make that assumption, so it has to read Posts.xml twice.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment