Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
/// <summary>
/// Lazily reads articles from a Wikipedia XML dump, yielding one article per <c>page</c> element.
/// The file must already be decompressed (the dumps are distributed bzip2-compressed), and the
/// XML is assumed to be well-formed; no validation is performed.
/// </summary>
/// <remarks>
/// This is an iterator: the file is not opened until the returned sequence is first enumerated,
/// and it is closed when enumeration runs to completion or the enumerator is disposed
/// (which <c>foreach</c> does automatically, even on early exit).
/// </remarks>
/// <param name="filename">Path to a decompressed Wikipedia XML dump.</param>
/// <returns>A streaming sequence of the articles found in the XML file.</returns>
/// <seealso href="https://dumps.wikimedia.org/enwiki/">Wikipedia dump downloads</seealso>
public static IEnumerable<WikipediaArticle> ReadArticlesFromXmlDump(string filename)
{
    var settings = new XmlReaderSettings()
    {
        ValidationType = ValidationType.None,
        // Fragment conformance tolerates input that lacks a single root element.
        ConformanceLevel = ConformanceLevel.Fragment
    };

    // Both readers are disposed when the iterator finishes or is disposed by the caller,
    // so the underlying file handle is never leaked.
    using (var file = new StreamReader(filename))
    using (XmlReader x = XmlReader.Create(file, settings))
    {
        // ReadToFollowing returns true only when positioned on a matching element,
        // so no additional node-type check is needed here.
        while (x.ReadToFollowing("page"))
        {
            // NOTE(review): ReadArticle is defined elsewhere; presumably it consumes the
            // current page element's content and leaves the reader ready for the next search.
            var article = ReadArticle(x);
            yield return article;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.