Created
April 10, 2021 14:09
-
-
Save theolivenbaum/f92227a091d475e6325a4ef9f6faebf2 to your computer and use it in GitHub Desktop.
Yahoo! Answers Sitemap crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Net.Http; | |
using System.Threading.Tasks; | |
using System.Xml.Serialization; | |
using System.Linq; | |
using System.IO.Compression; | |
using System.Threading; | |
using System.Diagnostics; | |
using System.ComponentModel; | |
// ---------------------------------------------------------------------------
// Phase 1: download the sitemap index and store every sub-sitemap as a text
// file (one URL per line) under <dataPath>/maps.
// ---------------------------------------------------------------------------

// Serializers for the sitemap index and for the per-sitemap URL lists.
// The model types are nested inside the Models class declared at the end of this file.
var serializerMap = new XmlSerializer(typeof(Models.sitemapindex));
var serializerUrls = new XmlSerializer(typeof(Models.urlset));

// "maps" receives one text file per sub-sitemap, "dump" receives the downloaded pages.
var dataPath = @"/yahoo-answers-dataset/";
var mapsPath = Path.Combine(dataPath, "maps");
Directory.CreateDirectory(mapsPath);
var dumpPath = Path.Combine(dataPath, "dump");
Directory.CreateDirectory(dumpPath);

// A single HttpClient is shared by every download in the program.
using var client = new HttpClient();

// Fetch and parse the sitemap index; the response stream is released right after parsing.
Models.sitemapindex map;
using (var sitemap = await client.GetStreamAsync("https://answers.yahoo.com/sitemaps/sitemap-us.xml"))
{
    map = (Models.sitemapindex)serializerMap.Deserialize(sitemap);
}

var tasks = new List<Task>();
foreach (var sm in map?.sitemap ?? Array.Empty<Models.sitemapindexSitemap>())
{
    var submap = sm;
    tasks.Add(Task.Run(async () =>
    {
        try
        {
            var submapName = Path.GetFileName(submap.loc);
            var file = Path.Combine(mapsPath, submapName) + ".txt";

            // A map file that already exists was written by a previous run; skip it.
            if (File.Exists(file))
            {
                return;
            }

            // Sub-sitemaps are fetched as gzip streams and decompressed on the fly.
            using var submapStream = await client.GetStreamAsync(submap.loc);
            using var gz = new GZipStream(submapStream, CompressionMode.Decompress);
            var answers = (Models.urlset)serializerUrls.Deserialize(gz);

            // A sitemap without any <url> element deserializes to a null array.
            var urls = answers?.url?.Select(a => a.loc) ?? Enumerable.Empty<string>();
            await File.WriteAllLinesAsync(file, urls);
            Console.WriteLine($"Read map {submap.loc}");
        }
        catch (Exception ex)
        {
            // Best effort: report the failure (with its reason) and continue with the other maps.
            Console.WriteLine($"Fail to read map {submap.loc}: {ex.Message}");
        }
    }));

    // Throttle: once more than 10 downloads are queued, wait for one to finish
    // and drop every completed task from the list.
    if (tasks.Count > 10)
    {
        await Task.WhenAny(tasks);
        tasks.RemoveAll(t => t.IsCompleted);
    }
}

await Task.WhenAll(tasks);
tasks.Clear();
// ---------------------------------------------------------------------------
// Phase 2: download every URL listed in the map files into
// <dumpPath>/<year>/<month>/<day>/<qid>.html.
// ---------------------------------------------------------------------------

// Count the URLs up front so progress can be reported as a percentage.
long count = 0;
long total = EnumerateMaps(mapsPath).LongCount();
var sw = Stopwatch.StartNew();

foreach (var u in EnumerateMaps(mapsPath))
{
    var url = u;
    tasks.Add(Task.Run(async () =>
    {
        var c = Interlocked.Increment(ref count);
        try
        {
            // The question id is the text after the first '=' (up to an optional '<').
            var parts = url.Split("=");
            if (parts.Length < 2)
            {
                Console.WriteLine($"Skipping url without a qid: {url}");
                return;
            }

            // The first 8 characters of the qid are used as year / month / day folders.
            var qid = parts[1].Split("<")[0];
            if (qid.Length < 8)
            {
                Console.WriteLine($"Skipping url with a malformed qid: {url}");
                return;
            }

            var year = qid.Substring(0, 4);
            var month = qid.Substring(4, 2);
            var day = qid.Substring(6, 2);
            var targetPath = Path.Combine(dumpPath, year, month, day);
            var targetFile = Path.Combine(targetPath, qid + ".html");
            Directory.CreateDirectory(targetPath);

            // Already downloaded by a previous run.
            if (File.Exists(targetFile))
            {
                return;
            }

            // Download into a temporary file first and rename it afterwards, so an
            // interrupted transfer never leaves a truncated file that a later run
            // would mistake for a finished download.
            var tempFile = targetFile + ".part";
            using (var s = await client.GetStreamAsync(url))
            using (var f = File.Create(tempFile))
            {
                await s.CopyToAsync(f);
            }
            File.Move(tempFile, targetFile, true);

            if (c % 1000 == 0)
            {
                Console.WriteLine($"Read {c:n0} of {total:n0}, at {100f*c/total:n1}% - last read {year}/{month}/{day} qid={qid} at {1000 / sw.Elapsed.TotalSeconds:n1} answers/s");
                sw.Restart();
            }
        }
        catch (Exception ex)
        {
            // Best effort, like the sitemap phase: report the failure and keep crawling.
            Console.WriteLine($"Fail to read {url}: {ex.Message}");
        }
    }));

    // Throttle: once more than 64 downloads are queued, wait for one to finish
    // and drop every completed task from the list.
    if (tasks.Count > 64)
    {
        await Task.WhenAny(tasks);
        tasks.RemoveAll(t => t.IsCompleted);
    }
}

await Task.WhenAll(tasks);
// Lazily yields every non-blank line of every file found directly inside mapsPath.
// Files are visited in the order returned by Directory.EnumerateFiles and are read
// line by line, so no file is loaded into memory as a whole.
// Declared without an access modifier: in a top-level-statements file this is a
// local function, and local functions cannot carry modifiers such as 'private'.
static IEnumerable<string> EnumerateMaps(string mapsPath)
{
    foreach (var file in Directory.EnumerateFiles(mapsPath))
    {
        foreach (var line in File.ReadLines(file))
        {
            // Empty and whitespace-only lines are skipped.
            if (!string.IsNullOrWhiteSpace(line))
            {
                yield return line;
            }
        }
    }
}
/// <summary>
/// Container for the XML serialization models of the Yahoo! Answers sitemap files.
/// Type and member names are lower-case because they mirror the XML element names
/// in the "http://www.sitemaps.org/schemas/sitemap/0.9" namespace.
/// </summary>
public class Models
{
    //Auto-generated classes for the Yahoo! Answers sitemap

    /// <summary>
    /// Root element of a sitemap index document: a list of <c>sitemap</c> entries.
    /// </summary>
    [Serializable]
    [DesignerCategory("code")]
    [XmlType(AnonymousType = true, Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9")]
    [XmlRoot(Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9", IsNullable = false)]
    public partial class sitemapindex
    {
        /// <summary>The <c>sitemap</c> child elements; null when the index contains none.</summary>
        [XmlElement("sitemap")]
        public sitemapindexSitemap[] sitemap { get; set; }
    }

    /// <summary>
    /// One <c>sitemap</c> entry of the index.
    /// </summary>
    [Serializable]
    [DesignerCategory("code")]
    [XmlType(AnonymousType = true, Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9")]
    public partial class sitemapindexSitemap
    {
        /// <summary>Content of the <c>loc</c> element.</summary>
        public string loc { get; set; }

        /// <summary>Content of the <c>lastmod</c> element.</summary>
        public System.DateTime lastmod { get; set; }
    }

    /// <summary>
    /// Root element of a sitemap document: a list of <c>url</c> entries.
    /// </summary>
    [Serializable]
    [DesignerCategory("code")]
    [XmlType(AnonymousType = true, Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9")]
    [XmlRoot(Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9", IsNullable = false)]
    public partial class urlset
    {
        /// <summary>The <c>url</c> child elements; null when the sitemap contains none.</summary>
        [XmlElement("url")]
        public urlsetUrl[] url { get; set; }
    }

    /// <summary>
    /// One <c>url</c> entry of a sitemap.
    /// </summary>
    [Serializable]
    [DesignerCategory("code")]
    [XmlType(AnonymousType = true, Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9")]
    public partial class urlsetUrl
    {
        /// <summary>Content of the <c>loc</c> element.</summary>
        public string loc { get; set; }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment