Skip to content

Instantly share code, notes, and snippets.

@theolivenbaum
Created April 10, 2021 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save theolivenbaum/f92227a091d475e6325a4ef9f6faebf2 to your computer and use it in GitHub Desktop.
Save theolivenbaum/f92227a091d475e6325a4ef9f6faebf2 to your computer and use it in GitHub Desktop.
Yahoo! Answers Sitemap crawler
using System;
using System.Collections.Generic;
using System.IO;
using System.Net.Http;
using System.Threading.Tasks;
using System.Xml.Serialization;
using System.Linq;
using System.IO.Compression;
using System.Threading;
using System.Diagnostics;
using System.ComponentModel;
// Yahoo! Answers sitemap crawler.
//
// Phase 1: download the sitemap index, then every gzip-compressed sub-sitemap it lists,
//          and store each sub-sitemap's URLs as one line per URL in <dataPath>/maps/<name>.txt.
// Phase 2: read every URL back from the map files and download each page to
//          <dataPath>/dump/<yyyy>/<MM>/<dd>/<qid>.html, where the date parts are the first
//          eight characters of the question id found after the '=' in the URL.
//
// Both phases skip work whose output file already exists, so the program can be re-run to resume.

// Thresholds after which the loops wait for at least one in-flight task before queuing more.
const int MaxPendingMapTasks = 10;
const int MaxPendingPageTasks = 64;

// The sitemap model types are nested in the Models class declared in this file.
var serializerMap = new XmlSerializer(typeof(Models.sitemapindex));
var serializerUrls = new XmlSerializer(typeof(Models.urlset));

var dataPath = @"/yahoo-answers-dataset/";
var mapsPath = Path.Combine(dataPath, "maps");
Directory.CreateDirectory(mapsPath);
var dumpPath = Path.Combine(dataPath, "dump");
Directory.CreateDirectory(dumpPath);

using var client = new HttpClient();

Models.sitemapindex map;
using (var sitemap = await client.GetStreamAsync("https://answers.yahoo.com/sitemaps/sitemap-us.xml"))
{
    map = (Models.sitemapindex)serializerMap.Deserialize(sitemap);
}

var tasks = new List<Task>();

// ---- Phase 1: fetch every sub-sitemap and persist its URL list ----
// An index without <sitemap> entries deserializes to a null array; treat it as empty.
foreach (var submap in map?.sitemap ?? Array.Empty<Models.sitemapindexSitemap>())
{
    tasks.Add(Task.Run(async () =>
    {
        try
        {
            var submapName = Path.GetFileName(submap.loc);
            var file = Path.Combine(mapsPath, submapName) + ".txt";
            if (File.Exists(file))
            {
                return; // already fetched by a previous run
            }

            using var submapStream = await client.GetStreamAsync(submap.loc);
            using var gz = new GZipStream(submapStream, CompressionMode.Decompress);
            var answers = (Models.urlset)serializerUrls.Deserialize(gz);
            var urls = (answers?.url ?? Array.Empty<Models.urlsetUrl>()).Select(a => a.loc);
            await File.WriteAllLinesAsync(file, urls);
            Console.WriteLine($"Read map {submap.loc}");
        }
        catch (Exception ex)
        {
            // Best effort: a failed map is reported and skipped; it is retried on the next run.
            Console.WriteLine($"Fail to read map {submap.loc}: {ex.Message}");
        }
    }));

    if (tasks.Count > MaxPendingMapTasks)
    {
        await Task.WhenAny(tasks);
        tasks.RemoveAll(t => t.IsCompleted);
    }
}

await Task.WhenAll(tasks);
tasks.Clear();

// ---- Phase 2: download every page listed in the map files ----
long count = 0;
long total = EnumerateMaps(mapsPath).LongCount();
var sw = Stopwatch.StartNew();

foreach (var url in EnumerateMaps(mapsPath))
{
    tasks.Add(Task.Run(async () =>
    {
        var c = Interlocked.Increment(ref count);
        try
        {
            // The question id is the text between the first '=' and the next '<' (if any).
            var parts = url.Split("=");
            var qid = parts.Length > 1 ? parts[1].Split("<")[0] : string.Empty;
            if (qid.Length < 8)
            {
                Console.WriteLine($"Skipping malformed url {url}");
                return;
            }

            var year = qid.Substring(0, 4);
            var month = qid.Substring(4, 2);
            var day = qid.Substring(6, 2);
            var targetPath = Path.Combine(dumpPath, year, month, day);
            var targetFile = Path.Combine(targetPath, qid + ".html");
            Directory.CreateDirectory(targetPath);
            if (File.Exists(targetFile))
            {
                return; // already downloaded by a previous run
            }

            // Download into a side file and rename once complete, so an interrupted transfer
            // never leaves a truncated .html that the File.Exists check would skip forever.
            var partialFile = targetFile + ".part";
            using (var s = await client.GetStreamAsync(url))
            using (var f = File.Create(partialFile))
            {
                await s.CopyToAsync(f);
            }
            File.Move(partialFile, targetFile, overwrite: true);

            if (c % 1000 == 0)
            {
                Console.WriteLine($"Read {c:n0} of {total:n0}, at {100f*c/total:n1}% - last read {year}/{month}/{day} qid={qid} at {1000 / sw.Elapsed.TotalSeconds:n1} answers/s");
                sw.Restart();
            }
        }
        catch (Exception ex)
        {
            // Without this, a fault on a task pruned by RemoveAll below would vanish silently,
            // while a fault on one of the last tasks would abort the run at the final WhenAll.
            Console.WriteLine($"Fail to read {url}: {ex.Message}");
        }
    }));

    if (tasks.Count > MaxPendingPageTasks)
    {
        await Task.WhenAny(tasks);
        tasks.RemoveAll(t => t.IsCompleted);
    }
}

await Task.WhenAll(tasks);
// Lazily yields every non-blank line from every file directly inside mapsPath.
// Files are visited in whatever order Directory.EnumerateFiles returns them, and each file
// is read line by line, so the full URL list is never held in memory at once.
//
// Declared without an access modifier: after top-level statements this is a local function,
// and access modifiers such as 'private' are not valid on local functions.
static IEnumerable<string> EnumerateMaps(string mapsPath)
{
    foreach (var file in Directory.EnumerateFiles(mapsPath))
    {
        foreach (var line in File.ReadLines(file))
        {
            if (!string.IsNullOrWhiteSpace(line))
            {
                yield return line;
            }
        }
    }
}
/// <summary>
/// Container for the XML-serializable types that mirror the sitemap documents
/// (namespace <c>http://www.sitemaps.org/schemas/sitemap/0.9</c>).
/// Type and property names are lower-case because XmlSerializer maps them to the
/// XML element names as written.
/// </summary>
public class Models
{
    //Auto-generated classes for the Yahoo! Answers sitemap

    /// <summary>Root element of a sitemap index: a list of sub-sitemap references.</summary>
    [Serializable]
    [DesignerCategory("code")]
    [XmlType(AnonymousType = true, Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9")]
    [XmlRoot(Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9", IsNullable = false)]
    public partial class sitemapindex
    {
        /// <summary>The repeated <c>sitemap</c> child elements; null when the index has none.</summary>
        [XmlElement("sitemap")]
        public sitemapindexSitemap[] sitemap { get; set; }
    }

    /// <summary>One <c>sitemap</c> entry of a sitemap index.</summary>
    [Serializable]
    [DesignerCategory("code")]
    [XmlType(AnonymousType = true, Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9")]
    public partial class sitemapindexSitemap
    {
        /// <summary>Value of the <c>loc</c> element.</summary>
        public string loc { get; set; }

        /// <summary>Value of the <c>lastmod</c> element.</summary>
        public System.DateTime lastmod { get; set; }
    }

    /// <summary>Root element of a sub-sitemap: a list of URL entries.</summary>
    [Serializable]
    [DesignerCategory("code")]
    [XmlType(AnonymousType = true, Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9")]
    [XmlRoot(Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9", IsNullable = false)]
    public partial class urlset
    {
        /// <summary>The repeated <c>url</c> child elements; null when the set has none.</summary>
        [XmlElement("url")]
        public urlsetUrl[] url { get; set; }
    }

    /// <summary>One <c>url</c> entry of a sub-sitemap.</summary>
    [Serializable]
    [DesignerCategory("code")]
    [XmlType(AnonymousType = true, Namespace = "http://www.sitemaps.org/schemas/sitemap/0.9")]
    public partial class urlsetUrl
    {
        /// <summary>Value of the <c>loc</c> element.</summary>
        public string loc { get; set; }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment