Skip to content

Instantly share code, notes, and snippets.

@kekyo
Created April 17, 2018 00:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kekyo/c093d982dc5040ba8ec4b6c58f4821f9 to your computer and use it in GitHub Desktop.
Save kekyo/c093d982dc5040ba8ec4b6c58f4821f9 to your computer and use it in GitHub Desktop.
イラク復興支援群の日報 370日分 - 朝日新聞
using System;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Threading.Tasks;
using AngleSharp.Dom.Html;
using AngleSharp.Parser.Html;
namespace ConsoleApp1
{
/// <summary>
/// Console tool that fetches one article page, collects every link whose href ends in
/// ".pdf", and downloads each linked file into a local "pdf" directory.
/// </summary>
public static class Program
{
    // A single shared client: creating one HttpClient per request leaves sockets lingering
    // and can exhaust them when many downloads are started at once.
    private static readonly HttpClient httpClient = new HttpClient();

    /// <summary>
    /// Downloads the resource at <paramref name="url"/> and parses it as an HTML document.
    /// </summary>
    /// <param name="url">Absolute URL of the page to fetch.</param>
    /// <returns>The parsed HTML document.</returns>
    /// <exception cref="HttpRequestException">The request failed or returned a non-success status.</exception>
    private static async Task<IHtmlDocument> GetHtmlDocumentAsync(Uri url)
    {
        using (var hs = await httpClient.GetStreamAsync(url))
        {
            var parser = new HtmlParser();
            return await parser.ParseAsync(hs);
        }
    }

    /// <summary>
    /// Downloads the resource at <paramref name="url"/> into <paramref name="basePath"/>,
    /// naming the file after the last '/'-separated segment of the URL's path and query.
    /// </summary>
    /// <param name="url">Absolute URL of the file to download.</param>
    /// <param name="referrer">Value sent in the Referer header of the request.</param>
    /// <param name="basePath">Existing directory that receives the downloaded file.</param>
    /// <exception cref="HttpRequestException">The request failed or returned a non-success status.</exception>
    private static async Task DownloadPdfDocumentAsync(Uri url, Uri referrer, string basePath)
    {
        // The Referer is attached to this request only; mutating DefaultRequestHeaders on the
        // shared client would leak the header into unrelated requests.
        using (var request = new HttpRequestMessage(HttpMethod.Get, url))
        {
            request.Headers.Referrer = referrer;

            // ResponseHeadersRead streams the body to disk instead of buffering it in memory.
            using (var response = await httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead))
            {
                response.EnsureSuccessStatusCode();

                using (var hs = await response.Content.ReadAsStreamAsync())
                {
                    var path = Path.Combine(basePath, url.PathAndQuery.Split('/').Last());
                    using (var ts = File.Create(path))
                    {
                        await hs.CopyToAsync(ts);
                        await ts.FlushAsync();
                    }
                }
            }
        }
    }

    /// <summary>
    /// Resolves <paramref name="href"/> against <paramref name="baseUri"/> when it points at a PDF.
    /// </summary>
    /// <returns>
    /// The absolute URI, or <c>null</c> when the href is blank, does not end in ".pdf",
    /// or cannot be turned into a valid URI.
    /// </returns>
    private static Uri ResolvePdfLink(Uri baseUri, string href)
    {
        if (string.IsNullOrWhiteSpace(href) || !href.EndsWith(".pdf", StringComparison.Ordinal))
        {
            return null;
        }

        // Relative hrefs must be made absolute: HttpClient rejects relative URIs when no
        // BaseAddress is set, and Uri.PathAndQuery throws on a relative Uri.
        Uri resolved;
        return Uri.TryCreate(baseUri, href, out resolved) ? resolved : null;
    }

    /// <summary>
    /// Fetches the article page, extracts its PDF links, and downloads all of them concurrently.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    private static async Task MainAsync(string[] args)
    {
        var target = new Uri("https://www.asahi.com/articles/ASL4J669JL4JUEHF016.html", UriKind.RelativeOrAbsolute);
        var document = await GetHtmlDocumentAsync(target);

        var hrefs = document.Links
            .Select(a => ResolvePdfLink(target, a.GetAttribute("href")))
            .Where(uri => uri != null)
            .Distinct()
            .ToArray();

        var basePath = "pdf";
        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(basePath);

        await Task.WhenAll(hrefs.Select(href => DownloadPdfDocumentAsync(href, target, basePath)));
    }

    /// <summary>
    /// Synchronous entry point that runs <see cref="MainAsync"/> to completion.
    /// </summary>
    static void Main(string[] args)
    {
        // GetAwaiter().GetResult() rethrows the original exception rather than wrapping it
        // in an AggregateException as Wait() does.
        MainAsync(args).GetAwaiter().GetResult();
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment