Skip to content

Instantly share code, notes, and snippets.

@kamijin-fanta
Last active December 2, 2015 04:38
Show Gist options
  • Save kamijin-fanta/c0d9d023bcbad052b7f6 to your computer and use it in GitHub Desktop.
Save kamijin-fanta/c0d9d023bcbad052b7f6 to your computer and use it in GitHub Desktop.
C#でスクレイピングつらい

C#でスクレイピングつらい

OIC ITCreate Club Advent Calendar 2015 1日目ってことにしておいてください。

HTTPクライアント周りはあまり触ったことがなかったので触ってみたけれど、つらい。これだけ書くのに、ハマったりしながら3時間くらいかかった…

試しに、Pixivのブックマークの1ページ目にあるイラストを保存するプログラムを書いてみた。

using CsQuery;
using EasyHttp.Http;
using System;
using System.Linq;
using System.Net;
using System.Threading;
using System.Web;
/// <summary>
/// Console scraper: logs in to pixiv, reads the first page of the account's
/// bookmarks and saves each single-image illustration into a local folder.
/// </summary>
class Program
{
    private const string BaseUrl = "http://www.pixiv.net/";
    private const string LoginUrl = "https://www.secure.pixiv.net/login.php";
    private const string BookmarkUrl = "http://www.pixiv.net/bookmark.php";
    private const string SaveDirectory = @"C:\pixiv\";

    /// <summary>
    /// Entry point. Performs the login, collects the bookmarked work pages and
    /// downloads the full-size image of every work that exposes one.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Placeholder credentials: replace before running, and prefer supplying
        // them from outside the source file rather than committing real values.
        var loginParm = new
        {
            mode = "login",
            return_to = "/",
            pixiv_id = "****",
            pass = "****",
            skip = "1"
        };

        var client = new HttpClient();
        client.Request.Referer = BaseUrl;
        // Auto-redirect is disabled because cookies issued before a redirect
        // would otherwise not be retrievable from the response.
        client.Request.AllowAutoRedirect = false;

        client.BrowserLike().Get(BaseUrl);
        client.BrowserLike().Post(LoginUrl, UrlEncode(loginParm), HttpContentTypes.ApplicationXWwwFormUrlEncoded);
        Thread.Sleep(2000);

        var bookmarkRes = client.BrowserLike().Get(BookmarkUrl);
        var bookmark = CQ.CreateDocument(bookmarkRes.RawText);

        // Fetch every work page up front (same request order as before: all
        // pages first, then the image downloads). Links that are missing or
        // cannot be resolved against the site root are skipped.
        var works = bookmark.Find(".display_editable_works .image-item a.work")
            .Select(anchor => ResolveUrl(anchor.Attributes["href"]))
            .Where(url => url != null)
            .ToList()
            .Select(url => new
            {
                doc = CQ.CreateDocument(client.BrowserLike().Get(url).RawText),
                link = url,
            })
            .ToList();

        // GetAsFile writes straight to disk, so the target folder must exist.
        System.IO.Directory.CreateDirectory(SaveDirectory);

        foreach (var work in works)
        {
            var imageUrl = work.doc.Find("._illust_modal img").Attr("data-src");
            if (String.IsNullOrEmpty(imageUrl))
            {
                continue; // manga (multi-page) works expose no single image here
            }

            // The work page is sent as Referer for the image request.
            client.Request.Referer = work.link;
            var path = BuildSavePath(work.doc.Find(".work-info .title").Text(), imageUrl);
            client.GetAsFile(imageUrl, path);
            Thread.Sleep(1000);
        }
    }

    /// <summary>
    /// Resolves an href taken from the bookmark page against the site root.
    /// Handles relative ("member_illust.php?..."), root-relative ("/...") and
    /// already-absolute links.
    /// </summary>
    /// <param name="href">Raw href attribute value; may be null or empty.</param>
    /// <returns>The absolute URL, or null when the href is missing or malformed.</returns>
    private static string ResolveUrl(string href)
    {
        if (String.IsNullOrEmpty(href))
        {
            return null;
        }

        Uri resolved;
        return Uri.TryCreate(new Uri(BaseUrl), href, out resolved) ? resolved.AbsoluteUri : null;
    }

    /// <summary>
    /// Builds the local file path for a work: the trimmed title plus the
    /// extension taken from the image URL, with every character that is not
    /// allowed in a file name replaced by an underscore.
    /// </summary>
    /// <param name="title">Work title as scraped from the page; may be null or empty.</param>
    /// <param name="imageUrl">URL of the image being saved.</param>
    /// <returns>Full path inside <see cref="SaveDirectory"/>.</returns>
    private static string BuildSavePath(string title, string imageUrl)
    {
        // Extension = everything from the last '.' of the last URL path segment,
        // ignoring any query string or fragment (so ".jpeg" and "?v=1" both work).
        var urlPath = imageUrl;
        var cut = urlPath.IndexOfAny(new[] { '?', '#' });
        if (cut >= 0)
        {
            urlPath = urlPath.Substring(0, cut);
        }
        var lastSegment = urlPath.Substring(urlPath.LastIndexOf('/') + 1);
        var dot = lastSegment.LastIndexOf('.');
        var extension = dot >= 0 ? lastSegment.Substring(dot) : String.Empty;

        var baseName = (title ?? String.Empty).Trim();
        if (baseName.Length == 0)
        {
            baseName = "untitled";
        }

        // Titles are free text and routinely contain characters such as
        // '/', ':' or '?' that would otherwise break the path.
        var invalid = System.IO.Path.GetInvalidFileNameChars();
        var fileName = new string((baseName + extension)
            .Select(c => Array.IndexOf(invalid, c) >= 0 ? '_' : c)
            .ToArray());

        return System.IO.Path.Combine(SaveDirectory, fileName);
    }

    /// <summary>
    /// Serializes the public properties of <paramref name="obj"/> as an
    /// application/x-www-form-urlencoded string ("name=value&amp;name=value").
    /// Properties whose value is null are omitted.
    /// </summary>
    /// <param name="obj">Object (typically an anonymous type) whose properties are the form fields.</param>
    /// <returns>The encoded form body.</returns>
    static string UrlEncode(object obj)
    {
        var pairs = obj.GetType().GetProperties()
            // Read each property once instead of once for the filter and once for the value.
            .Select(p => new { p.Name, Value = p.GetValue(obj, null) })
            .Where(p => p.Value != null)
            .Select(p => HttpUtility.UrlEncode(p.Name) + "=" + HttpUtility.UrlEncode(p.Value.ToString()));
        return String.Join("&", pairs.ToArray());
    }
}
/// <summary>
/// Extension helpers for the EasyHttp client used by this scraper.
/// </summary>
static class HttpClientExtension
{
    /// <summary>
    /// Prepares the client's next request to look like one sent by a desktop
    /// Chrome browser: sets the User-Agent, Accept and Accept-Language headers
    /// and copies the cookies of the most recent response onto the request.
    /// </summary>
    /// <param name="client">Client to configure; mutated in place.</param>
    /// <returns>The same <paramref name="client"/>, to allow call chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="client"/> is null.</exception>
    public static HttpClient BrowserLike(this HttpClient client)
    {
        if (client == null)
        {
            throw new ArgumentNullException(nameof(client));
        }

        var request = client.Request;
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36";
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        request.AcceptLanguage = "ja,en-US;q=0.8,en;q=0.6";

        if (request.Cookies == null)
        {
            request.Cookies = new System.Net.CookieCollection();
        }

        // Response is not set until a request has been made, hence the null-conditional.
        var responseCookies = client.Response?.Cookies;
        if (responseCookies != null)
        {
            // Typed iteration instead of "as Cookie", so a null is never passed to Add.
            foreach (System.Net.Cookie cookie in responseCookies)
            {
                request.Cookies.Add(cookie);
            }
        }

        return client;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment