Skip to content

Instantly share code, notes, and snippets.

@kamijin-fanta kamijin-fanta/README.md
Last active Dec 2, 2015

Embed
What would you like to do?
C#でスクレイピングつらい

C#でスクレイピングつらい

OIC ITCreate Club Advent Calendar 2015 1日目ってことにしておいてください。

HTTPクライアント周りはあまり触ったことがなかったので試しに触ってみたが、つらい。これだけ書くのにも、あちこちでハマりながら3時間くらいかかった…

試しに、Pixivのブックマーク1ページ目にあるイラストを保存するプログラムを書いてみた。

using CsQuery;
using EasyHttp.Http;
using System;
using System.Linq;
using System.Net;
using System.Threading;
using System.Web;
/// <summary>
/// Sample scraper: logs in to pixiv, reads the first page of the account's
/// bookmarks and saves each bookmarked illustration under <c>C:\pixiv\</c>.
/// </summary>
class Program
{
    /// <summary>Directory the downloaded illustrations are written to.</summary>
    private const string SaveDirectory = @"C:\pixiv\";

    /// <summary>
    /// Entry point. Performs the login, collects the work links from the bookmark
    /// page, fetches every work page and finally downloads the images one by one.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Form fields posted to the login endpoint.
        // The "****" values are placeholders; fill in real credentials locally
        // and never commit them.
        var loginParm = new
        {
            mode = "login",
            return_to = "/",
            pixiv_id = "****",
            pass = "****",
            skip = "1"
        };

        var client = new HttpClient();
        client.Request.Referer = "http://www.pixiv.net/";
        // Auto-redirect is disabled because cookies issued before a redirect
        // could otherwise not be obtained.
        client.Request.AllowAutoRedirect = false;
        client.BrowserLike().Get("http://www.pixiv.net/");
        client.BrowserLike().Post("https://www.secure.pixiv.net/login.php", UrlEncode(loginParm), HttpContentTypes.ApplicationXWwwFormUrlEncoded);
        Thread.Sleep(2000);

        var bookmarkRes = client.BrowserLike().Get("http://www.pixiv.net/bookmark.php");
        var bookmark = CQ.CreateDocument(bookmarkRes.RawText);
        var bList = bookmark.Find(".display_editable_works .image-item a.work")
            .Select(s => s.Attributes["href"])
            // Skip anchors without a usable href instead of requesting a bogus URL.
            .Where(href => !string.IsNullOrEmpty(href))
            .ToList();

        // Fetch every work page first (same request order as before), keeping
        // the parsed document together with the URL it came from.
        var works = bList
            .Select(href =>
            {
                var url = "http://www.pixiv.net/" + href;
                return new
                {
                    doc = CQ.CreateDocument(client.BrowserLike().Get(url).RawText),
                    link = url,
                };
            })
            .ToList();

        // GetAsFile needs the target directory to exist.
        System.IO.Directory.CreateDirectory(SaveDirectory);

        foreach (var work in works)
        {
            var imageUrl = work.doc.Find("._illust_modal img").Attr("data-src");
            if (imageUrl == null)
            {
                continue; // manga (no single-image modal on the page)
            }

            // The image request is sent with the work page as its referer.
            client.Request.Referer = work.link;
            var title = work.doc.Find(".work-info .title").Text();
            client.GetAsFile(imageUrl, BuildSavePath(title, imageUrl));
            Thread.Sleep(1000); // throttle between downloads
        }
    }

    /// <summary>
    /// Builds the local file path for an illustration from its title and image URL.
    /// </summary>
    /// <param name="title">Work title scraped from the page; may be null or empty.</param>
    /// <param name="imageUrl">URL of the image; its extension is reused for the file.</param>
    /// <returns>A path inside <see cref="SaveDirectory"/>.</returns>
    private static string BuildSavePath(string title, string imageUrl)
    {
        // Titles are free text: replace characters that are illegal in file names
        // so a title cannot break the path or point outside SaveDirectory.
        var invalidChars = System.IO.Path.GetInvalidFileNameChars();
        var name = string.Join("_", (title ?? string.Empty).Split(invalidChars)).Trim();
        if (name.Length == 0)
        {
            name = "untitled";
        }

        // Take the extension from the last URL segment, ignoring any query or
        // fragment, rather than assuming it is exactly the last four characters.
        var cut = imageUrl.IndexOfAny(new[] { '?', '#' });
        var urlPath = cut >= 0 ? imageUrl.Substring(0, cut) : imageUrl;
        var fileName = urlPath.Substring(urlPath.LastIndexOf('/') + 1);
        var extension = System.IO.Path.GetExtension(fileName);

        return System.IO.Path.Combine(SaveDirectory, name + extension);
    }

    /// <summary>
    /// Serializes the public properties of <paramref name="obj"/> into an
    /// <c>application/x-www-form-urlencoded</c> string (<c>a=1&amp;b=2</c>).
    /// Properties whose value is null are omitted.
    /// </summary>
    /// <param name="obj">Object (typically an anonymous type) holding the form fields.</param>
    /// <returns>The encoded key/value pairs joined with <c>&amp;</c>.</returns>
    static string UrlEncode(object obj)
    {
        var pairs = obj.GetType().GetProperties()
            // Read each property once and carry the value along.
            .Select(p => new { p.Name, Value = p.GetValue(obj, null) })
            .Where(p => p.Value != null)
            .Select(p => p.Name + "=" + HttpUtility.UrlEncode(p.Value.ToString()));
        return String.Join("&", pairs.ToArray());
    }
}
/// <summary>
/// Extension helpers for the EasyHttp <c>HttpClient</c>.
/// </summary>
static class HttpClientExtension
{
    /// <summary>
    /// Sets browser-style User-Agent / Accept / Accept-Language values on the
    /// client's request and copies the cookies of the client's last response
    /// (if any) into the request's cookie collection.
    /// </summary>
    /// <param name="client">The client to configure.</param>
    /// <returns>The same <paramref name="client"/> instance, for call chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="client"/> is null.</exception>
    public static HttpClient BrowserLike(this HttpClient client)
    {
        if (client == null)
        {
            throw new ArgumentNullException(nameof(client));
        }

        var request = client.Request;
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36";
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        request.AcceptLanguage = "ja,en-US;q=0.8,en;q=0.6";

        if (request.Cookies == null)
        {
            request.Cookies = new System.Net.CookieCollection();
        }

        // Response is null until the first request has completed.
        var responseCookies = client.Response?.Cookies;
        if (responseCookies != null)
        {
            foreach (var item in responseCookies)
            {
                // CookieCollection.Add rejects null, so skip anything that is
                // not actually a Cookie instead of passing a failed cast on.
                var cookie = item as Cookie;
                if (cookie != null)
                {
                    request.Cookies.Add(cookie);
                }
            }
        }

        return client;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.