Skip to content

Instantly share code, notes, and snippets.

@magicdawn
Created July 23, 2014 14:15
Show Gist options
  • Save magicdawn/feda871d8174f0b9f525 to your computer and use it in GitHub Desktop.
Save magicdawn/feda871d8174f0b9f525 to your computer and use it in GitHub Desktop.
huaban-board-downloader C# Edition
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<appSettings>
<!-- Whether debug mode is on -->
<add key="Debug" value="false"/>
<!-- Folder where downloaded images are stored -->
<add key="ImageDir" value="image"/>
<!-- Number of retries -->
<add key="MaxTryTimes" value="5"/>
<!-- Number of threads -->
<add key="ThreadCount" value="5"/>
</appSettings>
</configuration>
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace ImageDownLoader
{
/// <summary>
/// Application settings. Values are read once, in the static constructor, from the
/// appSettings section of the application configuration file; a missing, empty or
/// malformed entry falls back to a built-in default instead of throwing.
/// </summary>
class Config
{
    // Sample board URL.
    public static readonly string ExampleUrl = "http://huaban.com/boards/13715778/";

    // Whether debug mode is on.
    public static readonly bool Debug;

    // Folder that receives the downloaded images.
    public static readonly string ImageDir;

    // Number of retries.
    public static readonly int MaxTryTimes;

    // File name of the failed-download log.
    public static readonly string ErrorLog = "下载失败记录.txt";

    // Number of threads; not readonly because it can be changed from the command line.
    public static int ThreadCount;

    /// <summary>Returns the raw appSettings value for <paramref name="key"/>.</summary>
    /// <returns>The configured string, or null when the entry (or the whole config file) is absent.</returns>
    static string AppConfig(string key)
    {
        return System.Configuration.ConfigurationManager.AppSettings[key];
    }

    /// <summary>Reads a boolean setting; returns <paramref name="fallback"/> when it is absent or not a valid boolean.</summary>
    static bool ReadBool(string key, bool fallback)
    {
        bool parsed;
        return bool.TryParse(AppConfig(key), out parsed) ? parsed : fallback;
    }

    /// <summary>
    /// Reads an integer setting; returns <paramref name="fallback"/> when it is absent,
    /// not a valid integer, or smaller than <paramref name="minimum"/>.
    /// </summary>
    static int ReadInt(string key, int fallback, int minimum)
    {
        int parsed;
        if(int.TryParse(AppConfig(key), out parsed) && parsed >= minimum)
        {
            return parsed;
        }
        return fallback;
    }

    static Config()
    {
        // TryParse-based readers keep a typo in the config file from surfacing as a
        // TypeInitializationException the first time any Config member is touched.
        Debug = ReadBool("Debug", false);

        var dir = AppConfig("ImageDir");
        ImageDir = string.IsNullOrWhiteSpace(dir) ? "image" : dir;

        MaxTryTimes = ReadInt("MaxTryTimes", 5, 0);
        ThreadCount = ReadInt("ThreadCount", 5, 1); // at least one thread
    }
}
}
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using System.Diagnostics;
using System.Threading.Tasks;
namespace ImageDownLoader
{
class Program
{
// Polling this until it returns false is the equivalent of Task.WaitAll.
// Tasks are deliberately not used: in the author's measurements they were slower
// than plain new Thread, possibly because of thread-pool limits.
/// <summary>Reports whether at least one of the given threads is still alive.</summary>
/// <param name="threads">Threads to inspect.</param>
/// <returns>true as soon as an alive thread is found; false when none is alive.</returns>
static bool CheckAnyRunning(Thread[] threads)
{
    var anyAlive = false;
    for(int i = 0; i < threads.Length && !anyAlive; i++)
    {
        anyAlive = threads[i].IsAlive;
    }
    return anyAlive;
}
/// <summary>
/// Runs <paramref name="act"/> and then puts the console foreground colour back to
/// what it was before the call, so the action may change the colour freely.
/// </summary>
/// <param name="act">Action to run; typically sets a colour and writes some text.</param>
static void WriteWithColor(Action act)
{
    var old = Console.ForegroundColor;
    try
    {
        act();
    }
    finally
    {
        // Restore even when the action throws, so a failure cannot leave the
        // console stuck in the temporary colour.
        Console.ForegroundColor = old;
    }
}
// Explicit static constructor with an empty body; it performs no initialization.
static Program()
{
}
/// <summary>
/// Entry point: collects every pin of a board page and downloads the images into
/// "{ImageDir}/{title}/" using several worker threads.
/// </summary>
/// <param name="args">
/// args[0] is the board URL (Config.ExampleUrl is used instead when Config.Debug is set);
/// the optional args[1] overrides the number of worker threads.
/// </param>
static void Main(string[] args)
{
    var watch = new Stopwatch();
    watch.Start();

    var argUrl = "";
    if(Config.Debug)
    {
        argUrl = Config.ExampleUrl;
    }
    else if(args.Length > 0)
    {
        argUrl = args[0];
    }
    else
    {
        // No URL given: print usage and quit.
        Console.WriteLine();
        Console.WriteLine(" 请指定图片所在网页(如{0})",Config.ExampleUrl);
        Console.WriteLine(" 后可跟线程数量,如({0} {1})",Config.ExampleUrl,Config.ThreadCount);
        Console.WriteLine();
        WriteWithColor(() => {
            Console.ForegroundColor = ConsoleColor.Cyan;
            Console.WriteLine(" 花瓣网-画板下载-命令行工具 by Magicdawn 2014-7-23");
        });
        return;
    }

    /*
     * 1. Request the page and read the board metadata out of its title.
     */
    var html = Magicdawn.HttpHelper.Request(argUrl);
    var title = HuabanUtil.FindTitle(html);
    var username = HuabanUtil.FindUsername(title);
    int count;
    if(!int.TryParse(HuabanUtil.FindCount(title),out count))
    {
        // The title did not have the expected "(N图)_@user" shape, so this is
        // probably not a board page; stop instead of crashing in int.Parse.
        Console.WriteLine("无法从页面标题解析图片数量,请确认地址是一个画板页面 : {0}",argUrl);
        return;
    }
    var width = count.ToString().Length; // zero-padding width for file names

    /*
     * Is the title usable as a folder name? Keep asking until it is.
     */
    while(string.IsNullOrWhiteSpace(title) || title.ContainOneOf("/ \\ : * ? \" < > |"))
    {
        Console.WriteLine("title中有不合法内容,不能做文件夹名。");
        Console.Write("请手动指定 : ");
        title = Console.ReadLine();
        if(title == null)
        {
            return; // standard input was closed; nothing more can be asked
        }
    }

    // Build the paths only now, after the title is final; building the error-log
    // path earlier would point it into a folder named after the rejected title.
    var boardDir = Config.ImageDir + "/" + title;
    var errorPath = "{0}/{1}/{2}".format(Config.ImageDir,title,Config.ErrorLog);

    /*
     * Does the target folder already exist?
     */
    Console.WriteLine("系列为 : {0}",title);
    Console.WriteLine("画板共 {0} 张图 , 作者为 : {1}",count,username);
    if(Directory.Exists(boardDir))
    {
        Console.WriteLine();
        Console.Write("你好像下载过了...要重新下?(y/n) : ");
        watch.Stop(); // do not count the time spent waiting for the answer
        if(Console.ReadLine() != "y")
        {
            return;
        }
        // Download again: start with a fresh error log.
        File.Delete(errorPath);
        watch.Start();
    }
    else
    {
        // Also creates Config.ImageDir when it does not exist yet.
        Directory.CreateDirectory(boardDir);
    }

    /*
     * Collect the pins of the current page. Each pin is (id, src, ext).
     */
    var pins = new Queue<Tuple<string,string,string>>();
    string lastId = null; // id of the most recently queued pin, used as the paging cursor
    foreach(var p in HuabanUtil.FindPins(html))
    {
        pins.Enqueue(p);
        lastId = p.Item1;
    }

    /*
     * Visit the following pages, 100 pins per request.
     */
    var pageNum = count / 100 + 1;
    for(int page = 0; page < pageNum && lastId != null; page++)
    {
        var url = "{0}?max={1}&limit=100".format(argUrl,lastId);
        html = Magicdawn.HttpHelper.Request(url);
        var added = 0;
        foreach(var p in HuabanUtil.FindPins(html))
        {
            pins.Enqueue(p);
            lastId = p.Item1;
            added++;
        }
        if(added == 0)
        {
            // The cursor did not move, so every further request would be identical.
            break;
        }
    }

    /*
     * Start downloading.
     */
    if(args.Length > 1)
    {
        // Command line form: url threadCount
        int requested;
        if(int.TryParse(args[1],out requested) && requested > 0)
        {
            Config.ThreadCount = requested;
        }
        else
        {
            Console.WriteLine("线程数量 \"{0}\" 无效,使用 {1}",args[1],Config.ThreadCount);
        }
    }
    var threadCount = Math.Max(1,Config.ThreadCount);

    var index = 1;                // 1-based number of the next image, guarded by lock(pins)
    var logGate = new object();   // serialises appends to the error log
    var threads = new Thread[threadCount];
    for(int i = 0;i < threadCount;i++)
    {
        threads[i] = new Thread(() => {
            using(var client = new System.Net.WebClient())
            {
                while(true)
                {
                    Tuple<string,string,string> p;
                    string curIndex; // zero-padded number of the current image
                    lock(pins)
                    {
                        // The emptiness test must happen under the same lock as
                        // Dequeue; otherwise two workers can both see one remaining
                        // item and the slower one throws on the empty queue.
                        if(pins.Count == 0)
                        {
                            break;
                        }
                        p = pins.Dequeue();
                        curIndex = index.ToString().PadLeft(width,'0');
                        index++;
                    }

                    var src = p.Item2;
                    var ext = p.Item3;
                    var path = "{0}/{1}/{2}.{3}".format(Config.ImageDir,title,curIndex,ext);
                    Console.WriteLine("正在下载第{0}张图 : {1}",curIndex,src);
                    if(!HuabanUtil.Download(client,src,path))
                    {
                        Console.WriteLine("第{0}张图下载失败!",curIndex);
                        // One entry per line, e.g. "2014-7-23 20:13:38 第001张 http://xxx"
                        var entry = "{0} 第{1}张 {2}".format(
                            DateTime.Now.ToStringX(),
                            curIndex,
                            src
                        ) + Environment.NewLine;
                        lock(logGate)
                        {
                            File.AppendAllText(errorPath,entry);
                        }
                    }
                }
            }
        }) { IsBackground = true };
        threads[i].Start();
    }

    // Wait for the workers to finish.
    while(CheckAnyRunning(threads))
    {
        Thread.Sleep(1000);
    }

    var elapsed = watch.Elapsed;
    // TotalMinutes instead of Minutes so runs longer than an hour are not truncated.
    Console.WriteLine("下载完成了...耗时 {0}分{1}秒",(int)elapsed.TotalMinutes,elapsed.Seconds);
}
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace ImageDownLoader
{
public class HuabanUtil
{
// Captures, as group "title", the text between "<title>" and the first "收集".
static readonly Regex regexTitle = new Regex(@"<title>(?<title>[\s\S]*?)收集");
// Applied to the captured title: group "count" is the number inside "(N图)",
// group "username" is everything after "_@".
static readonly Regex regexInTitle = new Regex(@"\((?<count>\d+)图\)_@(?<username>[\s\S]*)");
// Applied to the text of a single pin object.
static readonly Regex regexPinId = new Regex(@"""pin_id""[\s]*?:[\s]*?(?<id>\d+)");
static readonly Regex regexPinBucket = new Regex(@"""bucket"":""(?<bucket>\w+)""");
static readonly Regex regexPinKey = new Regex(@"""key"":""(?<key>[\w_-]+)""");
static readonly Regex regexPinType = new Regex(@"""type"":""image/(?<type>\w+)""");
// Helpers that find the board's title, username and image count.
/// <summary>Extracts the board title from the page markup.</summary>
/// <param name="html">Markup of the board page.</param>
/// <returns>The text captured by regexTitle's "title" group, or "" when nothing matches.</returns>
public static string FindTitle(string html)
{
    Match found = regexTitle.Match(html);
    Group captured = found.Groups["title"];
    return captured.Value;
}
/// <summary>Extracts the user name from a board title.</summary>
/// <param name="title">Title text as returned by FindTitle.</param>
/// <returns>The text captured by regexInTitle's "username" group, or "" when nothing matches.</returns>
internal static string FindUsername(string title)
{
    Match found = regexInTitle.Match(title);
    Group captured = found.Groups["username"];
    return captured.Value;
}
/// <summary>Extracts the image count, still as text, from a board title.</summary>
/// <param name="title">Title text as returned by FindTitle.</param>
/// <returns>The digits captured by regexInTitle's "count" group, or "" when nothing matches.</returns>
internal static string FindCount(string title)
{
    Match found = regexInTitle.Match(title);
    Group captured = found.Groups["count"];
    return captured.Value;
}
/*
Reference data mirrored by the two tables below
(NOTE(review): presumably copied from the site's own page script - confirm):
img_host = {
"hbimg": "img.hb.aicdn.com",
"hbfile": "hbfile.b0.upaiyun.com/img/apps"
}
hbfile = {
"hbfile": "hbfile.b0.upaiyun.com",
"hbimg2": "hbimg2.b0.upaiyun.com"
}
*/
// Image servers: maps a pin's "bucket" value to the host (and path prefix) serving it.
static readonly Dictionary<string,string> imgHost = new Dictionary<string,string>() {
    { "hbimg", "img.hb.aicdn.com" },
    { "hbfile", "hbfile.b0.upaiyun.com/img/apps" }
};
// Second host table from the reference data; not read by any member of this class.
static readonly Dictionary<string,string> hbFile = new Dictionary<string,string>() {
    { "hbfile", "hbfile.b0.upaiyun.com" },
    { "hbimg2", "hbimg2.b0.upaiyun.com" }
};
/// <summary>
/// Lazily extracts the pins embedded in a board page.
/// </summary>
/// <param name="html">Markup of a board page containing a <c>"pins":[{...},...]</c> array.</param>
/// <returns>
/// One tuple per pin: Item1 is the pin id (digits, as a string), Item2 is the image URL
/// "http://{host}/{key}", Item3 is the file extension without a dot. The sequence is
/// empty when the page has no "pins" marker.
/// </returns>
/// <exception cref="KeyNotFoundException">
/// Thrown during enumeration when a pin's bucket is not a key of imgHost.
/// </exception>
internal static IEnumerable<Tuple<string,string,string>> FindPins(string html)
{
    const int markerLength = 7; // length of "pins": including the colon

    var pins_index = html.IndexOf("\"pins\"",StringComparison.Ordinal); //"pins":[{"pin_id
    if(pins_index < 0 || pins_index + markerLength > html.Length)
    {
        // No pin array on this page; without this guard the -1 from IndexOf would
        // be used as an offset and slice unrelated markup.
        yield break;
    }

    var remain = html.Substring(pins_index + markerLength); //[{...
    // NOTE(review): GetSecondIndex presumably returns the index of the bracket that
    // closes the one it starts from - confirm against the Magicdawn library.
    var end_index = Magicdawn.Util.StringFinder.GetSecondIndex(remain);
    remain = remain.Substring(0,end_index); // [...]

    // Cut the array into one string per pin object.
    var pins_string = new List<string>();
    while(remain.IndexOf('{') > 0)
    {
        var left = remain.IndexOf('{');
        var right = Magicdawn.Util.StringFinder.GetSecondIndex(remain,left);
        var content = remain.Substring(left + 1,right - left); // text after the opening brace
        pins_string.Add(content);
        remain = remain.Substring(right);
    }

    foreach(var p_string in pins_string)
    {
        var id = regexPinId.Match(p_string).Groups["id"].Value;
        var bucket = regexPinBucket.Match(p_string).Groups["bucket"].Value;
        var key = regexPinKey.Match(p_string).Groups["key"].Value;
        var typeMatch = regexPinType.Match(p_string);
        var baseUrl = imgHost[bucket];
        var src = string.Format("http://{0}/{1}",baseUrl,key);

        var ext = "jpg";
        // The type may be missing. Regex.Match never returns null, so Success is the
        // only reliable test; a null check would always pass and turn a missing type
        // into an empty extension.
        if(typeMatch.Success)
        {
            ext = GetFileExt(typeMatch.Groups["type"].Value);
        }
        yield return Tuple.Create(id,src,ext);
    }
}
/// <summary>Maps the subtype of an "image/xxx" content type to a file extension.</summary>
/// <param name="type">The part after "image/", in any letter case; may be null or empty.</param>
/// <returns>
/// "jpg" for "jpeg", "pjpeg" and for a null or empty input; otherwise the
/// lower-cased subtype itself.
/// </returns>
internal static string GetFileExt(string type)
{
    if(string.IsNullOrEmpty(type))
    {
        // Unknown type: fall back to jpg rather than producing a file name
        // that ends in a bare dot.
        return "jpg";
    }

    type = type.ToLowerInvariant();
    if(type == "jpeg" || type == "pjpeg")
    {
        return "jpg";
    }
    return type;
}
/// <summary>
/// Downloads <paramref name="src"/> to <paramref name="path"/>, trying again after
/// each WebException until the failure count exceeds Config.MaxTryTimes.
/// </summary>
/// <param name="client">Client used for the transfer.</param>
/// <param name="src">Address of the image.</param>
/// <param name="path">Destination file path.</param>
/// <param name="times">Number of failures already counted before this call.</param>
/// <returns>true when a transfer completed; false when the attempts were used up.</returns>
public static bool Download(WebClient client,string src,string path,int times = 0)
{
    var failures = times;
    while(true)
    {
        try
        {
            client.DownloadFile(src,path);
            return true; // finished without a WebException
        }
        catch(WebException)
        {
            failures++;
            if(failures > Config.MaxTryTimes)
            {
                return false; // download failed
            }
            // otherwise go round again for the next attempt
        }
    }
}
}
}
@magicdawn
Copy link
Author

其中的Magicdawn.Util.xxx 见本人的Magicdawn Library那个仓库

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment