Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save sdcb/ec34e2f4b6a3a67559053ffbfb69f11d to your computer and use it in GitHub Desktop.
Save sdcb/ec34e2f4b6a3a67559053ffbfb69f11d to your computer and use it in GitHub Desktop.
抓取博客园首页前4000篇文章大体内容,并保存到数据库
using Dapper;
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data.SqlClient;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace CnblogsPost
{
class Program
{
static string ConnectionString = ConfigurationManager.AppSettings["ConnectionString"];
static void Main(string[] args)
{
var count = 0;
foreach (var x in GetPosts())
{
var successImported = WritePostToDatabase(x);
if (!successImported)
{
LogDone(count);
break;
}
count += 1;
};
}
private static void LogDone(int count)
{
using (var connection = new SqlConnection(ConnectionString))
{
const string sql =
"INSERT INTO [PostLog] " +
" ( [Count] ) " +
"VALUES ( @Count )";
connection.Execute(sql, new
{
Count = count
});
}
}
private static bool WritePostToDatabase(PostContent post)
{
using (var connection = new SqlConnection(ConnectionString))
{
const string sql =
"IF NOT EXISTS (SELECT 1 FROM [Post] WHERE [Url] = @Url)" +
"INSERT INTO [Post]" +
" ( [Title], [Url], [BriefContent], [ReadCount], [CommentCount], [CreateDate], [Author], [AuthorLink])" +
" VALUES(@Title, @Url, @BriefContent, @ReadCount, @CommentCount, @CreateDate, @Author, @AuthorLink)";
var affectedRows = connection.Execute(sql, post);
return affectedRows > 0;
}
}
private static IEnumerable<PostContent> GetPosts()
{
var web = new HtmlWeb();
for (var page = 1; ; ++page)
{
var doc = web.Load($"http://www.cnblogs.com/sitehome/p/{page}");
var nodes = doc.DocumentNode.SelectNodes(@"//*[@id=""post_list""]/div[*]");
foreach (var node in nodes)
{
var braceNumber = new Regex(@".+\((\d+)\)", RegexOptions.Compiled);
var dateRegex = new Regex(@"(\d{4}\-\d{2}\-\d{2} \d{2}:\d{2})", RegexOptions.Compiled);
var link = node.SelectSingleNode(@"div[2]/h3/a");
var content = node.SelectSingleNode(@"div[2]/p");
var readNode = node.SelectSingleNode(@"div[2]/div/span[2]/a");
var commentNode = node.SelectSingleNode("div[2]/div/span[1]/a");
var dateNode = node.SelectSingleNode(@"div[2]/div");
var author = node.SelectSingleNode(@"div[2]/div/a");
yield return new PostContent
{
Title = link.InnerText,
BriefContent = content.InnerText,
Url = link.Attributes["href"].Value,
ReadCount = int.Parse(braceNumber.Replace(readNode.InnerText, "$1")),
CommentCount = int.Parse(braceNumber.Replace(commentNode.InnerText, "$1")),
CreateDate = DateTime.Parse(dateRegex.Match(dateNode.InnerText).Groups[1].Value),
Author = author.InnerText,
AuthorLink = author.Attributes["href"].Value
};
}
}
}
}
public class PostContent
{
public string Title { get; set; }
public string BriefContent { get; set; }
public string Url { get; set; }
public int ReadCount { get; set; }
public int CommentCount { get; set; }
public DateTime CreateDate { get; set; }
public string Author { get; set; }
public string AuthorLink { get; set; }
}
}
/****** Object: Table [dbo].[Post] Script Date: 2016/4/15 1:04:41 ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[Post](
[Id] [int] IDENTITY(1,1) NOT NULL,
[Title] [nvarchar](max) NOT NULL,
[Url] [nvarchar](450) NOT NULL,
[BriefContent] [nvarchar](max) NOT NULL,
[ReadCount] [int] NOT NULL,
[CommentCount] [int] NOT NULL,
[CreateDate] [datetime2](7) NOT NULL,
[Author] [nvarchar](max) NOT NULL,
[AuthorLink] [nvarchar](max) NOT NULL,
CONSTRAINT [PK__Post__3214EC07B57B9DE1] PRIMARY KEY CLUSTERED
(
[Id] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO
/****** Object: Table [dbo].[PostLog] Script Date: 2016/4/15 1:04:41 ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[PostLog](
[Id] [int] IDENTITY(1,1) NOT NULL,
[Time] [datetime2](7) NOT NULL,
[Count] [int] NOT NULL,
CONSTRAINT [PK_PostLog] PRIMARY KEY CLUSTERED
(
[Id] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
GO
SET ANSI_PADDING ON
GO
/****** Object: Index [IX_Post] Script Date: 2016/4/15 1:04:41 ******/
CREATE UNIQUE NONCLUSTERED INDEX [IX_Post] ON [dbo].[Post]
(
[Url] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, SORT_IN_TEMPDB = OFF, IGNORE_DUP_KEY = OFF, DROP_EXISTING = OFF, ONLINE = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
GO
/****** Object: Index [IX_Post_1] Script Date: 2016/4/15 1:04:41 ******/
CREATE NONCLUSTERED INDEX [IX_Post_1] ON [dbo].[Post]
(
[CreateDate] DESC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, SORT_IN_TEMPDB = OFF, DROP_EXISTING = OFF, ONLINE = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
GO
/****** Object: Index [NonClusteredIndex-20160415-010153] Script Date: 2016/4/15 1:04:41 ******/
CREATE NONCLUSTERED INDEX [NonClusteredIndex-20160415-010153] ON [dbo].[PostLog]
(
[Time] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, SORT_IN_TEMPDB = OFF, DROP_EXISTING = OFF, ONLINE = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
GO
/****** Object: Index [NonClusteredIndex-20160415-010236] Script Date: 2016/4/15 1:04:41 ******/
CREATE NONCLUSTERED INDEX [NonClusteredIndex-20160415-010236] ON [dbo].[PostLog]
(
[Count] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, SORT_IN_TEMPDB = OFF, DROP_EXISTING = OFF, ONLINE = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
GO
ALTER TABLE [dbo].[PostLog] ADD CONSTRAINT [DF_PostLog_Time] DEFAULT (sysdatetime()) FOR [Time]
GO
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment