Last active
April 14, 2016 17:05
-
-
Save sdcb/ec34e2f4b6a3a67559053ffbfb69f11d to your computer and use it in GitHub Desktop.
抓取博客园首页前4000篇文章大体内容,并保存到数据库
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Dapper; | |
using HtmlAgilityPack; | |
using System; | |
using System.Collections.Generic; | |
using System.Configuration; | |
using System.Data.SqlClient; | |
using System.Linq; | |
using System.Text; | |
using System.Text.RegularExpressions; | |
using System.Threading.Tasks; | |
namespace CnblogsPost | |
{ | |
class Program | |
{ | |
static string ConnectionString = ConfigurationManager.AppSettings["ConnectionString"]; | |
static void Main(string[] args) | |
{ | |
var count = 0; | |
foreach (var x in GetPosts()) | |
{ | |
var successImported = WritePostToDatabase(x); | |
if (!successImported) | |
{ | |
LogDone(count); | |
break; | |
} | |
count += 1; | |
}; | |
} | |
private static void LogDone(int count) | |
{ | |
using (var connection = new SqlConnection(ConnectionString)) | |
{ | |
const string sql = | |
"INSERT INTO [PostLog] " + | |
" ( [Count] ) " + | |
"VALUES ( @Count )"; | |
connection.Execute(sql, new | |
{ | |
Count = count | |
}); | |
} | |
} | |
private static bool WritePostToDatabase(PostContent post) | |
{ | |
using (var connection = new SqlConnection(ConnectionString)) | |
{ | |
const string sql = | |
"IF NOT EXISTS (SELECT 1 FROM [Post] WHERE [Url] = @Url)" + | |
"INSERT INTO [Post]" + | |
" ( [Title], [Url], [BriefContent], [ReadCount], [CommentCount], [CreateDate], [Author], [AuthorLink])" + | |
" VALUES(@Title, @Url, @BriefContent, @ReadCount, @CommentCount, @CreateDate, @Author, @AuthorLink)"; | |
var affectedRows = connection.Execute(sql, post); | |
return affectedRows > 0; | |
} | |
} | |
private static IEnumerable<PostContent> GetPosts() | |
{ | |
var web = new HtmlWeb(); | |
for (var page = 1; ; ++page) | |
{ | |
var doc = web.Load($"http://www.cnblogs.com/sitehome/p/{page}"); | |
var nodes = doc.DocumentNode.SelectNodes(@"//*[@id=""post_list""]/div[*]"); | |
foreach (var node in nodes) | |
{ | |
var braceNumber = new Regex(@".+\((\d+)\)", RegexOptions.Compiled); | |
var dateRegex = new Regex(@"(\d{4}\-\d{2}\-\d{2} \d{2}:\d{2})", RegexOptions.Compiled); | |
var link = node.SelectSingleNode(@"div[2]/h3/a"); | |
var content = node.SelectSingleNode(@"div[2]/p"); | |
var readNode = node.SelectSingleNode(@"div[2]/div/span[2]/a"); | |
var commentNode = node.SelectSingleNode("div[2]/div/span[1]/a"); | |
var dateNode = node.SelectSingleNode(@"div[2]/div"); | |
var author = node.SelectSingleNode(@"div[2]/div/a"); | |
yield return new PostContent | |
{ | |
Title = link.InnerText, | |
BriefContent = content.InnerText, | |
Url = link.Attributes["href"].Value, | |
ReadCount = int.Parse(braceNumber.Replace(readNode.InnerText, "$1")), | |
CommentCount = int.Parse(braceNumber.Replace(commentNode.InnerText, "$1")), | |
CreateDate = DateTime.Parse(dateRegex.Match(dateNode.InnerText).Groups[1].Value), | |
Author = author.InnerText, | |
AuthorLink = author.Attributes["href"].Value | |
}; | |
} | |
} | |
} | |
} | |
public class PostContent | |
{ | |
public string Title { get; set; } | |
public string BriefContent { get; set; } | |
public string Url { get; set; } | |
public int ReadCount { get; set; } | |
public int CommentCount { get; set; } | |
public DateTime CreateDate { get; set; } | |
public string Author { get; set; } | |
public string AuthorLink { get; set; } | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/****** Object: Table [dbo].[Post] Script Date: 2016/4/15 1:04:41 ******/ | |
SET ANSI_NULLS ON | |
GO | |
SET QUOTED_IDENTIFIER ON | |
GO | |
CREATE TABLE [dbo].[Post]( | |
[Id] [int] IDENTITY(1,1) NOT NULL, | |
[Title] [nvarchar](max) NOT NULL, | |
[Url] [nvarchar](450) NOT NULL, | |
[BriefContent] [nvarchar](max) NOT NULL, | |
[ReadCount] [int] NOT NULL, | |
[CommentCount] [int] NOT NULL, | |
[CreateDate] [datetime2](7) NOT NULL, | |
[Author] [nvarchar](max) NOT NULL, | |
[AuthorLink] [nvarchar](max) NOT NULL, | |
CONSTRAINT [PK__Post__3214EC07B57B9DE1] PRIMARY KEY CLUSTERED | |
( | |
[Id] ASC | |
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY] | |
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY] | |
GO | |
/****** Object: Table [dbo].[PostLog] Script Date: 2016/4/15 1:04:41 ******/ | |
SET ANSI_NULLS ON | |
GO | |
SET QUOTED_IDENTIFIER ON | |
GO | |
CREATE TABLE [dbo].[PostLog]( | |
[Id] [int] IDENTITY(1,1) NOT NULL, | |
[Time] [datetime2](7) NOT NULL, | |
[Count] [int] NOT NULL, | |
CONSTRAINT [PK_PostLog] PRIMARY KEY CLUSTERED | |
( | |
[Id] ASC | |
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY] | |
) ON [PRIMARY] | |
GO | |
SET ANSI_PADDING ON | |
GO | |
/****** Object: Index [IX_Post] Script Date: 2016/4/15 1:04:41 ******/ | |
CREATE UNIQUE NONCLUSTERED INDEX [IX_Post] ON [dbo].[Post] | |
( | |
[Url] ASC | |
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, SORT_IN_TEMPDB = OFF, IGNORE_DUP_KEY = OFF, DROP_EXISTING = OFF, ONLINE = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY] | |
GO | |
/****** Object: Index [IX_Post_1] Script Date: 2016/4/15 1:04:41 ******/ | |
CREATE NONCLUSTERED INDEX [IX_Post_1] ON [dbo].[Post] | |
( | |
[CreateDate] DESC | |
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, SORT_IN_TEMPDB = OFF, DROP_EXISTING = OFF, ONLINE = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY] | |
GO | |
/****** Object: Index [NonClusteredIndex-20160415-010153] Script Date: 2016/4/15 1:04:41 ******/ | |
CREATE NONCLUSTERED INDEX [NonClusteredIndex-20160415-010153] ON [dbo].[PostLog] | |
( | |
[Time] ASC | |
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, SORT_IN_TEMPDB = OFF, DROP_EXISTING = OFF, ONLINE = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY] | |
GO | |
/****** Object: Index [NonClusteredIndex-20160415-010236] Script Date: 2016/4/15 1:04:41 ******/ | |
CREATE NONCLUSTERED INDEX [NonClusteredIndex-20160415-010236] ON [dbo].[PostLog] | |
( | |
[Count] ASC | |
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, SORT_IN_TEMPDB = OFF, DROP_EXISTING = OFF, ONLINE = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY] | |
GO | |
ALTER TABLE [dbo].[PostLog] ADD CONSTRAINT [DF_PostLog_Time] DEFAULT (sysdatetime()) FOR [Time] | |
GO |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment