Created
June 15, 2021 04:05
-
-
Save Igouist/ebfc29be9e350bb7c289f05df694535b to your computer and use it in GitHub Desktop.
稽查目標 PTT 用戶的留言,使用 AngleSharp + pttweb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Script entry point: fetches one page of comments for the configured PTT user and dumps them.
/// </summary>
async Task Main()
{
    // Fill in the PTT user id to inspect; the page number is passed straight through to the crawler.
    const string pttUser = "";
    const int page = 1;

    var crawler = new PttMessageCrawler();

    // Dump() is presumably the LINQPad output extension - confirm if this is run elsewhere.
    (await crawler.GetMessages(pttUser, page)).Dump();
}
/// <summary>
/// Crawler service that reads the comments a PTT user has posted, as listed on pttweb.cc.
/// </summary>
public class PttMessageCrawler
{
    private readonly IBrowsingContext _browser;

    /// <summary>
    /// Creates the crawler with an AngleSharp browsing context built from the
    /// default configuration plus the default loader.
    /// </summary>
    public PttMessageCrawler()
    {
        var config = Configuration.Default.WithDefaultLoader();
        this._browser = BrowsingContext.New(config);
    }

    /// <summary>
    /// Loads one page of the user's comment listing and extracts, per thread item,
    /// the thread title and the comment texts.
    /// </summary>
    /// <param name="username">PTT user id; it is URL-escaped before being placed in the request path.</param>
    /// <param name="page">Value sent as the <c>page</c> query parameter.</param>
    /// <returns>
    /// A fully materialized list with one <see cref="PttMessage"/> per <c>div.thread-item</c>
    /// element; an empty sequence when no document could be opened. Never <c>null</c>.
    /// </returns>
    public async Task<IEnumerable<PttMessage>> GetMessages(
        string username,
        int page = 0)
    {
        const string listQuery = "div.thread-item";
        const string titleQuery = "span.thread-title";
        const string messageQuery = "span.yellow--text.text--darken-2";

        // Escape the user id so reserved characters cannot alter the path or query string.
        // A null user id keeps its previous meaning (an empty path segment).
        var escapedUser = System.Uri.EscapeDataString(username ?? string.Empty);
        var pttUrl = $"https://www.pttweb.cc/user/{escapedUser}?t=message&page={page}";

        var document = await this._browser.OpenAsync(pttUrl);
        if (document is null)
        {
            // Hand callers an empty sequence rather than null so they can enumerate unconditionally.
            return Enumerable.Empty<PttMessage>();
        }

        // Dispose the document once everything has been copied out of it.
        // (IDocument.Close() only finishes a document write stream; it does not release resources.)
        using (document)
        {
            // Both levels are materialized with ToList() so the result is a stable snapshot
            // that does not re-query the DOM or depend on the document staying alive.
            return document
                .QuerySelectorAll(listQuery)
                .Select(item => new PttMessage
                {
                    // A thread item without a title element yields an empty title instead of throwing.
                    Title = item.QuerySelector(titleQuery)?.TextContent ?? string.Empty,
                    Content = item
                        .QuerySelectorAll(messageQuery)
                        // NOTE(review): this removes every ": " in the text, not only a leading
                        // separator - confirm against the page markup whether that is intended.
                        .Select(x => x.TextContent.Replace(": ", ""))
                        // Drop entries that are exactly the user id (presumably the author span
                        // that shares the same CSS classes - verify against the page).
                        .Where(x => x != username)
                        .ToList()
                })
                .ToList();
        }
    }
}
/// <summary>
/// The comments found under a single thread entry.
/// </summary>
public class PttMessage
{
    /// <summary>
    /// Title text of the thread the comments belong to.
    /// </summary>
    public string Title { get; set; }

    /// <summary>
    /// Comment texts collected for the thread. Defaults to an empty sequence so a
    /// freshly constructed instance can be enumerated without a null check.
    /// </summary>
    public IEnumerable<string> Content { get; set; } = Enumerable.Empty<string>();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
666