Skip to content

Instantly share code, notes, and snippets.

@kpcyrd
Last active October 21, 2023 12:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kpcyrd/7342c3f833fbd09e98a765ca8417922e to your computer and use it in GitHub Desktop.
Save kpcyrd/7342c3f833fbd09e98a765ca8417922e to your computer and use it in GitHub Desktop.
3-2-1 code review html scraping
// updated code for https://github.com/unbeschwert/3-2-1
use std::{fs, io::Write};
use anyhow::{Result, Context};
use dirs::home_dir;
use scraper::{Element, Html, Selector};
static URL: &str = "https://jamesclear.com/3-2-1";
const QUESTION_HEADLINE: &str = "1 QUESTION FOR YOU";
const HTML_P_TAG: &str = "p";
/// Extracts the question text that follows the "1 QUESTION FOR YOU" headline.
///
/// Looks for the first element matched by `sel` whose full text equals
/// [`QUESTION_HEADLINE`]. Starting at that element's next sibling element, the
/// text of each consecutive `<p>` sibling is appended to the result, followed
/// by a single space. Collection stops at the first non-`<p>` sibling, at the
/// end of the sibling list, or at a paragraph whose text starts with `"Until"`.
///
/// Returns `Ok(None)` when no matching headline exists in `doc`.
///
/// # Errors
///
/// Fails if the matching headline has no following sibling element.
fn get_questions(doc: &Html, sel: &Selector) -> Result<Option<String>> {
    // Only the first headline with the expected text is considered.
    let headline = match doc
        .select(sel)
        .find(|candidate| candidate.text().collect::<String>() == QUESTION_HEADLINE)
    {
        Some(found) => found,
        None => return Ok(None),
    };

    let first_sibling = headline
        .next_sibling_element()
        .with_context(|| format!("Could not process {doc:#?}"))?;

    // Walk the sibling chain lazily, keeping only the leading run of <p> tags.
    let paragraphs = std::iter::successors(Some(first_sibling), |current| {
        current.next_sibling_element()
    })
    .take_while(|sibling| sibling.value().name() == HTML_P_TAG);

    let mut collected = String::new();
    for paragraph in paragraphs {
        let body: String = paragraph.text().collect();
        // A paragraph starting with "Until" ends the question section.
        if body.starts_with("Until") {
            break;
        }
        collected.push_str(&body);
        collected.push(' ');
    }
    Ok(Some(collected))
}
/// Downloads `url` with a blocking HTTP GET and returns the response body as text.
///
/// # Errors
///
/// Fails if the request cannot be sent, if the response carries an error
/// status (per `error_for_status`), or if the body cannot be read as text.
/// Each failure is wrapped with a context message naming the step.
fn fetch(url: &str) -> Result<String> {
    let response = reqwest::blocking::get(url)
        .with_context(|| format!("Failed to send http request: {url:?}"))?;

    // Turn error status codes into an error before touching the body.
    let checked = response
        .error_for_status()
        .with_context(|| format!("Failed to download: {url:?}"))?;

    checked.text().context("Failed to read http response")
}
/// Scrapes the newsletter index at [`URL`] and writes each newsletter's
/// question text to a file named `questions` in the user's home directory,
/// one line per newsletter.
///
/// # Errors
///
/// Fails if the home directory cannot be determined, the output file cannot
/// be created or written, or any page fails to download. A newsletter without
/// the question headline is skipped rather than treated as an error.
fn main() -> Result<()> {
    let file_name = home_dir()
        .context("Could not determine home directory")?
        .join("questions");
    let mut file_obj = fs::File::create(&file_name)
        .with_context(|| format!("Unable to create file {file_name:?}"))?;

    // Both selectors are constant, so they are parsed once up front instead of
    // once per newsletter link.
    let a_selector = Selector::parse(r#"a[class="all-articles__news__post"]"#)
        .expect("hard-coded anchor selector is valid CSS");
    let h2_selector = Selector::parse("h2").expect("hard-coded h2 selector is valid CSS");

    let main_page = fetch(URL)?;
    let main_html = Html::parse_document(&main_page);

    for element in main_html.select(&a_selector) {
        // Anchors without an href carry no newsletter link; skip them.
        if let Some(newsletter_url) = element.value().attr("href") {
            let newsletter = fetch(newsletter_url)?;
            let newsletter_html = Html::parse_document(&newsletter);
            if let Some(mut questions) = get_questions(&newsletter_html, &h2_selector)? {
                questions.push('\n');
                file_obj
                    .write_all(questions.as_bytes())
                    .context("Unable to write to file")?;
            }
        }
    }
    Ok(())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment