Last active
October 21, 2023 12:18
-
-
Save kpcyrd/7342c3f833fbd09e98a765ca8417922e to your computer and use it in GitHub Desktop.
3-2-1 code review html scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// updated code for https://github.com/unbeschwert/3-2-1
use std::{fs, io::Write}; | |
use anyhow::{Result, Context}; | |
use dirs::home_dir; | |
use scraper::{Element, Html, Selector}; | |
/// Index page that is fetched first; its article links point at the individual newsletters.
const URL: &str = "https://jamesclear.com/3-2-1";
/// Exact text of the headline that introduces the question section of a newsletter.
const QUESTION_HEADLINE: &str = "1 QUESTION FOR YOU";
/// Tag name of the paragraph elements collected after the headline.
const HTML_P_TAG: &str = "p";
fn get_questions(doc: &Html, sel: &Selector) -> Result<Option<String>> { | |
for h2_element in doc.select(sel) { | |
if h2_element.text().collect::<String>() == QUESTION_HEADLINE { | |
let mut p_element = h2_element.next_sibling_element() | |
.with_context(|| format!("Could not process {doc:#?}"))?; | |
let mut questions = String::new(); | |
while p_element.value().name() == HTML_P_TAG { | |
let text = p_element.text().collect::<String>(); | |
if text.starts_with("Until") { | |
break; | |
} else { | |
questions.push_str(&text); | |
questions.push(' '); | |
if let Some(e) = p_element.next_sibling_element() { | |
p_element = e; | |
} else { | |
break; | |
} | |
} | |
} | |
return Ok(Some(questions)); | |
} | |
} | |
Ok(None) | |
} | |
fn fetch(url: &str) -> Result<String> { | |
let text = reqwest::blocking::get(url) | |
.with_context(|| format!("Failed to send http request: {url:?}"))? | |
.error_for_status() | |
.with_context(|| format!("Failed to download: {url:?}"))? | |
.text() | |
.context("Failed to read http response")?; | |
Ok(text) | |
} | |
fn main() -> Result<()> { | |
let file_name = home_dir() | |
.context("Could not determine home directory")? | |
.join("questions"); | |
let mut file_obj = fs::File::create(&file_name) | |
.with_context(|| format!("Unable to create file {file_name:?}"))?; | |
let main_page = fetch(URL)?; | |
let main_html = Html::parse_document(&main_page); | |
let a_selector = Selector::parse(r#"a[class="all-articles__news__post"]"#).unwrap(); | |
for element in main_html.select(&a_selector) { | |
if let Some(newsletter_url) = element.value().attr("href") { | |
let newsletter = fetch(newsletter_url)?; | |
let newsletter_html = Html::parse_document(&newsletter); | |
let h2_selector = Selector::parse("h2").unwrap(); | |
if let Some(mut questions) = get_questions(&newsletter_html, &h2_selector)? { | |
questions.push('\n'); | |
file_obj | |
.write_all(questions.as_bytes()) | |
.context("Unable to write to file")?; | |
} | |
} | |
} | |
Ok(()) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment