Last active
July 30, 2018 01:52
-
-
Save ioxua/adcdfdc9d2d03d161dea10d8fbdd9d6a to your computer and use it in GitHub Desktop.
Article fetching engine start
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Abstract Article Model | |
/// The kind of content carried by a single node of the abstract article model.
///
/// `Clone`, `PartialEq` and `Eq` are derived alongside `Debug` so that kinds
/// can be duplicated and compared directly (for example in assertions).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NodeKind {
    // Open design question: should the variants be struct-like or tuple-like?
    /// Main node.
    Root,
    /// A piece of text.
    Text(String),
    /// An image, described by its `src`, `alt` and `legend` strings.
    Image {
        src: String,
        alt: String,
        legend: String,
    },
    /// A citation: the quoted `text` and its `author`.
    Citation {
        text: String,
        author: String,
    },
    /// A title.
    Title(String),
    /// A subtitle.
    Subtitle(String),
}
#[derive(Debug)] | |
pub struct Node { | |
kind: NodeKind, | |
children: Vec<Node>, | |
parent: Option<&Node> | |
} | |
impl Node { | |
pub fn new(kind: NodeKind) -> Node { | |
Node { | |
kind, | |
children: Vec::new(), | |
parent: None | |
} | |
} | |
pub fn is_leaf(&self) -> bool { | |
self.children.len() == 0 | |
} | |
// I really hope I don't push nodes at children explicitly | |
pub fn add_node(&mut self, node: Node) { | |
if let None = node.parent { | |
// If this node has no children, create one and push it | |
// Empty Vec<T> doesn't allocate memory, so this is OK | |
// self.children.push(Node::new(NodeKind::Root)); | |
node.parent = Some<&self>; | |
} | |
self.children.push(node); | |
} | |
// TODO(ioxua-os): Implement remove() and util methods here | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// A website to fetch articles from.
#[derive(Debug)]
pub struct Website {
    /// Address that is requested when fetching the site.
    ///
    /// Public so that code outside this module can build a `Website` with a
    /// struct literal.
    pub src: String,
}
#[derive(Debug)] | |
pub struct Article { | |
src: String, | |
content: Node | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
extern crate reqwest; // 0.8.6 | |
use domain; | |
pub trait Fetcher { | |
fn fetch(&self, website:Website) -> Result<Vec<Article>, Error>; | |
} | |
// Reqwest docs recommends us to instantiating a single client and reusing it | |
pub struct HttpFetcher { | |
client: reqwest::Client | |
} | |
impl HttpFetcher { | |
fn new() -> HttpFetcher { | |
HttpFetcher { | |
client: reqwest::Client::new() | |
} | |
} | |
} | |
impl Fetcher for HttpFetcher { | |
fn fetch(&self, website: Website) -> Result<Vec<Article>, Error> { | |
// This next line should be self.client.get(website.src) | |
// But for some reason, IntoUrl is not implemented for String. yet (?) | |
let res = self.client | |
.get(&website.src[..]) | |
.send(); | |
// TODO(ioxua-os): Refactor this error handling. | |
// We should consider every case. | |
// See https://docs.rs/reqwest/0.8.6/reqwest/struct.Error.html | |
if let Err(e) = res { | |
let mut details = if e.is_http() { | |
"HTTP-related exception".to_string() | |
} else if e.is_serialization() { | |
"Serialization error".to_string() | |
} else if e.is_redirect() { | |
"Redirection error".to_string() | |
} else if e.is_client_error() { | |
"Client error".to_string() | |
} else if e.is_server_error() { | |
"Server error".to_string() | |
} else { | |
// Too generic of an error message? | |
"The request couldn't succeed".to_string() | |
}; | |
if let Some(code) = e.status() { | |
details = format!("{}: Response status {}", details, code) | |
} | |
let mut msg = format!("Error acquiring {}", url=website.src); | |
if let Some(url) = e.url() { | |
msg = format!("Error acquiring {}", url) | |
} | |
return Err(Error { | |
msg, | |
details | |
}) | |
} | |
let mut articles: Vec<Article> = Vec::new(); | |
// Fetch the articles here yaaay | |
Ok(articles) | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mod aam; | |
mod domain; | |
mod utils; | |
mod fetcher; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Error produced by this crate: a short message plus more specific details.
#[derive(Debug)]
pub struct Error {
    /// Short description of what failed.
    ///
    /// Public (like `details`) so other modules can build and inspect errors.
    pub msg: String,
    /// Additional information about the failure.
    pub details: String,
}

impl std::fmt::Display for Error {
    /// Formats the error as `"<msg>: <details>"`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}: {}", self.msg, self.details)
    }
}

// Lets the type be used wherever a standard error is expected
// (`Box<dyn std::error::Error>`, `?` conversions, ...).
impl std::error::Error for Error {}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
The idea here is mapping the configuration into a single .json file in order to | |
kill the necessity to implement a custom parser for each site we will be scraping. | |
selectorType - For now we will support CSS only. But nothing keeps us from upgrading | |
articleSelector - The "base" selector. Each result from this selection MUST result
in exactly one article. Every other selector will be executed relative to the | |
results of this one. | |
Every other selector must yield exactly one result too. | |
*/ | |
[ | |
{ | |
"url": "https://medium.freecodecamp.org", | |
"selectorType": "css", | |
"articleSelector": ".postArticle", | |
"linkSelector": "a[href]", | |
"titleSelector": "postArticle-content h3", | |
"subtitleSelector": "postArticle-content p" | |
} | |
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
extern crate newscrape; | |
use newscrape::fetcher; | |
use newscrape::aam; // Abstract Article Model | |
use newscrape::domain::Website; | |
fn main() { | |
// Maintaning the scope restricted for readability | |
{ | |
// This should be static but I'm dumb | |
let free_code_camp = Website { | |
src: String::from("https://medium.freecodecamp.org") | |
}; | |
let fetcher = fetcher::HttpFetcher::new(); | |
match fetcher.fetch(free_code_camp) { | |
Ok(res) => { | |
println!("Download finished {:?}", res) | |
}, | |
Err(e) => { | |
println!("Error: {:?}", e) | |
} | |
}; | |
} | |
{ | |
let str = "Lorem ipsum dolor sit amet...".to_string(); | |
let text = aam::NodeKind::Text(str.clone()); | |
let mut node = aam::Node::new(text); | |
assert_eq!(node.is_leaf(), true); | |
let node2 = aam::Node::new(aam::NodeKind::Text(str.clone())); | |
node.children.push(node2); | |
assert_eq!(node.is_leaf(), false); | |
assert_eq!(node.children.len(), 1); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment