Skip to content

Instantly share code, notes, and snippets.

@ioxua
Last active July 30, 2018 01:52
Show Gist options
  • Save ioxua/adcdfdc9d2d03d161dea10d8fbdd9d6a to your computer and use it in GitHub Desktop.
Save ioxua/adcdfdc9d2d03d161dea10d8fbdd9d6a to your computer and use it in GitHub Desktop.
Article fetching engine start
// Abstract Article Model

/// The kind of content a [`Node`] carries.
#[derive(Debug)]
pub enum NodeKind {
    /// Main (top-level) node of an article tree.
    Root,
    Text(String),
    Image {
        src: String,
        alt: String,
        legend: String,
    },
    Citation {
        text: String,
        author: String,
    },
    Title(String),
    Subtitle(String),
}

/// A node in the abstract article tree: its content kind plus owned children.
///
/// NOTE(review): the original stored `parent: Option<&Node>`, which cannot
/// compile without a lifetime parameter and would make the struct
/// self-referential — safe Rust cannot express that with plain `&`.
/// The upward link is dropped here; if one is ever needed, switch the tree
/// to `Rc<RefCell<Node>>` children with `Weak` parent pointers.
/// Fields are `pub` so the binary crate can inspect the tree.
#[derive(Debug)]
pub struct Node {
    pub kind: NodeKind,
    pub children: Vec<Node>,
}

impl Node {
    /// Creates a childless node of the given kind.
    pub fn new(kind: NodeKind) -> Node {
        Node {
            kind,
            children: Vec::new(),
        }
    }

    /// A node is a leaf when it has no children.
    pub fn is_leaf(&self) -> bool {
        self.children.is_empty()
    }

    /// Appends `node` as the last child of `self`, taking ownership of it.
    pub fn add_node(&mut self, node: Node) {
        self.children.push(node);
    }

    // TODO(ioxua-os): Implement remove() and util methods here
}
/// A website to scrape articles from.
///
/// `src` is `pub` because `main` in the binary crate builds a `Website`
/// with a struct literal — a private field would not compile across the
/// crate boundary.
#[derive(Debug)]
pub struct Website {
    pub src: String,
}
/// One scraped article: the URL it came from and its parsed content tree.
///
/// Fields are `pub` for the same reason as `Website`: the fetcher module
/// (and eventually parsers) must construct `Article`s from outside this
/// module.
#[derive(Debug)]
pub struct Article {
    pub src: String,
    pub content: Node,
}
extern crate reqwest; // 0.8.6
use domain;
/// A strategy for downloading the articles published on a [`Website`].
pub trait Fetcher {
/// Fetches every article reachable from `website`.
///
/// Returns the collected articles, or an [`Error`] describing why the
/// underlying request failed.
fn fetch(&self, website:Website) -> Result<Vec<Article>, Error>;
}
/// A [`Fetcher`] that downloads websites over HTTP via `reqwest`.
// Reqwest's docs recommend instantiating a single Client and reusing it
// (it holds a connection pool), so the client is stored here rather than
// created per request.
pub struct HttpFetcher {
client: reqwest::Client
}
impl HttpFetcher {
    /// Creates a fetcher with its own reusable HTTP client.
    ///
    /// Must be `pub`: the binary crate calls `fetcher::HttpFetcher::new()`,
    /// which a private associated function would not allow.
    pub fn new() -> HttpFetcher {
        HttpFetcher {
            client: reqwest::Client::new(),
        }
    }
}
impl Fetcher for HttpFetcher {
fn fetch(&self, website: Website) -> Result<Vec<Article>, Error> {
// This next line should be self.client.get(website.src)
// But for some reason, IntoUrl is not implemented for String. yet (?)
let res = self.client
.get(&website.src[..])
.send();
// TODO(ioxua-os): Refactor this error handling.
// We should consider every case.
// See https://docs.rs/reqwest/0.8.6/reqwest/struct.Error.html
if let Err(e) = res {
let mut details = if e.is_http() {
"HTTP-related exception".to_string()
} else if e.is_serialization() {
"Serialization error".to_string()
} else if e.is_redirect() {
"Redirection error".to_string()
} else if e.is_client_error() {
"Client error".to_string()
} else if e.is_server_error() {
"Server error".to_string()
} else {
// Too generic of an error message?
"The request couldn't succeed".to_string()
};
if let Some(code) = e.status() {
details = format!("{}: Response status {}", details, code)
}
let mut msg = format!("Error acquiring {}", url=website.src);
if let Some(url) = e.url() {
msg = format!("Error acquiring {}", url)
}
return Err(Error {
msg,
details
})
}
let mut articles: Vec<Article> = Vec::new();
// Fetch the articles here yaaay
Ok(articles)
}
}
mod aam;
mod domain;
mod utils;
mod fetcher;
/// Crate-wide error type: a short user-facing message plus diagnostic details.
#[derive(Debug)]
pub struct Error {
    msg: String, details: String
}

// Library error types should implement Display and std::error::Error so
// callers can print them and box them behind `dyn Error`. Fully-qualified
// paths keep the lib root free of extra `use` lines.
impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{}: {}", self.msg, self.details)
    }
}

impl std::error::Error for Error {
    fn description(&self) -> &str {
        &self.msg
    }
}
/*
The idea here is mapping the configuration into a single .json file in order to
kill the necessity to implement a custom parser for each site we will be scraping.
selectorType - For now we will support CSS only. But nothing keeps us from upgrading
articleSelector - The "base" selector. Each result from this selection MUST result
in exactly one article. Every other selector will be executed relative to the
results of this one.
Every other selector must yield exactly one result too.
*/
[
{
"url": "https://medium.freecodecamp.org",
"selectorType": "css",
"articleSelector": ".postArticle",
"linkSelector": "a[href]",
    "titleSelector": ".postArticle-content h3",
    "subtitleSelector": ".postArticle-content p"
}
]
extern crate newscrape;
use newscrape::fetcher;
use newscrape::aam; // Abstract Article Model
use newscrape::domain::Website;
/// Demo entry point: fetches one site, then smoke-tests the article model.
fn main() {
    // Maintaining restricted scopes for readability.

    // Fetch demo.
    {
        // TODO(review): could become a const/static (or come from the JSON
        // config) once the configuration file is wired in.
        let free_code_camp = Website {
            src: String::from("https://medium.freecodecamp.org"),
        };

        let fetcher = fetcher::HttpFetcher::new();
        match fetcher.fetch(free_code_camp) {
            Ok(res) => println!("Download finished {:?}", res),
            Err(e) => println!("Error: {:?}", e),
        }
    }

    // AAM smoke test.
    {
        // Renamed from `str`, which shadowed the primitive type's name.
        let lorem = "Lorem ipsum dolor sit amet...".to_string();
        let mut node = aam::Node::new(aam::NodeKind::Text(lorem.clone()));
        assert!(node.is_leaf());

        // Go through add_node instead of pushing into `children` directly:
        // the field is declared in the `aam` module of the library crate,
        // so direct pushes from this binary crate relied on it being public.
        // `lorem` is moved here — no clone needed on its last use.
        let child = aam::Node::new(aam::NodeKind::Text(lorem));
        node.add_node(child);
        assert!(!node.is_leaf());
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment