Skip to content

Instantly share code, notes, and snippets.

@ioxua
Last active July 30, 2018 01:52
Show Gist options
  • Save ioxua/adcdfdc9d2d03d161dea10d8fbdd9d6a to your computer and use it in GitHub Desktop.
Save ioxua/adcdfdc9d2d03d161dea10d8fbdd9d6a to your computer and use it in GitHub Desktop.
Article fetching engine start
// Abstract Article Model

/// The kind of content a [`Node`] carries.
#[derive(Debug)]
pub enum NodeKind {
    /// Main (top-level) node of an article tree.
    Root,
    Text(String),
    Image {
        src: String,
        alt: String,
        legend: String,
    },
    Citation {
        text: String,
        author: String,
    },
    Title(String),
    Subtitle(String),
}

/// A node in the abstract article tree: its content kind plus owned children.
///
/// NOTE(review): the original stored `parent: Option<&Node>`, which cannot
/// compile without a lifetime parameter and would make the struct
/// self-referential — safe Rust cannot express that with plain `&`.
/// The upward link is dropped here; if one is ever needed, switch the tree
/// to `Rc<RefCell<Node>>` children with `Weak` parent pointers.
/// Fields are `pub` so the binary crate can inspect the tree.
#[derive(Debug)]
pub struct Node {
    pub kind: NodeKind,
    pub children: Vec<Node>,
}

impl Node {
    /// Creates a childless node of the given kind.
    pub fn new(kind: NodeKind) -> Node {
        Node {
            kind,
            children: Vec::new(),
        }
    }

    /// A node is a leaf when it has no children.
    pub fn is_leaf(&self) -> bool {
        self.children.is_empty()
    }

    /// Appends `node` as the last child of `self`, taking ownership of it.
    pub fn add_node(&mut self, node: Node) {
        self.children.push(node);
    }

    // TODO(ioxua-os): Implement remove() and util methods here
}
/// A website to scrape articles from.
///
/// `src` is `pub` because `main` in the binary crate builds a `Website`
/// with a struct literal — a private field would not compile across the
/// crate boundary.
#[derive(Debug)]
pub struct Website {
    pub src: String,
}
/// One scraped article: the URL it came from and its parsed content tree.
///
/// Fields are `pub` for the same reason as `Website`: the fetcher module
/// (and eventually parsers) must construct `Article`s from outside this
/// module.
#[derive(Debug)]
pub struct Article {
    pub src: String,
    pub content: Node,
}
extern crate reqwest; // 0.8.6
use domain;
/// A strategy for downloading the articles published on a [`Website`].
pub trait Fetcher {
/// Fetches every article reachable from `website`.
///
/// Returns the collected articles, or an [`Error`] describing why the
/// underlying request failed.
fn fetch(&self, website:Website) -> Result<Vec<Article>, Error>;
}
/// A [`Fetcher`] that downloads websites over HTTP via `reqwest`.
// Reqwest's docs recommend instantiating a single Client and reusing it
// (it holds a connection pool), so the client is stored here rather than
// created per request.
pub struct HttpFetcher {
client: reqwest::Client
}
impl HttpFetcher {
    /// Creates a fetcher with its own reusable HTTP client.
    ///
    /// Must be `pub`: the binary crate calls `fetcher::HttpFetcher::new()`,
    /// which a private associated function would not allow.
    pub fn new() -> HttpFetcher {
        HttpFetcher {
            client: reqwest::Client::new(),
        }
    }
}
impl Fetcher for HttpFetcher {
fn fetch(&self, website: Website) -> Result<Vec<Article>, Error> {
// This next line should be self.client.get(website.src)
// But for some reason, IntoUrl is not implemented for String. yet (?)
let res = self.client
.get(&website.src[..])
.send();
// TODO(ioxua-os): Refactor this error handling.
// We should consider every case.
// See https://docs.rs/reqwest/0.8.6/reqwest/struct.Error.html
if let Err(e) = res {
let mut details = if e.is_http() {
"HTTP-related exception".to_string()
} else if e.is_serialization() {
"Serialization error".to_string()
} else if e.is_redirect() {
"Redirection error".to_string()
} else if e.is_client_error() {
"Client error".to_string()
} else if e.is_server_error() {
"Server error".to_string()
} else {
// Too generic of an error message?
"The request couldn't succeed".to_string()
};
if let Some(code) = e.status() {
details = format!("{}: Response status {}", details, code)
}
let mut msg = format!("Error acquiring {}", url=website.src);
if let Some(url) = e.url() {
msg = format!("Error acquiring {}", url)
}
return Err(Error {
msg,
details
})
}
let mut articles: Vec<Article> = Vec::new();
// Fetch the articles here yaaay
Ok(articles)
}
}
mod aam;
mod domain;
mod utils;
mod fetcher;
/// Crate-wide error type: a short user-facing message plus diagnostic details.
#[derive(Debug)]
pub struct Error {
    msg: String, details: String
}

// Library error types should implement Display and std::error::Error so
// callers can print them and box them behind `dyn Error`. Fully-qualified
// paths keep the lib root free of extra `use` lines.
impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{}: {}", self.msg, self.details)
    }
}

impl std::error::Error for Error {
    fn description(&self) -> &str {
        &self.msg
    }
}
/*
The idea here is mapping the configuration into a single .json file in order to
kill the necessity to implement a custom parser for each site we will be scraping.
selectorType - For now we will support CSS only. But nothing keeps us from upgrading
articleSelector - The "base" selector. Each result from this selection MUST result
in exactly one article. Every other selector will be executed relative to the
results of this one.
Every other selector must yield exactly one result too.
*/
[
{
"url": "https://medium.freecodecamp.org",
"selectorType": "css",
"articleSelector": ".postArticle",
"linkSelector": "a[href]",
    "titleSelector": ".postArticle-content h3",
    "subtitleSelector": ".postArticle-content p"
}
]
extern crate newscrape;
use newscrape::fetcher;
use newscrape::aam; // Abstract Article Model
use newscrape::domain::Website;
/// Demo entry point: fetches one site, then smoke-tests the article model.
fn main() {
    // Maintaining restricted scopes for readability.

    // Fetch demo.
    {
        // TODO(review): could become a const/static (or come from the JSON
        // config) once the configuration file is wired in.
        let free_code_camp = Website {
            src: String::from("https://medium.freecodecamp.org"),
        };

        let fetcher = fetcher::HttpFetcher::new();
        match fetcher.fetch(free_code_camp) {
            Ok(res) => println!("Download finished {:?}", res),
            Err(e) => println!("Error: {:?}", e),
        }
    }

    // AAM smoke test.
    {
        // Renamed from `str`, which shadowed the primitive type's name.
        let lorem = "Lorem ipsum dolor sit amet...".to_string();
        let mut node = aam::Node::new(aam::NodeKind::Text(lorem.clone()));
        assert!(node.is_leaf());

        // Go through add_node instead of pushing into `children` directly:
        // the field is declared in the `aam` module of the library crate,
        // so direct pushes from this binary crate relied on it being public.
        // `lorem` is moved here — no clone needed on its last use.
        let child = aam::Node::new(aam::NodeKind::Text(lorem));
        node.add_node(child);
        assert!(!node.is_leaf());
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment