Response to "My first project in Rust (port of a Python library)"
# Cargo manifest for the `youtube` binary (a Rust port of a Python library).
[package]
name = "youtube"
version = "0.1.0"
authors = ["n1"]
edition = "2018"

[dependencies]
# Date parsing/formatting (used for the video's `datePublished` field)
chrono = "0.4"
# MPMC channels used to pass results from tokio worker threads to main
crossbeam-channel = "0.3"
# Logger backend for the `log` facade, configured via RUST_LOG
env_logger = "0.6"
# Ad-hoc error type (`failure::Error`) and `err_msg` helper
failure = "0.1"
# Futures 0.1 combinators (`lazy`, `Future`, `Stream`) for the tokio 0.1 runtime
futures = "0.1"
# One-time compilation of static regexes
lazy_static = "1.2"
# Logging macros (`info!`, `error!`)
log = "0.4"
# HTTP client (the async client is used here)
reqwest = "0.9"
# Regex-based extraction of fields from the fetched HTML
regex = "1.1"
# Derive-based CLI argument parsing
structopt = "0.2"
# Async runtime / event loop
tokio = "0.1"
| use std::collections::HashSet; | |
| use std::fs::{self, File}; | |
| use std::io::{self, Write}; | |
| use std::path::{Path, PathBuf}; | |
| use std::time::Duration; | |
| use chrono::NaiveDate; | |
| use failure::Error; | |
| use futures::future::lazy; | |
| use futures::{Future, Stream}; | |
| use lazy_static::lazy_static; | |
| use log::{error, info}; | |
| use regex::Regex; | |
| use reqwest::r#async::Client; | |
| use reqwest::{IntoUrl, Url}; | |
| use structopt::StructOpt; | |
// StructOpt is really great for cli applications (it parses arguments
// and automagically gives you nice extras like output for --help)
#[derive(Debug, StructOpt)]
#[structopt(about = "TODO", author = "n1", name = "youtube", version = "0.1.0")]
struct Opt {
    // Path to the input file containing one URL per line ('#' lines are
    // comments). NOTE: plain `//` comments are used on purpose here —
    // `///` doc comments would become structopt help text.
    #[structopt(parse(from_os_str))]
    path: PathBuf,
}
/// Entry point: reads URLs from the input file, fetches them all
/// concurrently on tokio's runtime, parses each response into a `Video`,
/// and writes the results out as a playlist file next to the input file.
fn main() -> Result<(), Error> {
    env_logger::init();
    // parse command line argument
    let opt = Opt::from_args();
    let inpath = &opt.path;
    // read urls from file (de-duplicated; see `read_urls`)
    let urls = read_urls(inpath)?;
    let no_urls = urls.len();
    // asynchronously fetch urls and parse them into `Video`s
    // tokio, Rust's principal async library, is asynchronous (like Python's
    // asyncio), but it's also multi-threaded; so we need to create
    // a channel through which we can communicate results across threads
    let (sender, receiver) = crossbeam_channel::unbounded();
    // Here's the (current) syntax for spawning a bunch of asynchronous
    // work on tokio's event loop and sending the results back to the main
    // thread. This syntax is pretty hairy at the moment, but it should get
    // much better when the async keyword hits Rust later this year. The API
    // should become much more intuitive and Python-like.
    //
    // Since tokio manages the number of threads, I've left out code that
    // would split the work up into chunks, but you could add that (with
    // some more complication) if it's still necessary (i.e. if you're worried
    // about hitting servers with too many simultaneous requests).
    tokio::run(lazy(move || {
        // We're using the reqwest library's async client
        let client = reqwest::r#async::Client::new();
        for url in urls.into_iter() {
            // Make a separate copy of the sender for each `Future`,
            // since they could all potentially run on different threads
            let sender = sender.clone();
            // Create the actual `Future` representing work we'd like
            // done asynchronously
            let future = fetch(&client, url.clone())
                .and_then(|html| {
                    let video = Video::new(&html, url);
                    video.validate()?;
                    Ok(video)
                })
                .then(move |result| {
                    // Send our maybe `Video` / maybe `Error` back to the
                    // main thread
                    sender.send(result).unwrap();
                    // Any `Future` spawned on tokio's `Runtime` has to
                    // ultimately resolve into a Result<(), ()> (this is why
                    // we use channels to pass our results back to the
                    // main thread, as tokio::run can't return anything)
                    Ok::<(), ()>(())
                });
            // "Spawn" that `Future` on the tokio runtime (event loop)
            // so that tokio does all its async magic and we don't have
            // to worry about it
            tokio::spawn(future);
        }
        Ok(())
    }));
    // collect on the main thread the maybe `Video`s / maybe `Error`s
    // produced from our asynchronous operations; the `then` above always
    // sends exactly one message per spawned future, so receiving exactly
    // `no_urls` times cannot block forever
    let mut videos = Vec::with_capacity(no_urls);
    for _ in 0..no_urls {
        // The `recv` method on `receiver` can fail, but we're also
        // receiving `Results`, which is why we need two question marks here
        let video = receiver.recv()??;
        videos.push(video)
    }
    // write output to file
    let outpath = create_outpath(inpath)?;
    write_to_file(&outpath, &videos)?;
    Ok(())
}
/// A video parsed (best-effort) from a fetched YouTube page.
///
/// Every field except `url` is an `Option` because regex extraction can
/// fail per-field; `validate` enforces that all fields are present.
struct Video {
    // Channel (uploader) name, if found in the page
    channel: Option<String>,
    // Publication date, if found and parseable as YYYY-MM-DD
    date_published: Option<NaiveDate>,
    // Video length, if found (page stores it in whole seconds)
    duration: Option<Duration>,
    // Contents of the page's <title> element, if found
    title: Option<String>,
    // The URL the page was fetched from
    url: Url,
}
| impl Video { | |
| /// Parses HTML into a `Video` using regexes | |
| /// | |
| /// Note: Validation is done separately (by the `validate` method; see below) | |
| fn new(html: &str, url: Url) -> Video { | |
| // Use "lazy static" for regex compilation to ensure compilation only occurs once | |
| lazy_static! { | |
| static ref RE_CHANNEL: Regex = | |
| Regex::new(r#"author\\":\\"(.+?)\\""#).expect("Failed to compile channel regex"); | |
| static ref RE_DATE_PUBLISHED: Regex = Regex::new(r#"datePublished" content="(.+?)""#) | |
| .expect("Failed to compile date_published regex"); | |
| static ref RE_DURATION: Regex = Regex::new(r#"lengthSeconds\\":\\"(\d+)\\""#) | |
| .expect("Failed to compile duration regex"); | |
| static ref RE_TITLE: Regex = | |
| Regex::new(r"<title>(.+)</title>").expect("Failed to compile title regex"); | |
| } | |
| // Option<String> | |
| let channel = RE_CHANNEL | |
| .captures(&html) | |
| .and_then(|captures| captures.get(1)) | |
| .and_then(|m| Some(m.as_str().to_string())); | |
| // Option<NaiveDate> | |
| let date_published = RE_DATE_PUBLISHED | |
| .captures(&html) | |
| .and_then(|captures| captures.get(1)) | |
| .and_then(|m| { | |
| let s = m.as_str(); | |
| let date_published = NaiveDate::parse_from_str(s, "%Y-%m-%d").ok()?; | |
| Some(date_published) | |
| }); | |
| // Option<Duration> | |
| let duration = RE_DURATION | |
| .captures(html) | |
| .and_then(|captures| captures.get(1)) | |
| .and_then(|m| { | |
| let s = m.as_str(); | |
| let seconds = s.parse::<u64>().ok()?; | |
| let duration = Duration::from_secs(seconds); | |
| Some(duration) | |
| }); | |
| // Option<String> | |
| let title = RE_TITLE | |
| .captures(html) | |
| .and_then(|captures| captures.get(1)) | |
| .and_then(|m| Some(m.as_str().to_string())); | |
| Video { | |
| channel, | |
| date_published, | |
| duration, | |
| title, | |
| url, | |
| } | |
| } | |
| /// Validates `Video` | |
| fn validate(&self) -> Result<(), Error> { | |
| // Errors if channel is None (if it's OK for channel to be None, just remove this line) | |
| self.channel | |
| .as_ref() | |
| .ok_or_else(|| failure::err_msg("Missing channel"))?; | |
| // Errors if date_published is None (if it's OK for channel to be None, just remove this line) | |
| self.date_published | |
| .as_ref() | |
| .ok_or_else(|| failure::err_msg("Missing date published"))?; | |
| // Errors if duration is None (if it's OK for channel to be None, just remove this line) | |
| self.duration | |
| .as_ref() | |
| .ok_or_else(|| failure::err_msg("Missing duration"))?; | |
| // Errors if title is None (if it's OK for channel to be None, just remove this line) | |
| self.title | |
| .as_ref() | |
| .ok_or_else(|| failure::err_msg("Missing title"))?; | |
| Ok(()) | |
| } | |
| /// Serializes `Video` to anything that implements `io::Write` (e.g. a `File`) | |
| fn serialize<W>(&self, writer: &mut W) -> Result<(), io::Error> | |
| where | |
| W: io::Write, | |
| { | |
| // &str | |
| let channel = self | |
| .channel | |
| .as_ref() | |
| .map(|s| &**s) | |
| .unwrap_or_else(|| "[unknown]"); | |
| // String | |
| let date_published = self | |
| .date_published | |
| .as_ref() | |
| .map(|date| date.format("%Y-%m-%d").to_string()) | |
| .unwrap_or_else(|| "[someday]".to_string()); | |
| // u64 | |
| let duration = self | |
| .duration | |
| .as_ref() | |
| .map(|duration| duration.as_secs()) | |
| .unwrap_or_else(|| 0); | |
| // &str | |
| let title = self | |
| .title | |
| .as_ref() | |
| .map(|s| &**s) | |
| .unwrap_or_else(|| "[unknown]"); | |
| let s = format!( | |
| "#EXTINF:{},({} | {}) - {}\n{}\n", | |
| duration, channel, date_published, title, &self.url | |
| ); | |
| writer.write_all(s.as_bytes())?; | |
| Ok(()) | |
| } | |
| } | |
| /// Creates an outpath based on the inpath (prefixes the filename with "new_") | |
| fn create_outpath<P>(inpath: P) -> Result<PathBuf, Error> | |
| where | |
| P: AsRef<Path>, | |
| { | |
| let inpath = inpath.as_ref(); | |
| let parent = inpath.parent().ok_or_else(|| { | |
| failure::err_msg("Why did you pass the root directory as the path argument?") | |
| })?; | |
| let file_name = inpath | |
| .file_name() | |
| .ok_or_else(|| failure::err_msg("path argument must be a file"))? | |
| .to_str() | |
| .ok_or_else(|| failure::err_msg("path argument filename is not valid utf-8"))?; | |
| let new_file_name = format!("new_{}", file_name); | |
| let outpath = parent.join(new_file_name); | |
| Ok(outpath) | |
| } | |
/// Returns a `Future` representing the result of fetching the provided URL
/// and transforming its body into a `String`
fn fetch<U>(client: &Client, url: U) -> impl Future<Item = String, Error = Error>
where
    U: IntoUrl,
{
    client
        .get(url)
        .send()
        // Lift reqwest's error type into `failure::Error`
        .map_err(|e| e.into())
        .and_then(|response| {
            // Treat any non-2xx status as an error
            if !response.status().is_success() {
                return Err(failure::err_msg("TODO"));
            }
            Ok(response)
        })
        .and_then(|response| {
            // Collect the streamed body chunks into one contiguous buffer
            response
                .into_body()
                .concat2()
                .map_err(|_| failure::err_msg("TODO"))
        })
        // Decode as UTF-8, replacing any invalid byte sequences
        .map(|body| String::from_utf8_lossy(&body).into_owned())
}
| /// Reads the input file | |
| fn read_urls<P>(inpath: P) -> Result<Vec<Url>, Error> | |
| where | |
| P: AsRef<Path>, | |
| { | |
| // The most idiomatic way in Rust to read a file to into a `String` | |
| let content = fs::read_to_string(inpath)?; | |
| let mut no_duplicates = 0; | |
| let mut urls = HashSet::new(); | |
| for line in content.lines() { | |
| // Ignore comments | |
| if line.starts_with('#') { | |
| continue; | |
| } | |
| // Parse non-comment lines into `Url`s, ignoring errors | |
| // with the exception of just logging them | |
| let url = match Url::parse(line.trim()) { | |
| Ok(url) => url, | |
| Err(_) => { | |
| error!("failed to parse {:?} into a valid url", line); | |
| continue; | |
| } | |
| }; | |
| // Insert parsed `Url` into our `HashSet` (which assures no duplicates) | |
| let is_new = urls.insert(url); | |
| // ... but still keep track if we tried to insert something | |
| // that was already there so we can log the number of duplicates we | |
| // found | |
| if !is_new { | |
| no_duplicates += 1; | |
| } | |
| } | |
| // Log the number of duplicates, if any | |
| if no_duplicates > 0 { | |
| info!("removed {} duplicates", no_duplicates); | |
| } | |
| // Transform the `HashSet` into a `Vec` and return it | |
| Ok(urls.into_iter().collect()) | |
| } | |
| /// Writes a slice of `Video`s to a file | |
| fn write_to_file<P>(path: P, videos: &[Video]) -> Result<(), Error> | |
| where | |
| P: AsRef<Path>, | |
| { | |
| let mut f = File::create(&path)?; | |
| // Write header. | |
| f.write_all(b"# Created by YTitler\n")?; | |
| f.write_all(b"# See: https://gitlab.com/n1_/ytitler\n\n")?; | |
| f.write_all(b"#EXTM3U\n")?; | |
| // Write content. | |
| for video in videos { | |
| video.serialize(&mut f)?; | |
| } | |
| Ok(()) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment