@bcmyers /Cargo.toml
Last active Jan 12, 2019

Response to "My first project in Rust (port of a Python library)"
[package]
name = "youtube"
version = "0.1.0"
authors = ["n1"]
edition = "2018"

[dependencies]
chrono = "0.4"
crossbeam-channel = "0.3"
env_logger = "0.6"
failure = "0.1"
futures = "0.1"
lazy_static = "1.2"
log = "0.4"
reqwest = "0.9"
regex = "1.1"
structopt = "0.2"
tokio = "0.1"

// src/main.rs
use std::collections::HashSet;
use std::fs::{self, File};
use std::io::{self, Write};
use std::path::{Path, PathBuf};
use std::time::Duration;
use chrono::NaiveDate;
use failure::Error;
use futures::future::lazy;
use futures::{Future, Stream};
use lazy_static::lazy_static;
use log::{error, info};
use regex::Regex;
use reqwest::r#async::Client;
use reqwest::{IntoUrl, Url};
use structopt::StructOpt;
// StructOpt is really great for CLI applications (it parses arguments
// and automagically gives you nice extras like output for --help)
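//
// For example (assuming a hypothetical input file named `urls.txt`),
// you'd run this with something like:
//
//     $ cargo run --release -- urls.txt
//     $ cargo run --release -- --help   # prints the generated help text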
#[derive(Debug, StructOpt)]
#[structopt(about = "TODO", author = "n1", name = "youtube", version = "0.1.0")]
struct Opt {
#[structopt(parse(from_os_str))]
path: PathBuf,
}
fn main() -> Result<(), Error> {
env_logger::init();
// parse command line argument
let opt = Opt::from_args();
let inpath = &opt.path;
// read urls from file
let urls = read_urls(inpath)?;
let no_urls = urls.len();
// asynchronously fetch urls and parse them into `Video`s
// tokio, Rust's principal async library, is asynchronous (like Python's
// asyncio), but it's also multi-threaded; so we need to create
// a channel through which we can communicate results across threads
let (sender, receiver) = crossbeam_channel::unbounded();
// Here's the (current) syntax for spawning a bunch of asynchronous
// work on tokio's event loop and sending the results back to the main
// thread. This syntax is pretty hairy at the moment, but it should get
// much better when the async keyword hits Rust later this year. The API
// should become much more intuitive and Python-like; see the rough sketch below.
//
// Since tokio manages the number of threads, I've left out code that
// would split the work up into chunks, but you could add that (with
// some more complication) if it's still necessary (i.e. if you're worried
// about hitting servers with too many simultaneous requests).
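//
// For comparison, a rough sketch (hypothetical syntax, not compilable at
// the time of writing) of how the per-URL work might read with async/await:
//
//     async fn process(client: &Client, url: Url) -> Result<Video, Error> {
//         let html = fetch(client, url.clone()).await?;
//         let video = Video::new(&html, url);
//         video.validate()?;
//         Ok(video)
//     }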
tokio::run(lazy(move || {
// We're using the reqwest library's async client
let client = Client::new();
for url in urls.into_iter() {
// Make a separate copy of the sender for each `Future`,
// since they could all potentially run on different threads
let sender = sender.clone();
// Create the actual `Future` representing work we'd like
// done asynchronously
let future = fetch(&client, url.clone())
.and_then(|html| {
let video = Video::new(&html, url);
video.validate()?;
Ok(video)
})
.then(move |result| {
// Send our maybe `Video` / maybe `Error` back to the
// main thread
sender.send(result).unwrap();
// Any `Future` spawned on tokio's `Runtime` has to
// ultimately resolve into a Result<(), ()> (this is why
// we use channels to pass our results back to the
// main thread, as tokio::run can't return anything)
Ok::<(), ()>(())
});
// "Spawn" that `Future` on the tokio runtime (event loop);
// so that tokio does all it's async magic and we don't have
// to worry about it
tokio::spawn(future);
}
Ok(())
}));
// collect on the main thread the maybe `Video`s / maybe `Error`s
// produced from our asynchronous operations
let mut videos = Vec::with_capacity(no_urls);
for _ in 0..no_urls {
// The `recv` method on `receiver` can fail, but we're also
// receiving `Results`, which is why we need two question marks here
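// (concretely, `recv()` yields a `Result<Result<Video, Error>, RecvError>`:
// the first `?` handles a disconnected channel, the second any fetch/parse
// error)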
let video = receiver.recv()??;
videos.push(video)
}
// write output to file
let outpath = create_outpath(inpath)?;
write_to_file(&outpath, &videos)?;
Ok(())
}
struct Video {
channel: Option<String>,
date_published: Option<NaiveDate>,
duration: Option<Duration>,
title: Option<String>,
url: Url,
}
impl Video {
/// Parses HTML into a `Video` using regexes
///
/// Note: Validation is done separately (by the `validate` method; see below)
fn new(html: &str, url: Url) -> Video {
// Use "lazy static" for regex compilation to ensure compilation only occurs once
lazy_static! {
static ref RE_CHANNEL: Regex =
Regex::new(r#"author\\":\\"(.+?)\\""#).expect("Failed to compile channel regex");
static ref RE_DATE_PUBLISHED: Regex = Regex::new(r#"datePublished" content="(.+?)""#)
.expect("Failed to compile date_published regex");
static ref RE_DURATION: Regex = Regex::new(r#"lengthSeconds\\":\\"(\d+)\\""#)
.expect("Failed to compile duration regex");
static ref RE_TITLE: Regex =
Regex::new(r"<title>(.+)</title>").expect("Failed to compile title regex");
}
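// Each regex pulls one field out of the raw watch-page HTML. The exact
// markup is an assumption (YouTube can and does change it), but the
// targets look roughly like:
//   author\":\"Some Channel\"
//   datePublished" content="2019-01-12"
//   lengthSeconds\":\"213\"
//   <title>Some Title</title>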
// Option<String>
let channel = RE_CHANNEL
.captures(&html)
.and_then(|captures| captures.get(1))
.map(|m| m.as_str().to_string());
// Option<NaiveDate>
let date_published = RE_DATE_PUBLISHED
.captures(&html)
.and_then(|captures| captures.get(1))
.and_then(|m| {
let s = m.as_str();
let date_published = NaiveDate::parse_from_str(s, "%Y-%m-%d").ok()?;
Some(date_published)
});
// Option<Duration>
let duration = RE_DURATION
.captures(html)
.and_then(|captures| captures.get(1))
.and_then(|m| {
let s = m.as_str();
let seconds = s.parse::<u64>().ok()?;
let duration = Duration::from_secs(seconds);
Some(duration)
});
// Option<String>
let title = RE_TITLE
.captures(html)
.and_then(|captures| captures.get(1))
.map(|m| m.as_str().to_string());
Video {
channel,
date_published,
duration,
title,
url,
}
}
/// Validates `Video`
fn validate(&self) -> Result<(), Error> {
// Errors if channel is None (if it's OK for channel to be None, just remove this line)
self.channel
.as_ref()
.ok_or_else(|| failure::err_msg("Missing channel"))?;
// Errors if date_published is None (if it's OK for date_published to be None, just remove this line)
self.date_published
.as_ref()
.ok_or_else(|| failure::err_msg("Missing date published"))?;
// Errors if duration is None (if it's OK for duration to be None, just remove this line)
self.duration
.as_ref()
.ok_or_else(|| failure::err_msg("Missing duration"))?;
// Errors if title is None (if it's OK for title to be None, just remove this line)
self.title
.as_ref()
.ok_or_else(|| failure::err_msg("Missing title"))?;
Ok(())
}
/// Serializes `Video` to anything that implements `io::Write` (e.g. a `File`)
fn serialize<W>(&self, writer: &mut W) -> Result<(), io::Error>
where
W: io::Write,
{
// &str
let channel = self
.channel
.as_ref()
.map(|s| &**s)
.unwrap_or_else(|| "[unknown]");
// String
let date_published = self
.date_published
.as_ref()
.map(|date| date.format("%Y-%m-%d").to_string())
.unwrap_or_else(|| "[someday]".to_string());
// u64
let duration = self
.duration
.as_ref()
.map(|duration| duration.as_secs())
.unwrap_or_else(|| 0);
// &str
let title = self
.title
.as_ref()
.map(|s| &**s)
.unwrap_or_else(|| "[unknown]");
let s = format!(
"#EXTINF:{},({} | {}) - {}\n{}\n",
duration, channel, date_published, title, &self.url
);
writer.write_all(s.as_bytes())?;
Ok(())
}
}
/// Creates an outpath based on the inpath (prefixes the filename with "new_")
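///
/// e.g. (hypothetical paths) `videos/list.m3u` becomes `videos/new_list.m3u`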
fn create_outpath<P>(inpath: P) -> Result<PathBuf, Error>
where
P: AsRef<Path>,
{
let inpath = inpath.as_ref();
let parent = inpath.parent().ok_or_else(|| {
failure::err_msg("Why did you pass the root directory as the path argument?")
})?;
let file_name = inpath
.file_name()
.ok_or_else(|| failure::err_msg("path argument must be a file"))?
.to_str()
.ok_or_else(|| failure::err_msg("path argument filename is not valid utf-8"))?;
let new_file_name = format!("new_{}", file_name);
let outpath = parent.join(new_file_name);
Ok(outpath)
}
/// Returns a `Future` representing the result of fetching the provided URL
/// and transforming its body into a `String`
fn fetch<U>(client: &Client, url: U) -> impl Future<Item = String, Error = Error>
where
U: IntoUrl,
{
client
.get(url)
.send()
.map_err(|e| e.into())
.and_then(|response| {
if !response.status().is_success() {
return Err(failure::err_msg("TODO"));
}
Ok(response)
})
.and_then(|response| {
response
.into_body()
.concat2()
.map_err(|_| failure::err_msg("TODO"))
})
.map(|body| String::from_utf8_lossy(&body).into_owned())
}
/// Reads the input file
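///
/// Lines starting with '#' are comments; everything else is parsed as a
/// URL, and duplicates are dropped. A (hypothetical) input file might
/// look like:
///
/// ```text
/// # my playlist
/// https://www.youtube.com/watch?v=...
/// https://youtu.be/...
/// ```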
fn read_urls<P>(inpath: P) -> Result<Vec<Url>, Error>
where
P: AsRef<Path>,
{
// The most idiomatic way in Rust to read a file into a `String`
let content = fs::read_to_string(inpath)?;
let mut no_duplicates = 0;
let mut urls = HashSet::new();
for line in content.lines() {
// Ignore comments
if line.starts_with('#') {
continue;
}
// Parse non-comment lines into `Url`s; errors are logged
// and otherwise ignored
let url = match Url::parse(line.trim()) {
Ok(url) => url,
Err(_) => {
error!("failed to parse {:?} into a valid url", line);
continue;
}
};
// Insert parsed `Url` into our `HashSet` (which ensures no duplicates)
let is_new = urls.insert(url);
// ... but still keep track of whether we tried to insert something
// that was already there, so we can log the number of duplicates
// we found
if !is_new {
no_duplicates += 1;
}
}
// Log the number of duplicates, if any
if no_duplicates > 0 {
info!("removed {} duplicates", no_duplicates);
}
// Transform the `HashSet` into a `Vec` and return it
Ok(urls.into_iter().collect())
}
/// Writes a slice of `Video`s to a file
fn write_to_file<P>(path: P, videos: &[Video]) -> Result<(), Error>
where
P: AsRef<Path>,
{
let mut f = File::create(&path)?;
// Write header.
f.write_all(b"# Created by YTitler\n")?;
f.write_all(b"# See: https://gitlab.com/n1_/ytitler\n\n")?;
f.write_all(b"#EXTM3U\n")?;
// Write content.
for video in videos {
video.serialize(&mut f)?;
}
Ok(())
}