Skip to content

Instantly share code, notes, and snippets.

@ronanyeah
Created April 4, 2020 03:02
Show Gist options
  • Save ronanyeah/c515615ddbf81c70aca17fceb3ca4d06 to your computer and use it in GitHub Desktop.
Save ronanyeah/c515615ddbf81c70aca17fceb3ca4d06 to your computer and use it in GitHub Desktop.
YouTube free movies scraper
use scraper::{Html, Selector};
use std::fs::File;
use std::io::prelude::*;
/// Value sent in the `User-Agent` header of the page request (a desktop Chrome 80 on Linux string).
const USER_AGENT: &str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36";
/// Address of the YouTube storefront feed page that is downloaded and scraped.
// NOTE(review): the opaque `bp` query parameter presumably selects which storefront
// listing is shown — confirm before changing it.
const URL: &str = "https://www.youtube.com/feed/storefront?bp=kgEmCGQSIlBMSFBUeFR4dEMwaWJWWnJUMl9XS1dVbDJTQXhzS3VLd3iiBQIoAg%3D%3D";
/// One movie entry as written to the output JSON file.
///
/// Field order is significant: it determines the key order in the serialized output.
#[derive(serde::Serialize, serde::Deserialize)]
struct Movie {
    /// Video identifier, taken from the renderer's `videoId` field.
    id: String,
    /// URL of the first thumbnail listed for the movie.
    img: String,
    /// Text of the first run of the movie's title.
    title: String,
}
/// Downloads the page at `URL` and returns its body as text.
///
/// The request is sent through `client` with the `User-Agent` header set to
/// `USER_AGENT`.
///
/// # Errors
///
/// Returns the underlying `reqwest::Error` if the request cannot be sent, if
/// the server answers with a 4xx/5xx status, or if the body cannot be read
/// and decoded as text.
async fn fetch(client: reqwest::Client) -> Result<String, reqwest::Error> {
    client
        .get(URL)
        .header("User-Agent", USER_AGENT)
        .send()
        .await?
        // Without this, an error page's body would be handed to the HTML
        // parser and fail much later with an unrelated message.
        .error_for_status()?
        .text()
        .await
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let client = reqwest::Client::new();
let txt = fetch(client).await?;
let selector = Selector::parse("script").expect("bad selector");
let document = Html::parse_document(&txt);
let sel = document.select(&selector);
let res = sel
.map(|element| element.text().collect::<Vec<_>>())
.find(|text| text.iter().any(|x| x.contains(&"ytInitialData")))
.expect("no yt");
let mut ok = res
.get(0)
.expect("bad index")
.split("\n")
.collect::<Vec<_>>()
.iter()
.filter(|x| !x.is_empty())
.collect::<Vec<_>>()
.get(0)
.expect("bad index 2")
.chars()
.skip(30)
.collect::<String>();
ok.pop();
let json: serde_json::Value = serde_json::from_str(&ok)?;
let inner = &json.as_object().expect("1")["contents"]
.as_object()
.expect("2")["twoColumnBrowseResultsRenderer"]
.as_object()
.expect("3")["tabs"][0]
.as_object()
.expect("4")["tabRenderer"]
.as_object()
.expect("5")["content"]
.as_object()
.expect("6")["sectionListRenderer"]
.as_object()
.expect("7")["contents"][0]
.as_object()
.expect("6")["itemSectionRenderer"]
.as_object()
.expect("7")["contents"][0]
.as_object()
.expect("8")["shelfRenderer"]
.as_object()
.expect("9")["content"]
.as_object()
.expect("10")["gridRenderer"]
.as_object()
.expect("11")["items"]
.as_array()
.unwrap()
.iter()
.map(|item| {
let obj = item.as_object().unwrap()["gridMovieRenderer"]
.as_object()
.unwrap();
Movie {
id: obj["videoId"].as_str().unwrap().to_string(),
img: obj["thumbnail"].as_object().unwrap()["thumbnails"][0]
.as_object()
.unwrap()["url"]
.as_str()
.unwrap()
.to_string(),
title: obj["title"].as_object().unwrap()["runs"][0]
.as_object()
.unwrap()["text"]
.as_str()
.unwrap()
.to_string(),
}
})
.collect::<Vec<_>>();
let mut file = File::create("./res.json")?;
file.write_all(serde_json::to_string(inner)?.as_bytes())?;
Ok(())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment