Skip to content

Instantly share code, notes, and snippets.

@kiwiyou
Created December 27, 2019 18:13
Show Gist options
  • Save kiwiyou/6310599e0fd647e6c896f94b4569a47b to your computer and use it in GitHub Desktop.
Save kiwiyou/6310599e0fd647e6c896f94b4569a47b to your computer and use it in GitHub Desktop.
네이버 한자사전 고사성어 크롤러
# Cargo manifest for the idiom crawler binary.
[package]
name = "hanja-idiom"
version = "0.1.0"
authors = ["kiwiyou <kiwiyou@protonmail.com>"]
edition = "2018"
# Crates used by the crawler.
[dependencies]
# HTTP client: the crawler calls `reqwest::get` and reads each body with `.text()`.
reqwest = "0.9"
# `FromHtml` trait and its derive macro, used by the page-scraping structs.
unhtml = "0.7"
unhtml_derive = "0.7"
# `Serialize` derive for the `Idiom` record.
serde = "1.0"
serde_derive = "1.0"
# CSV writer used to produce `idiom.csv`.
csv = "1.1"
# Parallel iterators (`par_iter`) used while crawling.
rayon = "1.3"
use rayon::prelude::*;
use reqwest::get;
use serde_derive::*;
use unhtml::{self, FromHtml};
use unhtml_derive::*;
// Scraped view of the element matched by the `.tab_submenu` selector.
// Only the number of entries is used elsewhere in this file: `main` fetches
// one initial tab per index in `onsets`.
#[derive(FromHtml)]
#[html(selector = ".tab_submenu")]
struct OnsetTab {
// One `Onset` marker per `li` element found under the tab.
#[html(selector = "li")]
onsets: Vec<Onset>,
}
// Field-less marker for a single onset entry; it carries no data, so only
// the count of parsed entries is meaningful.
#[derive(FromHtml)]
struct Onset;
// Scraped view of the element matched by the `.sub_word` selector.
#[derive(FromHtml)]
#[html(selector = ".sub_word")]
struct InitialTab {
// One string per `span` element. Each value is later passed as the `q`
// query parameter when fetching word lists.
// NOTE(review): `attr = "inner"` presumably yields the element's inner
// text -- confirm against the unhtml documentation.
#[html(selector = "span", attr = "inner")]
initials: Vec<String>,
}
// Scraped view of one word-list page, rooted at the `#content` element.
// `readings` and `meanings` are zipped index-by-index by the crawler, so
// they are expected to line up (the test module asserts equal lengths).
#[derive(Debug, FromHtml)]
#[html(selector = "#content")]
struct WordList {
// Contents of every `dd` element that has no `class` attribute.
#[html(selector = "dd:not([class])", attr = "inner")]
readings: Vec<String>,
// Contents of every `dd.meaning` element.
#[html(selector = "dd.meaning", attr = "inner")]
meanings: Vec<String>,
// Numbers taken from the class-less children of `.paginate`; the crawler
// skips the first entry and fetches the rest.
// NOTE(review): presumably these are the page numbers shown in the
// pagination bar -- confirm against the live markup.
#[html(selector = ".paginate > *:not([class])", attr = "inner")]
pages: Vec<usize>,
// Contents of the `.next` child of `.paginate`, if present. Only its
// presence is checked (`None` stops the sampling loop).
#[html(selector = ".paginate > .next", attr = "inner")]
next: Option<String>,
}
/// Downloads the idiom category index page and parses its onset tab.
///
/// # Panics
///
/// Panics if the HTTP request fails, the body cannot be read as text, or the
/// page cannot be parsed into an [`OnsetTab`].
fn parse_onset_tab() -> OnsetTab {
    let body = get("https://hanja.dict.naver.com/category/idiom")
        .unwrap()
        .text()
        .unwrap();
    OnsetTab::from_html(&body).unwrap()
}
/// Downloads the idiom category page for the given `idx` value and parses
/// the list of initials shown on it.
///
/// `index` is substituted verbatim into the `idx` query parameter.
///
/// # Panics
///
/// Panics if the HTTP request fails, the body cannot be read as text, or the
/// page cannot be parsed into an [`InitialTab`].
fn parse_initial_tab(index: usize) -> InitialTab {
    let url = format!("https://hanja.dict.naver.com/category/idiom?idx={}", index);
    let body = get(&url).unwrap().text().unwrap();
    InitialTab::from_html(&body).unwrap()
}
/// Downloads one page of the word list for `initial` and parses it.
///
/// `initial` becomes the `q` query parameter and `page` the `pageNo`
/// parameter of the request URL.
///
/// # Panics
///
/// Panics if the HTTP request fails, the body cannot be read as text, or the
/// page cannot be parsed into a [`WordList`].
fn parse_word_list<S: AsRef<str>>(initial: S, page: usize) -> WordList {
    let url = format!(
        "https://hanja.dict.naver.com/category/idiom?q={}&pageNo={}",
        initial.as_ref(),
        page
    );
    let body = get(&url).unwrap().text().unwrap();
    WordList::from_html(&body).unwrap()
}
/// One idiom record as written to the output CSV.
///
/// The serde `rename_all = "PascalCase"` attribute renames the fields to
/// `Reading` and `Meaning` when serialized.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "PascalCase")]
struct Idiom {
/// Taken from the matching entry of `WordList::readings`.
reading: String,
/// Taken from the matching entry of `WordList::meanings`.
meaning: String,
}
fn crawl_for_initial<S: AsRef<str>>(initial: S) -> Vec<Idiom> {
let initial_ref = initial.as_ref();
let sample_list = sample_for_initial(initial_ref);
sample_list
.par_iter()
.flat_map(|sample| {
let first = sample
.readings
.par_iter()
.zip(sample.meanings.par_iter())
.map(|(reading, meaning)| Idiom {
reading: reading.clone(),
meaning: meaning.clone(),
})
.collect::<Vec<_>>();
let from_second = sample
.pages
.par_iter()
.skip(1)
.map(|page| parse_word_list(initial_ref, *page))
.flat_map(|list| {
list.readings
.par_iter()
.zip(list.meanings.par_iter())
.map(|(reading, meaning)| Idiom {
reading: reading.clone(),
meaning: meaning.clone(),
})
.collect::<Vec<_>>()
});
first
.par_iter()
.cloned()
.chain(from_second)
.collect::<Vec<_>>()
})
.collect()
}
/// Fetches every tenth word-list page for `initial`, starting at page 1
/// (1, 11, 21, ...), and returns the parsed pages in fetch order.
///
/// Sampling stops after the first page whose `next` field is `None`; that
/// page is still included in the result.
///
/// # Panics
///
/// Panics if any page fetch or parse fails (see `parse_word_list`).
fn sample_for_initial<S: AsRef<str>>(initial: S) -> Vec<WordList> {
    let initial = initial.as_ref();
    let mut samples = Vec::new();
    for page in (1..).step_by(10) {
        let sample = parse_word_list(initial, page);
        let has_next = sample.next.is_some();
        samples.push(sample);
        if !has_next {
            break;
        }
    }
    samples
}
fn main() {
println!("[+] crawling onset list...");
let onset_tab = parse_onset_tab();
let initial_tabs = onset_tab
.onsets
.par_iter()
.enumerate()
.map(|(i, _)| parse_initial_tab(i));
println!("[+] crawling idiom list...");
let mut idioms = initial_tabs
.flat_map(|tab| {
tab.initials
.par_iter()
.flat_map(|initial| crawl_for_initial(initial))
.collect::<Vec<_>>()
})
.collect::<Vec<_>>();
idioms.sort_unstable_by(|a, b| a.reading.partial_cmp(&b.reading).unwrap());
println!("[+] saving idioms...");
let mut writer = csv::WriterBuilder::new()
.delimiter(b',')
.quote_style(csv::QuoteStyle::NonNumeric)
.from_path("idiom.csv")
.unwrap();
for idiom in idioms.iter() {
writer.serialize(idiom).unwrap();
}
writer.flush().unwrap();
println!("[+] list saved into idiom.csv!");
}
// These tests call the parsing functions directly, so they issue real HTTP
// requests and compare against counts hard-coded here.
#[cfg(test)]
mod test {
    use super::*;

    /// The onset tab is expected to contain 14 entries.
    #[test]
    fn onset_tab_should_parse_correct_onset_list() {
        let onset_count = parse_onset_tab().onsets.len();
        assert_eq!(14, onset_count);
    }

    /// The initial tab at index 0 is expected to contain 55 initials.
    #[test]
    fn initial_tab_should_parse_correct_initial_list() {
        let initial_count = parse_initial_tab(0).initials.len();
        assert_eq!(55, initial_count);
    }

    /// Readings and meanings on a word-list page must pair up one-to-one.
    #[test]
    fn word_list_should_have_equal_lengths_for_readings_and_meanings() {
        let WordList {
            readings, meanings, ..
        } = parse_word_list("가", 0);
        assert_eq!(readings.len(), meanings.len());
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment