Trieve Good Chunker Attempt In Rust
use std::collections::{HashMap, HashSet};
use std::sync::Arc;

// External crates assumed by this file: `regex`, `scraper`, and `log`.
use regex::Regex;
use scraper::{Html, Selector};

/// Split text into sentences on runs of terminal punctuation (`.`, `!`, `?`).
pub fn get_sentences(text: &str) -> Vec<String> {
    let split_sentence_regex = Regex::new(r"[.!?]+").expect("Invalid regex");
    split_sentence_regex
        .split(text)
        .map(|x| x.to_string())
        .collect()
}
/// Split text into words on runs of whitespace.
pub fn get_words(text: &str) -> Vec<String> {
    let split_word_regex = Regex::new(r"\s+").expect("Invalid regex");
    split_word_regex
        .split(text)
        .map(|x| x.to_string())
        .collect()
}
/// Fraction of the words in `text` that appear in the English dictionary.
pub fn percentage_english_words(text: &str, english_dict: Arc<HashMap<String, bool>>) -> f32 {
    let words = get_words(text);
    let english_word_count = words
        .iter()
        .filter(|word| english_dict.contains_key(*word))
        .count();
    (english_word_count as f32) / (words.len() as f32)
}
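
// A minimal sketch of how the tokenizer helpers above behave; not part of
// the original gist. The inputs and test names are illustrative only.
#[cfg(test)]
mod tokenizer_tests {
    use super::*;

    #[test]
    fn sentences_split_on_terminal_punctuation() {
        let sentences = get_sentences("First one. Second one! Third?");
        // The trailing "?" produces one empty final segment.
        assert_eq!(sentences.len(), 4);
        assert_eq!(sentences[0], "First one");
    }

    #[test]
    fn english_percentage_counts_dictionary_hits() {
        let dict: Arc<HashMap<String, bool>> =
            Arc::new([("hello".to_string(), true)].into_iter().collect());
        // One of two words is in the dictionary.
        assert_eq!(percentage_english_words("hello zzzz", dict), 0.5);
    }
}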
/// Split a single over-long sentence into pieces of at most `word_limit`
/// words, increasing the number of pieces until every piece fits.
pub fn loop_split_single_sentence(sentences: Vec<String>, word_limit: usize) -> Vec<String> {
    let first_sentence = sentences.first().cloned().unwrap_or_default();
    let first_sentence_word_count = get_words(&first_sentence).len();
    let mut max_single_sentence_word_count = first_sentence_word_count;
    let mut word_split_factor = 1;
    let mut new_sentences = sentences.clone();
    while max_single_sentence_word_count > word_limit {
        word_split_factor += 1;
        let mut words = get_words(&first_sentence);
        // Integer division already floors; no float round-trip is needed.
        let new_word_size = first_sentence_word_count / word_split_factor;
        // The remainder must come from the total word count (not the current
        // maximum piece size) so the piece lengths sum back to the original
        // sentence length; otherwise words could be lost or the drain below
        // could go out of bounds.
        let mut remainder = first_sentence_word_count % word_split_factor;
        let mut word_lengths = vec![new_word_size; word_split_factor];
        // Distribute the leftover words, one per piece, from the front.
        while remainder > 0 {
            word_lengths[remainder - 1] += 1;
            remainder -= 1;
        }
        new_sentences = vec![];
        for word_length in word_lengths.iter() {
            let new_sentence = words
                .iter()
                .take(*word_length)
                .map(|x| x.to_string())
                .collect::<Vec<String>>()
                .join(" ");
            new_sentences.push(new_sentence);
            words.drain(0..*word_length);
        }
        max_single_sentence_word_count = new_sentences
            .iter()
            .map(|s| get_words(s).len())
            .max()
            .unwrap_or(0);
    }
    new_sentences
}
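
// A quick check of loop_split_single_sentence, assuming the fixed remainder
// logic above; not part of the original gist. A 10-word sentence with a
// limit of 4 should come back as pieces of 4, 3, and 3 words.
#[cfg(test)]
mod split_sentence_tests {
    use super::*;

    #[test]
    fn splits_long_sentence_under_word_limit() {
        let sentence = "one two three four five six seven eight nine ten".to_string();
        let pieces = loop_split_single_sentence(vec![sentence], 4);
        assert!(pieces.iter().all(|p| get_words(p).len() <= 4));
        // No words are lost in the split.
        assert_eq!(
            pieces.iter().map(|p| get_words(p).len()).sum::<usize>(),
            10
        );
    }
}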
#[derive(Debug, Clone)]
pub struct ParsedChunk {
    pub heading: String,
    pub content_items: Vec<String>,
}

impl ParsedChunk {
    pub fn new(heading: String, content_items: Vec<String>) -> ParsedChunk {
        ParsedChunk {
            heading,
            content_items,
        }
    }
    /// Normalize the first content item, split it until it fits under the
    /// word limit, and render each piece as an HTML chunk.
    pub fn output(&self, english_dict: Arc<HashMap<String, bool>>) -> Vec<String> {
        let mut first_content_item = self.content_items.first().cloned().unwrap_or_default();
        // Collapse every whitespace run (spaces, tabs, newlines) to a single
        // space, then trim the ends. The original separate tab and newline
        // regexes were no-ops after this collapse, so one pass suffices.
        let whitespace_regex = Regex::new(r"\s+").expect("Invalid regex");
        first_content_item = whitespace_regex
            .replace_all(&first_content_item, " ")
            .trim()
            .to_string();
        // Drop bodies too short to make a useful chunk.
        if get_words(&first_content_item).len() < 20 {
            return vec![];
        }
        // Space added so the heading does not fuse with the first word.
        let total_content = format!("{} {}", self.heading, first_content_item);
        let heading_word_count = get_words(&self.heading).len();
        let mut largest_content_item_word_count = get_words(&first_content_item).len();
        let mut split_factor = 1;
        let mut new_p_bodies = vec![first_content_item.clone()];
        let word_limit = 340;
        // Split into more pieces until every piece plus its heading fits
        // under the word limit.
        while (heading_word_count + largest_content_item_word_count) > word_limit {
            split_factor += 1;
            let mut sentences = get_sentences(&total_content);
            // Integer division already floors; no float round-trip needed.
            let new_html_size = sentences.len() / split_factor;
            let mut remainder = sentences.len() % split_factor;
            let mut lengths = vec![new_html_size; split_factor];
            // Distribute the leftover sentences one per piece from the front.
            while remainder > 0 {
                lengths[remainder - 1] += 1;
                remainder -= 1;
            }
            lengths.retain(|&x| x > 0);
            new_p_bodies = vec![];
            for length in lengths.iter() {
                let temp_sentences = sentences
                    .iter()
                    .take(*length)
                    .map(|x| x.to_string())
                    .collect::<Vec<String>>();
                let mut new_sentences = temp_sentences.clone();
                // A lone sentence that is still over the limit must be split
                // on word boundaries instead of sentence boundaries.
                if *length == 1 {
                    new_sentences = loop_split_single_sentence(
                        temp_sentences.clone(),
                        word_limit - heading_word_count,
                    );
                }
                for new_sentence in new_sentences.iter() {
                    new_p_bodies.push(new_sentence.to_string());
                }
                sentences.drain(0..*length);
            }
            largest_content_item_word_count = new_p_bodies
                .iter()
                .map(|body| get_words(body).len())
                .max()
                .unwrap_or(0);
        }
        let mut html_chunks: Vec<String> = vec![];
        for body in new_p_bodies.iter() {
            let words = get_words(body);
            // Collect into a set so the count really is of distinct English
            // words, as the variable name implies.
            let unique_english_words = words
                .iter()
                .filter(|x| english_dict.contains_key(*x))
                .collect::<HashSet<_>>();
            let count_unique_english_words = unique_english_words.len();
            let english_percentage = percentage_english_words(body, english_dict.clone());
            // Heuristics to skip bodies that are mostly non-English noise.
            if count_unique_english_words < 10 && english_percentage < 0.75 {
                continue;
            }
            if count_unique_english_words < 30 && english_percentage < 0.10 {
                continue;
            }
            let mut cur_html = "<div>".to_string();
            if !self.heading.is_empty() {
                cur_html.push_str(&format!("<h3>{}</h3>", self.heading));
            }
            cur_html.push_str(&format!("<p>{}</p>", body));
            cur_html.push_str("</div>");
            html_chunks.push(cur_html);
        }
        html_chunks
    }
}
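
// A small sketch of ParsedChunk::output's filtering, assuming the code
// above; not part of the original gist. The body text and dictionary are
// illustrative: a 21-word all-English body survives, a 2-word body is
// dropped by the 20-word minimum.
#[cfg(test)]
mod parsed_chunk_tests {
    use super::*;

    #[test]
    fn short_bodies_are_dropped_and_english_bodies_kept() {
        let body = "the quick brown fox jumps over the lazy dog and then runs far away into the quiet green hills before dark"
            .to_string();
        let dict: Arc<HashMap<String, bool>> =
            Arc::new(get_words(&body).into_iter().map(|w| (w, true)).collect());

        let chunk = ParsedChunk::new("Heading".to_string(), vec![body]);
        let html = chunk.output(dict.clone());
        assert_eq!(html.len(), 1);
        assert!(html[0].starts_with("<div><h3>Heading</h3>"));

        // Bodies under 20 words are filtered out entirely.
        let tiny = ParsedChunk::new("Heading".to_string(), vec!["too short".to_string()]);
        assert!(tiny.output(dict).is_empty());
    }
}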
/// Walk the document, group text under the nearest heading, and emit HTML
/// chunks.
pub fn chunk_html(html_content: &str, english_dict: Arc<HashMap<String, bool>>) -> Vec<String> {
    let mut cur_heading = "".to_string();
    let mut chunks: Vec<ParsedChunk> = vec![];
    let dom = Html::parse_document(html_content);
    // Walk every element in the document. Iterating the raw node tree does
    // not yield element handles, so select "*" from the parsed DOM instead.
    let selector = Selector::parse("*").expect("valid selector");
    log::info!("Found {} elements", dom.select(&selector).count());
    for node in dom.select(&selector) {
        log::info!("Processing node");
        let text = node.text().collect::<String>();
        let tag_name = node.value().name();
        // Skip nodes with no visible text.
        if text.trim().is_empty() {
            continue;
        }
        if ["h1", "h2", "h3", "h4", "h5", "h6"].contains(&tag_name) {
            cur_heading = text;
        } else if ["ul", "ol"].contains(&tag_name) {
            chunks.push(ParsedChunk::new(cur_heading.clone(), vec![text]));
        } else if ["p", "div"].contains(&tag_name) {
            // A p/div whose only non-trivial text sits in a single bold or
            // italic child is treated as a heading rather than as content.
            let sub_children = node
                .select(&Selector::parse("b, i, em, strong").expect("valid selector"))
                .filter_map(|x| {
                    let text = x.text().collect::<String>();
                    if text != " " && text != "\n" {
                        Some(text)
                    } else {
                        None
                    }
                })
                .collect::<Vec<String>>();
            if sub_children.len() == 1 {
                cur_heading = sub_children
                    .first()
                    .expect("must exist at this point")
                    .clone();
                continue;
            }
            chunks.push(ParsedChunk::new(cur_heading.clone(), vec![text]));
            cur_heading = "".to_string();
        }
    }
    log::info!("Found {} chunks", chunks.len());
    let ret_chunks: Vec<String> = chunks
        .into_iter()
        .flat_map(|x| x.output(english_dict.clone()))
        .collect();
    log::info!("Returning {} chunks", ret_chunks.len());
    ret_chunks
}
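
// Hypothetical usage sketch, not part of the original gist: build the
// dictionary from a newline-delimited word list (the "english_words.txt"
// path and the HTML snippet below are illustrative only) and run the
// chunker over a document.
fn main() {
    let english_dict: Arc<HashMap<String, bool>> = Arc::new(
        std::fs::read_to_string("english_words.txt")
            .unwrap_or_default()
            .lines()
            .map(|word| (word.to_string(), true))
            .collect(),
    );
    let html = "<h2>Intro</h2><p>Some long body text ...</p>";
    for chunk in chunk_html(html, english_dict.clone()) {
        println!("{}", chunk);
    }
}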