Skip to content

Instantly share code, notes, and snippets.

@sangelxyz
Created October 6, 2023 08:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sangelxyz/6d7c177ad521d6dd158bed97b45fbc29 to your computer and use it in GitHub Desktop.
Save sangelxyz/6d7c177ad521d6dd158bed97b45fbc29 to your computer and use it in GitHub Desktop.
data processing
/// Computes related posts for every post in `../posts.json` and writes the result to
/// `../related_posts_rust.json`.
///
/// Steps:
/// 1. Read and parse the input file into a `Vec<Post>`.
/// 2. Build an index from each tag to the indices of the posts that carry it.
/// 3. For every post, count how many tag occurrences it shares with each other post and hand
///    those counts to `least_n` together with `NUM_TOP_ITEMS`.
///    NOTE(review): `least_n` and the ordering of `PostCount` are defined elsewhere; presumably
///    they select the posts with the highest shared-tag counts — confirm against their
///    definitions.
/// 4. Serialize the result, print the elapsed processing time (total, then the related-posts
///    phase only), and write the output file.
///
/// # Panics
///
/// Panics if the input cannot be read or parsed, if there are more posts than a `u16` index can
/// address, or if the result cannot be serialized or written.
fn main() {
    let json_str =
        std::fs::read_to_string("../posts.json").expect("failed to read ../posts.json");
    let posts: Vec<Post> = from_str(&json_str).expect("failed to parse ../posts.json");

    // Post indices are stored as `u16` below. Without this check, inputs with more than 65536
    // posts would make the `as u16` casts wrap around and silently attribute tags to the wrong
    // posts. Checking once here (outside the timed section) keeps the hot loops cast-only.
    assert!(
        posts.len() <= usize::from(u16::MAX) + 1,
        "too many posts ({}) to index with u16",
        posts.len()
    );

    let total_start = Instant::now();

    // tag -> indices of all posts carrying that tag
    let mut post_tags_map: FxHashMap<&str, Vec<u16>> = FxHashMap::default();
    for (post_idx, post) in posts.iter().enumerate() {
        for tag in post.tags.iter() {
            // Lossless: the assert above guarantees `post_idx` fits in a `u16`.
            post_tags_map.entry(tag).or_default().push(post_idx as u16);
        }
    }

    let related_start = Instant::now();
    let related_posts: Vec<RelatedPosts<'_>> = posts
        .iter()
        .enumerate()
        .map(|(post_idx, post)| {
            // faster than allocating outside the loop
            let mut tagged_post_count = vec![0u16; posts.len()];
            post.tags
                .iter()
                .flat_map(|tag| post_tags_map.get::<str>(tag.as_ref()).into_iter().flatten())
                .for_each(|&other_post_idx| tagged_post_count[other_post_idx as usize] += 1);
            tagged_post_count[post_idx] = 0; // don't recommend the same post

            let top = least_n(
                NUM_TOP_ITEMS,
                tagged_post_count
                    .iter()
                    .enumerate()
                    .map(|(other_idx, &count)| PostCount {
                        // Lossless: guarded by the assert on `posts.len()` above.
                        post: other_idx as u16,
                        count,
                    }),
            );
            let related: Vec<&Post> = top.map(|it| &posts[it.post as usize]).collect();

            RelatedPosts {
                _id: &post._id,
                tags: &post.tags,
                related,
            }
        })
        .collect();
    let related_end = Instant::now();
    let total_end = Instant::now();

    // I have no explanation for why, but doing this before the print improves performance pretty
    // significantly (15%) when using slices in the hashmap key and RelatedPosts
    let json_str =
        serde_json::to_string(&related_posts).expect("failed to serialize related posts");

    println!(
        "Processing time (w/o IO): {:?} {:?}",
        total_end.duration_since(total_start),
        related_end.duration_since(related_start),
    );

    std::fs::write("../related_posts_rust.json", json_str)
        .expect("failed to write ../related_posts_rust.json");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment