Created
August 18, 2017 14:53
-
-
Save timgluz/8dfbbd182d9ae871b3f654da1294dc11 to your computer and use it in GitHub Desktop.
Double referring mutable pointer in the Struct impl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//use std::error::Error; | |
use std::fs::File; | |
use std::path::Path; | |
// TODO: research: will unbuffered writes be a bottleneck, or should a BufWriter be used?
use std::io::{Write, Read}; | |
use std::collections::{HashMap, HashSet}; | |
use serde::{Deserialize, Serialize}; | |
use rmp_serde::{Deserializer, Serializer}; | |
use document::{self, Document}; | |
/// Error returned by the indexing, saving and loading routines of this module.
///
/// It carries nothing but a borrowed, human-readable message.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexerError<'a> {
    /// Description of what went wrong.
    pub message: &'a str,
}

impl<'a> IndexerError<'a> {
    /// Wraps `msg` into an error value without copying it.
    fn new(msg: &'a str) -> IndexerError<'a> {
        IndexerError { message: msg }
    }
}

// `Display` and `Error` let callers print the error and combine it with the
// standard error-handling machinery (`Box<Error>`, `unwrap`, `expect`, ...).
impl<'a> ::std::fmt::Display for IndexerError<'a> {
    fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
        f.write_str(self.message)
    }
}

impl<'a> ::std::error::Error for IndexerError<'a> {}
#[derive(Debug, Serialize, Deserialize)] | |
pub struct Index { | |
pub n_terms: usize, | |
pub n_docs: usize, | |
terms: HashMap<String, usize>, | |
documents: Vec<Document>, | |
term_doc_idx: Vec<Vec<usize>>, // matrix [0 -> [docID1, docID2]] | |
} | |
impl Index { | |
pub fn new() -> Index { | |
Index { | |
n_terms: 0, | |
n_docs: 0, | |
terms: HashMap::new(), // TODO: replace with fst? | |
documents: Vec::new(), | |
term_doc_idx: vec![], | |
} | |
} | |
pub fn add_term(&mut self, term: String) -> Option<usize> { | |
if self.terms.contains_key(&term) { | |
let term_id:usize = *self.terms.get(&term).unwrap(); | |
Some(term_id) | |
} else { | |
let term_id = self.n_terms; | |
self.terms.insert(term, term_id); | |
self.n_terms += 1; | |
Some(term_id) | |
} | |
} | |
pub fn add(&mut self, doc: Document ) -> Result<usize, IndexerError> { | |
let current_doc_id = self.n_docs; | |
//add document into documents; | |
self.documents.push( doc ); | |
self.n_docs += 1; | |
Ok(current_doc_id) | |
} | |
pub fn index_doc(&mut self, doc_id: usize) -> Result<usize, IndexerError> { | |
if doc_id >= self.documents.len() { | |
return Err(IndexerError::new("found no such document")); | |
} | |
// TODO: mover tokenizer into own module | |
let mut doc = self.documents[doc_id].clone(); | |
doc.tokenize(); | |
for term in doc.tcm.keys() { | |
match self.add_term(term.clone()) { | |
None => return Err(IndexerError::new("Failed to add term into index")), | |
Some(term_id) => { | |
// add document into term_doc_idx | |
self.add_doc_into_term_idx(term_id, doc_id); | |
} | |
} | |
} | |
Ok(doc_id) | |
} | |
pub fn index_all(&mut self) -> Result<usize, IndexerError> { | |
let mut n_success = 0; | |
let n_docs:u32 = self.n_docs as u32 ; | |
let mut this = &mut self; // from E0499, but doesnt work | |
for doc_id in 0..n_docs { | |
match self.index_doc(doc_id as usize) { | |
Ok(_) => n_success += 1, | |
Err(e) => return Err(e) | |
} | |
} | |
Ok(n_success) | |
} | |
pub fn get_docs_by_term(&self, term: String) -> Option<Vec<Document>> { | |
if !self.terms.contains_key(&term) { | |
return None; | |
} | |
let term_id = self.terms[&term]; | |
let docs = self.term_doc_idx[term_id] | |
.iter() | |
.fold(vec![], |mut acc, &id|{ | |
acc.push(self.documents[id].clone()); | |
acc | |
}); | |
Some(docs) | |
} | |
fn add_doc_into_term_idx(&mut self, term_id: usize, doc_id: usize) -> Option<usize> { | |
// term doesnt exist in the index | |
if term_id >= self.term_doc_idx.len() { | |
self.term_doc_idx.push(vec![]) | |
//self.term_doc_idx[term_id] = vec![] | |
}; | |
let doc_pos = self.term_doc_idx[term_id].len(); | |
//add doc into term index | |
self.term_doc_idx[term_id].push(doc_id); | |
Some(doc_pos) | |
} | |
} | |
// builds Index from json files found in the path | |
pub fn build_from_path<'a>(target_path: &'a str) -> Result<Index, IndexerError> { | |
let path = Path::new(target_path); | |
if !path.exists() { | |
return Err(IndexerError::new("target path doesnt exists or is not accessible")); | |
} | |
let mut idx = Index::new(); | |
// iterate over files and build docs and add them into index | |
for entry in path.read_dir().expect("read_dir failed") { | |
if let Ok(metadata) = entry { | |
// add new document into index only if parsing was successful | |
if let Ok(doc) = document::from_json_file(metadata.path()) { | |
idx.add(doc); | |
} | |
} | |
} | |
Ok(idx) | |
} | |
// dump index into file | |
pub fn save<'a>(idx: &Index, target_path: &'a str) -> Result<bool, IndexerError<'a>> { | |
let mut fp = match File::create(target_path) { | |
Ok(fp) => fp, | |
Err(_) => return Err(IndexerError::new("Failed to open targetfile")) | |
}; | |
let mut buf: Vec<u8> = Vec::new(); | |
idx.serialize(&mut Serializer::new(&mut buf)).expect("Failed to serialize index"); | |
fp.write_all(&buf).expect("Failed to write into file"); | |
fp.sync_all().expect("Failed to save file on the disk"); | |
Ok(true) | |
} | |
pub fn load<'a>(source_path: &'a str) -> Result<Index, IndexerError> { | |
let mut fp = match File::open(source_path) { | |
Ok(fp) => fp, | |
Err(_) => return Err(IndexerError::new("Failed to open sourcefile")) | |
}; | |
let mut buf = Vec::new(); | |
fp.read_to_end(&mut buf).expect("Failed to read a content of the sourcefile"); | |
let mut de = Deserializer::new(&buf[..]); | |
match Deserialize::deserialize(&mut de) { | |
Ok(idx) => Ok(idx), | |
Err(_) => Err(IndexerError::new("Failed to deserialize file buffer")) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment