Created
December 12, 2023 19:56
-
-
Save vsbuffalo/958c6f95f90cbae329709bd9b581d8a3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::{io, marker::PhantomData}; | |
use std::io::BufRead; | |
use coitrees::{COITree, Interval, IntervalTree}; | |
use fnv::FnvBuildHasher; | |
use indexmap::IndexMap; | |
use ndarray::{Array2, Array1}; | |
use serde::Deserialize; | |
use thiserror::Error; | |
use super::utils::{InputFile, FileError, BinEdges}; | |
pub type GenomeCoord = i32; | |
type IntervalIndexMap = IndexMap<String, Vec<Interval<Option<usize>>>, FnvBuildHasher>; | |
type IntervalTreeMap = IndexMap<String, Vec<Interval<Option<usize>>>, FnvBuildHasher>; | |
#[derive(Debug, Error)] | |
pub enum GRangesError { | |
#[error("File reading eror: {0}")] | |
FileError(#[from] FileError), | |
#[error("Invalid column type entry: {0}")] | |
InvalidColumnType(String), | |
#[error("Error reading line from a range file")] | |
RangeFileReaderLineError, | |
#[error("IO error: {0}")] | |
IOError(#[from] io::Error), | |
#[error("Error parsing data")] | |
DataParserError, | |
} | |
#[derive(Debug, Clone)] | |
pub struct RangeEntry<T> { | |
pub chrom: String, | |
pub start: GenomeCoord, | |
pub end: GenomeCoord, | |
pub data: Option<T>, | |
} | |
struct RangeFileReader<'a, T> | |
where | |
T: RangeLineParser<T> + Default, | |
{ | |
path: &'a str, | |
parser: PhantomData<T>, | |
} | |
impl<'a, T> RangeFileReader<'a, T> | |
where T: RangeLineParser<T> + Default { | |
fn new(path: &'a str) -> Self { | |
RangeFileReader { path, parser: PhantomData } | |
} | |
fn load(&self) -> Result<GRanges<T>, GRangesError> { | |
let input_file = InputFile::new(self.path); | |
let buf_reader = input_file.reader()?; | |
let mut ranges: GRanges<T> = GRanges::default(); | |
for line in buf_reader.lines() { | |
let line = line.map_err(|_| GRangesError::RangeFileReaderLineError)?; | |
if line.starts_with("#") { | |
continue; | |
} | |
let entry = T::parse_line(&line)?; | |
ranges.push_entry(entry); | |
} | |
Ok(ranges) | |
} | |
} | |
pub trait RangeLineParser<T> { | |
fn parse_line(line: &str) -> Result<RangeEntry<T>, GRangesError>; | |
} | |
pub fn parse_column<T: std::str::FromStr>(column: &str, line: &str) -> Result<T, GRangesError> | |
where | |
<T as std::str::FromStr>::Err: std::fmt::Debug, | |
{ | |
column.parse::<T>().map_err(|_| GRangesError::InvalidColumnType(line.to_string())) | |
} | |
impl RangeLineParser<f64> for f64 { | |
fn parse_line(line: &str) -> Result<RangeEntry<f64>, GRangesError> { | |
let columns: Vec<&str> = line.split('\t').collect(); | |
if columns.len() < 3 { | |
let msg = format!("too few columns on line {:?}", line.clone()); | |
return Err(GRangesError::InvalidColumnType(msg)); | |
} | |
let chrom = parse_column(columns[0], line)?; | |
let start: GenomeCoord = parse_column(columns[1], line)?; | |
let end: GenomeCoord = parse_column(columns[2], line)?; | |
let score: f64 = parse_column(columns[3], line)?; | |
let entry = RangeEntry { | |
chrom, | |
start, | |
end, | |
data: Some(score), | |
}; | |
Ok(entry) | |
} | |
} | |
pub struct GRangesFlatIterator<'a, T> { | |
inner_iter: Box<dyn Iterator<Item = (&'a String, GenomeCoord, GenomeCoord, Option<&'a T>)> + 'a>, | |
} | |
impl<'a, T> GRangesFlatIterator<'a, T> | |
where T: Default | |
{ | |
fn new(granges: &'a GRanges<T>) -> Self { | |
let inner_iter = granges.ranges.iter().flat_map(move |(chrom, intervals)| { | |
intervals.iter().map(move |interval| { | |
( | |
chrom, | |
interval.first, | |
interval.last, | |
interval.metadata.and_then(|idx| granges.data.get(idx)), | |
) | |
}) | |
}); | |
GRangesFlatIterator { | |
inner_iter: Box::new(inner_iter), | |
} | |
} | |
} | |
impl<'a, T> Iterator for GRangesFlatIterator<'a, T> { | |
type Item = (&'a String, GenomeCoord, GenomeCoord, Option<&'a T>); | |
fn next(&mut self) -> Option<Self::Item> { | |
self.inner_iter.next() | |
} | |
} | |
// The | |
trait GRangesIndexer<T> { | |
fn get(&self, index: usize) -> Option<&T>; | |
} | |
impl<T> GRangesIndexer<T> for GenomicData<T> { | |
fn get(&self, index: usize) -> Option<&T> { | |
match self { | |
GenomicData::VecData(vec) => vec.get(index), | |
GenomicData::Array1Data(array) => { | |
array.get(index) | |
}, | |
GenomicData::Array2Data(array) => { | |
array.get((index, ..)) | |
} | |
} | |
} | |
} | |
#[derive(Clone)] | |
pub enum GenomicData<T> { | |
VecData(Vec<T>), | |
Array1Data(Array1<T>), | |
Array2Data(Array2<T>), | |
} | |
impl<T: Default> Default for GenomicData<T> { | |
fn default() -> Self { | |
GenomicData::VecData(Vec::new()) | |
} | |
} | |
#[derive(Default)] | |
pub struct GRanges<T> | |
where T: Default { | |
ranges: IntervalIndexMap, | |
trees: Option<IntervalTreeMap>, | |
seqlens: IndexMap<String, GenomeCoord>, | |
data: GenomicData<T>, | |
} | |
// Reading from file is defined if T has a RangeLineParser trait | |
impl<T> GRanges<T> | |
where T: RangeLineParser<T> + Default { | |
pub fn from_file(&mut self, path: &str) { | |
let reader: RangeFileReader<T> = RangeFileReader::new(path); | |
reader.load(); | |
} | |
} | |
impl<T> GRanges<T> | |
where T: Default { | |
pub fn push_entry(&mut self, entry: RangeEntry<T>) { | |
// take ownership of entry.chrom | |
self.push(entry.chrom, entry.start, entry.end, entry.data); | |
} | |
pub fn push(&mut self, chrom: String, | |
start: GenomeCoord, end: GenomeCoord, | |
data: Option<T>) -> () { | |
let data_index = match data { | |
Some(range_data) => { | |
match self.data { | |
GenomicData::VecData(vec) => { | |
// the index of the data about to be inserted | |
// is the current length | |
let idx = Some(vec.len()); | |
vec.push(range_data); | |
idx | |
}, | |
_ => panic!("only VecData GenomicData is appendable.") | |
} | |
}, | |
None => None | |
}; | |
// NOTE: internally we store ranges like [start, end), | |
// but coitrees is end-inclusive so we need | |
// to convert at that stage. | |
let range = Interval::new(start, end, data_index); | |
self.ranges.entry(chrom.to_string()) | |
.or_insert(Vec::new()) | |
.push(range); | |
} | |
pub fn flat_iter(&self) -> GRangesFlatIterator<T> { | |
GRangesFlatIterator::new(self) | |
} | |
pub fn calc_coitrees(&mut self) { | |
} | |
} | |
// trait for mapping the data struct to some sort of array | |
pub trait ToArray { | |
type Output; | |
fn to_array(&self) -> Self::Output; | |
} | |
impl<T> GRanges<T> | |
where T: ToArray + Default { | |
fn to_array(&self) -> T::Output { | |
self.flat_iter() | |
.map_filter(|(_, _, _, data)| data.to_array()).collect() | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment