Skip to content

Instantly share code, notes, and snippets.

@vsbuffalo
Created December 12, 2023 19:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vsbuffalo/958c6f95f90cbae329709bd9b581d8a3 to your computer and use it in GitHub Desktop.
Save vsbuffalo/958c6f95f90cbae329709bd9b581d8a3 to your computer and use it in GitHub Desktop.
use std::{io, marker::PhantomData};
use std::io::BufRead;
use coitrees::{COITree, Interval, IntervalTree};
use fnv::FnvBuildHasher;
use indexmap::IndexMap;
use ndarray::{Array2, Array1};
use serde::Deserialize;
use thiserror::Error;
use super::utils::{InputFile, FileError, BinEdges};
pub type GenomeCoord = i32;
type IntervalIndexMap = IndexMap<String, Vec<Interval<Option<usize>>>, FnvBuildHasher>;
type IntervalTreeMap = IndexMap<String, Vec<Interval<Option<usize>>>, FnvBuildHasher>;
#[derive(Debug, Error)]
pub enum GRangesError {
#[error("File reading eror: {0}")]
FileError(#[from] FileError),
#[error("Invalid column type entry: {0}")]
InvalidColumnType(String),
#[error("Error reading line from a range file")]
RangeFileReaderLineError,
#[error("IO error: {0}")]
IOError(#[from] io::Error),
#[error("Error parsing data")]
DataParserError,
}
#[derive(Debug, Clone)]
pub struct RangeEntry<T> {
pub chrom: String,
pub start: GenomeCoord,
pub end: GenomeCoord,
pub data: Option<T>,
}
struct RangeFileReader<'a, T>
where
T: RangeLineParser<T> + Default,
{
path: &'a str,
parser: PhantomData<T>,
}
impl<'a, T> RangeFileReader<'a, T>
where T: RangeLineParser<T> + Default {
fn new(path: &'a str) -> Self {
RangeFileReader { path, parser: PhantomData }
}
fn load(&self) -> Result<GRanges<T>, GRangesError> {
let input_file = InputFile::new(self.path);
let buf_reader = input_file.reader()?;
let mut ranges: GRanges<T> = GRanges::default();
for line in buf_reader.lines() {
let line = line.map_err(|_| GRangesError::RangeFileReaderLineError)?;
if line.starts_with("#") {
continue;
}
let entry = T::parse_line(&line)?;
ranges.push_entry(entry);
}
Ok(ranges)
}
}
pub trait RangeLineParser<T> {
fn parse_line(line: &str) -> Result<RangeEntry<T>, GRangesError>;
}
pub fn parse_column<T: std::str::FromStr>(column: &str, line: &str) -> Result<T, GRangesError>
where
<T as std::str::FromStr>::Err: std::fmt::Debug,
{
column.parse::<T>().map_err(|_| GRangesError::InvalidColumnType(line.to_string()))
}
impl RangeLineParser<f64> for f64 {
fn parse_line(line: &str) -> Result<RangeEntry<f64>, GRangesError> {
let columns: Vec<&str> = line.split('\t').collect();
if columns.len() < 3 {
let msg = format!("too few columns on line {:?}", line.clone());
return Err(GRangesError::InvalidColumnType(msg));
}
let chrom = parse_column(columns[0], line)?;
let start: GenomeCoord = parse_column(columns[1], line)?;
let end: GenomeCoord = parse_column(columns[2], line)?;
let score: f64 = parse_column(columns[3], line)?;
let entry = RangeEntry {
chrom,
start,
end,
data: Some(score),
};
Ok(entry)
}
}
pub struct GRangesFlatIterator<'a, T> {
inner_iter: Box<dyn Iterator<Item = (&'a String, GenomeCoord, GenomeCoord, Option<&'a T>)> + 'a>,
}
impl<'a, T> GRangesFlatIterator<'a, T>
where T: Default
{
fn new(granges: &'a GRanges<T>) -> Self {
let inner_iter = granges.ranges.iter().flat_map(move |(chrom, intervals)| {
intervals.iter().map(move |interval| {
(
chrom,
interval.first,
interval.last,
interval.metadata.and_then(|idx| granges.data.get(idx)),
)
})
});
GRangesFlatIterator {
inner_iter: Box::new(inner_iter),
}
}
}
impl<'a, T> Iterator for GRangesFlatIterator<'a, T> {
type Item = (&'a String, GenomeCoord, GenomeCoord, Option<&'a T>);
fn next(&mut self) -> Option<Self::Item> {
self.inner_iter.next()
}
}
// The
trait GRangesIndexer<T> {
fn get(&self, index: usize) -> Option<&T>;
}
impl<T> GRangesIndexer<T> for GenomicData<T> {
fn get(&self, index: usize) -> Option<&T> {
match self {
GenomicData::VecData(vec) => vec.get(index),
GenomicData::Array1Data(array) => {
array.get(index)
},
GenomicData::Array2Data(array) => {
array.get((index, ..))
}
}
}
}
#[derive(Clone)]
pub enum GenomicData<T> {
VecData(Vec<T>),
Array1Data(Array1<T>),
Array2Data(Array2<T>),
}
impl<T: Default> Default for GenomicData<T> {
fn default() -> Self {
GenomicData::VecData(Vec::new())
}
}
#[derive(Default)]
pub struct GRanges<T>
where T: Default {
ranges: IntervalIndexMap,
trees: Option<IntervalTreeMap>,
seqlens: IndexMap<String, GenomeCoord>,
data: GenomicData<T>,
}
// Reading from file is defined if T has a RangeLineParser trait
impl<T> GRanges<T>
where T: RangeLineParser<T> + Default {
pub fn from_file(&mut self, path: &str) {
let reader: RangeFileReader<T> = RangeFileReader::new(path);
reader.load();
}
}
impl<T> GRanges<T>
where T: Default {
pub fn push_entry(&mut self, entry: RangeEntry<T>) {
// take ownership of entry.chrom
self.push(entry.chrom, entry.start, entry.end, entry.data);
}
pub fn push(&mut self, chrom: String,
start: GenomeCoord, end: GenomeCoord,
data: Option<T>) -> () {
let data_index = match data {
Some(range_data) => {
match self.data {
GenomicData::VecData(vec) => {
// the index of the data about to be inserted
// is the current length
let idx = Some(vec.len());
vec.push(range_data);
idx
},
_ => panic!("only VecData GenomicData is appendable.")
}
},
None => None
};
// NOTE: internally we store ranges like [start, end),
// but coitrees is end-inclusive so we need
// to convert at that stage.
let range = Interval::new(start, end, data_index);
self.ranges.entry(chrom.to_string())
.or_insert(Vec::new())
.push(range);
}
pub fn flat_iter(&self) -> GRangesFlatIterator<T> {
GRangesFlatIterator::new(self)
}
pub fn calc_coitrees(&mut self) {
}
}
// trait for mapping the data struct to some sort of array
pub trait ToArray {
type Output;
fn to_array(&self) -> Self::Output;
}
impl<T> GRanges<T>
where T: ToArray + Default {
fn to_array(&self) -> T::Output {
self.flat_iter()
.map_filter(|(_, _, _, data)| data.to_array()).collect()
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment