Skip to content

Instantly share code, notes, and snippets.

@kdauria
Created July 22, 2023 22:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kdauria/420ca40aaa89cfc8156f0e1793c0ebd3 to your computer and use it in GitHub Desktop.
Save kdauria/420ca40aaa89cfc8156f0e1793c0ebd3 to your computer and use it in GitHub Desktop.
Read BAM with noodles that has a non-compliant platform tag
use std::env;
use std::fs::File;
use std::io;
use std::io::Read;
use std::path::Path;
use std::process;
use byteorder::{LittleEndian, ReadBytesExt};
use noodles::bam;
use noodles::bgzf::{self as bgzf, VirtualPosition};
use noodles::sam;
/// Read a header of a BAM file, ignoring the PL tag in any read group
///
/// This is because noodles follows SAM specification, but not everyone does
fn read_header(path: &Path) -> io::Result<(sam::Header, VirtualPosition)> {
let mut bgzf_reader = bgzf::Reader::new(File::open(path)?);
let mut magic_number = [0; 4];
bgzf_reader.read_exact(&mut magic_number)?;
const EXPECTED_MAGIC_NUMBER: &[u8; 4] = b"BAM\x01";
let is_valid_bam = &magic_number == EXPECTED_MAGIC_NUMBER;
if !is_valid_bam {
let error_message = "Invalid BAM file";
return Err(io::Error::new(io::ErrorKind::InvalidData, error_message));
}
let header_text_length = bgzf_reader.read_u32::<LittleEndian>()?;
let mut header_text_bytes = vec![0; header_text_length as usize];
bgzf_reader.read_exact(&mut header_text_bytes)?;
let header_text = String::from_utf8_lossy(&header_text_bytes).into_owned();
let header_text_without_platform_tags = remove_platform_tags(header_text.as_str());
let header = header_text_without_platform_tags
.parse()
.map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "Invalid SAM header"))?;
// Move the file cursor to the end of the binary reference sequence dictionary
let n_ref = bgzf_reader.read_u32::<LittleEndian>().unwrap() as usize;
for _ in 0..n_ref {
let byte_count = bgzf_reader.read_u32::<LittleEndian>().unwrap() as usize;
let mut discard_buffer = vec![0; byte_count];
bgzf_reader.read_exact(&mut discard_buffer).unwrap();
bgzf_reader.read_u32::<LittleEndian>().unwrap();
}
let records_position: VirtualPosition = bgzf_reader.virtual_position();
Ok((header, records_position))
}
fn remove_platform_tags(header: &str) -> String {
header
.lines()
.map(|line| {
if line.starts_with("@RG") {
let fields: Vec<&str> = line
.split('\t')
.filter(|field| !field.starts_with("PL:"))
.collect();
fields.join("\t")
} else {
line.to_string()
}
})
.collect::<Vec<String>>()
.join("\n")
}
fn main() -> io::Result<()> {
// Skip the first argument, which is the program name
let args: Vec<String> = env::args().skip(1).collect();
if args.len() != 1 {
eprintln!("Usage: <program> <bam_file_path>");
process::exit(1);
}
let path = Path::new(&args[0]);
let (header, records_position) = match read_header(&path) {
Ok((header, records_position)) => {
println!(
"Successfully read header. Records start at position: {:?}",
records_position
);
(header, records_position)
}
Err(e) => {
eprintln!("Failed to read header: {}", e);
process::exit(1);
}
};
let mut reader = bam::reader::Builder.build_from_path(path)?;
reader.seek(records_position)?;
let mut read_counter = 0;
for _ in reader.records(&header) {
read_counter += 1;
}
println!("Number of reads: {}", read_counter);
Ok(())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment