@zoldello
Created October 8, 2020 17:26
Approach 1: Begins at line 298
import { DataLoader, BufferedDataLoader, DataMissingError, FileFormatError } from "../loader/DataLoader";
import { BinaryParser } from "../util/BinaryParser";
import { loadHeaderData, HeaderData, FileType } from "./BigWigHeaderReader";
import { loadSequenceRecord, loadSequence, SequenceRecord, streamSequence } from "./TwoBitHeaderReader";
import { inflate } from "pako";
import { Readable } from "stream";
export interface BigWigData {
chr: string,
start: number,
end: number,
value: number
}
export interface BigBedData {
chr: string,
start: number,
end: number,
name?: string,
score?: number,
strand?: string,
cdStart?: number,
cdEnd?: number,
color?: string,
exons?: Array<BigBedExon>
}
export interface BigBedDataNarrowPeak {
chr: string,
start: number,
end: number,
name?: string,
score?: number,
// + or - or . for unknown
strand?: string,
// Measurement of average enrichment for the region
signalValue?: number,
// Statistical significance of signal value (-log10). Set to -1 if not used
pValue?: number,
// Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used
qValue?: number,
// Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called
peak?: number,
}
export interface BigBedDataBroadPeak {
chr: string,
start: number,
end: number,
name?: string,
score?: number,
// + or - or . for unknown
strand?: string,
// Measurement of average enrichment for the region
signalValue?: number,
// Statistical significance of signal value (-log10). Set to -1 if not used
pValue?: number,
// Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used
qValue?: number,
}
export interface BigBedDataRNAElement {
chr: string,
start: number,
end: number,
name?: string,
score?: number,
// + or - or . for unknown
strand?: string,
// Expression level such as RPKM or FPKM. Set to -1 for no data
level?: number,
// Statistical significance such as IDR. Set to -1 for no data
signif?: number,
// Additional measurement/count e.g. number of reads. Set to 0 for no data
score2?: number,
}
export interface BigBedDataMethyl {
chr: string,
start: number,
end: number,
name?: string,
score?: number,
strand?: string, // + or - or . for unknown
// Start of where display should be thick (start codon)
thickStart?: number,
// End of where display should be thick (stop codon)
thickEnd?: number,
// Color value R,G,B
reserved?: number,
// Number of reads or coverage
readCount?: number,
// Percentage of reads that show methylation at this position in the genome
percentMeth?: number
}
export interface BigBedDataTssPeak {
chr: string,
start: number,
end: number,
name?: string,
score?: number,
// + or - or . for unknown
strand?: string,
// Count of reads mapping to this peak
count?: number,
// Gene identifier
gene_id?: string,
// Gene name
gene_name?: string,
// TSS identifier
tss_id?: string,
// base by base read coverage of the peak
peak_cov?: string,
}
export interface BigBedDataIdrPeak {
chr: string,
start: number,
end: number,
name?: string,
score?: number,
// + or - or . for unknown
strand?: string,
// Local IDR value
localIDR?: number,
// Global IDR value
globalIDR?: number,
// Start position in chromosome of replicate 1 peak
rep1_chromStart?: number,
// End position in chromosome of replicate 1 peak
rep1_chromEnd?: number,
// Count (used for ranking) replicate 1
rep1_count?: number,
// Start position in chromosome of replicate 2 peak
rep2_chromStart?: number,
// End position in chromosome of replicate 2 peak
rep2_chromEnd?: number,
// Count (used for ranking) replicate 2
rep2_count?: number,
}
export interface BigBedDataIdrRankedPeak {
chr: string,
start: number,
end: number,
name?: string,
score?: number,
strand?: string, // + or - or . for unknown
signalValue?: number, // Measurement of enrichment for the region for merged peaks
pValue?: number, // p-value of merged peak
qValue?: number, // q-value of merged peak
summit?: number, // Summit of merged peak
localIDR?: number, // Local IDR value, which is -log10(local IDR value)
globalIDR?: number, // Global IDR value, which is -log10(global IDR value)
chromStart1?: number, // Start position in chromosome of peak 1
chromEnd1?: number, // End position in chromosome of peak 1
signalValue1?: number, // Signal measure from peak 1
summit1?: number, // Summit of peak 1
chromStart2?: number, // Start position in chromosome of peak 2
chromEnd2?: number, // End position in chromosome of peak 2
signalValue2?: number, // Signal measure from peak 2
summit2?: number, // Summit of peak 2
}
export interface BigBedExon {
start: number,
end: number
}
export interface BigZoomData {
chr: string,
start: number,
end: number,
validCount: number,
minVal: number,
maxVal: number,
sumData: number,
sumSquares: number
}
interface RPLeafNode {
startChrom: number;
startBase: number;
endChrom: number;
endBase: number;
dataOffset: number;
dataSize: number;
}
type ParseFunction<T> = (chrom: string, start: number, end: number, rest: string) => T;
const IDX_MAGIC = 0x2468ACE0;
const RPTREE_HEADER_SIZE = 48;
const RPTREE_NODE_LEAF_ITEM_SIZE = 32;
const RPTREE_NODE_CHILD_ITEM_SIZE = 24;
const DEFAULT_BUFFER_SIZE = 512000;
/**
* Main class for dealing with reading BigWig and BigBed files.
*/
export class BigWigReader {
private cachedHeader?: HeaderData;
private cachedSequenceRecords: { [name: string]: SequenceRecord } = {};
/**
* @param dataLoader Provided class that deals with fetching data from the file via http, local file, ftp, etc...
* @param bufferSize Size of the buffer used for fetching data. Used to optimistically read more data than is
* needed for each read of the tree that stores data to avoid round trips. The trade-off is potentially reading
* more data than you need to vs making more round trips.
*/
constructor(private dataLoader: DataLoader, private bufferSize: number = DEFAULT_BUFFER_SIZE) { }
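// Usage sketch (assumes some DataLoader implementation is available; HttpDataLoader and the URL
// below are placeholders, not part of this file):
//
//   const reader = new BigWigReader(new HttpDataLoader("https://example.com/data/signal.bigWig"));
//   const header = await reader.getHeader();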
/**
* Gets the type of the underlying file.
*/
async fileType(): Promise<FileType> {
let header: HeaderData = await this.getHeader();
return header.fileType;
}
/**
* Method for getting all header data for dataLoader's file. Data is loaded on demand and cached for subsequent requests.
*/
async getHeader(): Promise<HeaderData> {
if (!this.cachedHeader) {
this.cachedHeader = await loadHeaderData(this.dataLoader);
}
return this.cachedHeader;
}
/**
* Method for getting a sequence record from a 2bit sequence file. This method is not valid for bigWig or bigBed files.
*
* @param chrom the name of the chromosome or other sequence to retrieve.
*/
async getSequenceRecord(chrom: string): Promise<SequenceRecord> {
let header: HeaderData = await this.getHeader();
if (header.fileType !== FileType.TwoBit) throw new FileFormatError("getSequenceRecord is not valid on " + header.fileType + " files.");
if (!this.cachedSequenceRecords[chrom]) {
this.cachedSequenceRecords[chrom] = await loadSequenceRecord(this.dataLoader, header, chrom);
}
return this.cachedSequenceRecords[chrom];
}
/**
* Method for reading unzoomed wig data from BigWig files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async readBigWigData(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Array<BigWigData>> {
return this.readData<BigWigData>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeWigData);
}
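// Example sketch, assuming `reader` wraps a BigWig file as in the constructor sketch above:
//
//   const wig: BigWigData[] = await reader.readBigWigData("chr14", 19_485_000, "chr14", 20_000_100);
//   wig.forEach(d => console.log(`${d.chr}:${d.start}-${d.end} = ${d.value}`));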
/**
* Method for streaming unzoomed wig data from BigWig files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async streamBigWigData(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Readable> {
return this.streamData<BigWigData>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeWigData);
}
/**
* Method for reading unzoomed bed data from BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async readBigBedData(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Array<BigBedData>> {
return this.readData<BigBedData>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBedData);
}
//////////////////////////////////////
/**
* Approach 1: generic method for reading unzoomed bed data from BigBed files using a
* caller-supplied decode function (defaults to the standard BED decoder).
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
* @param decodeFunction Decodes each leaf node's raw data into records of type T.
*/
async readBigBedData_approach1<T>(startChrom: string, startBase: number, endChrom: string,
endBase: number, decodeFunction: DecodeFunction<T> = decodeBedData as DecodeFunction<any>): Promise<Array<T>> {
return this.readData<T>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeFunction);
}
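// Example sketch for Approach 1: one generic method covers every BigBed flavor by passing the
// matching decode function. Here `reader` is assumed to wrap a narrowPeak BigBed file:
//
//   const peaks = await reader.readBigBedData_approach1<BigBedDataNarrowPeak>(
//       "chr1", 0, "chr1", 1_000_000, decodeBigBedDataNarrowPeak);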
/////////////////////////////////////
/**
* Method for reading unzoomed bed data from narrow peak BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async readBigBedDataNarrowPeak(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Array<BigBedDataNarrowPeak>> {
return this.readData<BigBedDataNarrowPeak>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataNarrowPeak);
}
/**
* Method for reading unzoomed bed data from broad peak BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async readBigBedDataBroadPeak(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Array<BigBedDataBroadPeak>> {
return this.readData<BigBedDataBroadPeak>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataBroadPeak);
}
/**
* Method for reading unzoomed bed data from RNA element BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async readBigBedDataRNAElement(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Array<BigBedDataRNAElement>> {
return this.readData<BigBedDataRNAElement>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataRNAElement);
}
/**
* Method for reading unzoomed bed data from methyl BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async readBigBedDataMethyl(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Array<BigBedDataMethyl>> {
return this.readData<BigBedDataMethyl>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataMethyl);
}
/**
* Method for reading unzoomed bed data from tss peak BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async readBigBedDataTssPeak(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Array<BigBedDataTssPeak>> {
return this.readData<BigBedDataTssPeak>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataTssPeak);
}
/**
* Method for reading unzoomed bed data from idr peak BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async readBigBedDataIdrPeak(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Array<BigBedDataIdrPeak>> {
return this.readData<BigBedDataIdrPeak>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataIdrPeak);
}
/**
* Method for reading unzoomed bed data from idr ranked peak BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async readBigBedDataIdrRankedPeak(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Array<BigBedDataIdrRankedPeak>> {
return this.readData<BigBedDataIdrRankedPeak>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataIdrRankedPeak);
}
/**
* Method for streaming unzoomed bed data from BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async streamBigBedData(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Readable> {
return this.streamData<BigBedData>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBedData);
}
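// Example sketch: the returned Readable is in object mode, so each "data" chunk is one decoded record:
//
//   const stream = await reader.streamBigBedData("chr21", 10_000_000, "chr21", 20_000_000);
//   stream.on("data", (record: BigBedData) => console.log(record.chr, record.start, record.end));
//   stream.on("end", () => console.log("done"));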
/**
* Method for streaming unzoomed bed data from broad peak BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async streamBigBedDataBroadPeak(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Readable> {
return this.streamData<BigBedDataBroadPeak>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataBroadPeak);
}
/**
* Method for streaming unzoomed bed data from narrow peak BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async streamBigBedDataNarrowPeak(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Readable> {
return this.streamData<BigBedDataNarrowPeak>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataNarrowPeak);
}
/**
* Method for streaming unzoomed bed data from RNA element BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async streamBigBedDataRNAElement(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Readable> {
return this.streamData<BigBedDataRNAElement>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataRNAElement);
}
/**
* Method for streaming unzoomed bed data from methyl BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async streamBigBedDataMethyl(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Readable> {
return this.streamData<BigBedDataMethyl>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataMethyl);
}
/**
* Method for streaming unzoomed bed data from tss peak BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async streamBigBedDataTssPeak(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Readable> {
return this.streamData<BigBedDataTssPeak>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataTssPeak);
}
/**
* Method for streaming unzoomed bed data from idr peak BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async streamBigBedDataIdrPeak(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Readable> {
return this.streamData<BigBedDataIdrPeak>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataIdrPeak);
}
/**
* Method for streaming unzoomed bed data from idr ranked peak BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
*/
async streamBigBedDataIdrRankedPeak(startChrom: string, startBase: number, endChrom: string,
endBase: number): Promise<Readable> {
return this.streamData<BigBedDataIdrRankedPeak>(startChrom, startBase, endChrom, endBase,
(await this.getHeader()).common!.fullIndexOffset, decodeBigBedDataIdrRankedPeak);
}
/**
* Method for reading Two Bit sequence data from TwoBit files.
*
* @param chrom the chromosome from which to read.
* @param startBase the starting base.
* @param endBase the ending base.
*/
async readTwoBitData(chrom: string, startBase: number, endBase: number): Promise<string> {
const sequence: SequenceRecord = await this.getSequenceRecord(chrom);
return loadSequence(this.dataLoader, this.cachedHeader!, sequence, startBase, endBase);
}
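// Example sketch, assuming `twoBitReader` is a BigWigReader constructed over a .2bit file:
//
//   const sequence: string = await twoBitReader.readTwoBitData("chr1", 1_000_000, 1_000_100);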
/**
* Method for streaming Two Bit sequence data from TwoBit files.
*
* @param chrom the chromosome from which to read.
* @param startBase the starting base.
* @param endBase the ending base.
* @param chunkSize the number of bases to push per chunk of the returned stream.
*/
async streamTwoBitData(chrom: string, startBase: number, endBase: number, chunkSize: number = 1024): Promise<Readable> {
const sequence: SequenceRecord = await this.getSequenceRecord(chrom);
return streamSequence(this.dataLoader, this.cachedHeader!, sequence, startBase, endBase, chunkSize);
}
/**
* Method for reading zoomed data from BigWig and BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
* @param zoomLevelIndex index of the zoom level. You can call getHeader() for a list of these values under HeaderData.zoomLevelHeaders.
*/
async readZoomData(startChrom: string, startBase: number, endChrom: string, endBase: number,
zoomLevelIndex: number): Promise<Array<BigZoomData>> {
const header = await this.getHeader();
if (undefined == header.zoomLevelHeaders || !(zoomLevelIndex in header.zoomLevelHeaders)) {
throw new FileFormatError("Given zoomLevelIndex not found in zoom level headers.");
}
const treeOffset = header.zoomLevelHeaders[zoomLevelIndex].indexOffset;
return this.readData<BigZoomData>(startChrom, startBase, endChrom, endBase,
treeOffset, decodeZoomData);
}
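// Example sketch: pick a zoom level from the header (per the docs above, each ZoomLevelHeader
// carries an `index`), then read its summarized data:
//
//   const zoomHeaders = (await reader.getHeader()).zoomLevelHeaders!;
//   const firstLevel = Object.values(zoomHeaders)[0];
//   const zoom = await reader.readZoomData("chr2", 0, "chr2", 10_000_000, firstLevel.index);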
/**
* Method for streaming zoomed data from BigWig and BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
* @param zoomLevelIndex index of the zoom level. You can call getHeader() for a list of these values under HeaderData.zoomLevelHeaders.
*/
async streamZoomData(startChrom: string, startBase: number, endChrom: string, endBase: number,
zoomLevelIndex: number): Promise<Readable> {
const header = await this.getHeader();
if (undefined == header.zoomLevelHeaders || !(zoomLevelIndex in header.zoomLevelHeaders)) {
throw new FileFormatError("Given zoomLevelIndex not found in zoom level headers.");
}
const treeOffset = header.zoomLevelHeaders[zoomLevelIndex].indexOffset;
return this.streamData<BigZoomData>(startChrom, startBase, endChrom, endBase,
treeOffset, decodeZoomData);
}
/**
* Method containing all the shared functionality for reading BigWig and BigBed files.
*
* @param startChrom Starting chromosome
* @param startBase Starting base pair
* @param endChrom Ending chromosome
* @param endBase Ending base pair
* @param treeOffset Location of the R+ tree that stores the data we're interested in.
* @param streamMode Whether the underlying buffered loader should operate in streaming mode.
* @param decodeFunction Function used to decode each leaf node's raw data.
* @param loadFunction Function used to collect the decoded records (e.g. into an array or a stream).
*/
private async loadData<T>(startChrom: string, startBase: number, endChrom: string, endBase: number,
treeOffset: number, streamMode: boolean, decodeFunction: DecodeFunction<T>,
loadFunction: LoadFunction<T>): Promise<void> {
const header = await this.getHeader();
if (undefined == header.chromTree) {
throw new FileFormatError("No chromosome tree found in file header.");
}
const startChromIndex: number = header.chromTree.chromToId[startChrom];
const endChromIndex: number = header.chromTree.chromToId[endChrom];
if (undefined == startChromIndex) {
throw new DataMissingError(startChrom);
}
if (undefined == endChromIndex) {
throw new DataMissingError(endChrom);
}
// Load all leaf nodes within given chr / base bounds for the R+ tree used for actually storing the data.
const bufferedLoader = new BufferedDataLoader(this.dataLoader, this.bufferSize, streamMode);
const magic = new BinaryParser(await bufferedLoader.load(treeOffset, RPTREE_HEADER_SIZE)).getUInt();
if (IDX_MAGIC !== magic) {
throw new FileFormatError(`R+ tree not found at offset ${treeOffset}`);
}
const rootNodeOffset = treeOffset + RPTREE_HEADER_SIZE;
const leafNodes: Array<RPLeafNode> = await loadLeafNodesForRPNode(bufferedLoader, header.littleEndian, rootNodeOffset,
startChromIndex, startBase, endChromIndex, endBase);
// Iterate through filtered leaf nodes, load the data, and decode it
for (const leafNode of leafNodes) {
let leafData = new Uint8Array(await bufferedLoader.load(leafNode.dataOffset, leafNode.dataSize));
if (header.common!.uncompressBuffSize > 0) {
leafData = inflate(leafData);
}
let leafDecodedData = decodeFunction(leafData.buffer as ArrayBuffer, startChromIndex, startBase, endChromIndex,
endBase, header.chromTree.idToChrom);
loadFunction(leafDecodedData);
}
}
private async readData<T>(startChrom: string, startBase: number, endChrom: string, endBase: number,
treeOffset: number, decodeFunction: DecodeFunction<T>): Promise<Array<T>> {
const data: Array<T> = [];
const load: LoadFunction<T> = (d: T[]) => data.push(...d);
await this.loadData(startChrom, startBase, endChrom, endBase, treeOffset, false, decodeFunction, load);
return data;
};
private async streamData<T>(startChrom: string, startBase: number, endChrom: string, endBase: number,
treeOffset: number, decodeFunction: DecodeFunction<T>): Promise<Readable> {
const stream = new Readable({ objectMode: true, read() {} });
const load: LoadFunction<T> = (d: T[]) => {
d.forEach((el) => stream.push(el));
};
await this.loadData(startChrom, startBase, endChrom, endBase, treeOffset, true, decodeFunction, load);
stream.push(null);
return stream;
}
}
/**
* Recursively load a list of R+ tree leaf nodes for the given node (by file offset) within given chr / base bounds.
*
* @param bufferedLoader Buffered data loader used to load the node data.
* @param littleEndian Whether the file is little-endian.
* @param rpNodeOffset Offset for the start of the R+ tree node
* @param startChromIndex starting chromosome index used for filtering
* @param startBase starting base used for filtering
* @param endChromIndex ending chromosome index used for filtering
* @param endBase ending base used for filtering
* @returns List of simple representations of leaf nodes for the given node offset.
*/
async function loadLeafNodesForRPNode(bufferedLoader: BufferedDataLoader, littleEndian: boolean, rpNodeOffset: number, startChromIndex: number,
startBase: number, endChromIndex: number, endBase: number): Promise<Array<RPLeafNode>> {
const nodeHeaderData: ArrayBuffer = await bufferedLoader.load(rpNodeOffset, 4);
const nodeHeaderParser = new BinaryParser(nodeHeaderData, littleEndian);
const isLeaf = 1 === nodeHeaderParser.getByte();
nodeHeaderParser.position++; // Skip reserved space
const count = nodeHeaderParser.getUShort();
const nodeDataOffset = rpNodeOffset + 4;
const bytesRequired = count * (isLeaf ? RPTREE_NODE_LEAF_ITEM_SIZE : RPTREE_NODE_CHILD_ITEM_SIZE);
const nodeData: ArrayBuffer = await bufferedLoader.load(nodeDataOffset, bytesRequired);
let leafNodes: Array<RPLeafNode> = [];
const nodeDataParser = new BinaryParser(nodeData, littleEndian);
for (let i = 0; i < count; i++) {
const nodeStartChr = nodeDataParser.getInt();
const nodeStartBase = nodeDataParser.getInt();
const nodeEndChr = nodeDataParser.getInt();
const nodeEndBase = nodeDataParser.getInt();
// If this node overlaps with the chr / base range provided
const overlaps: boolean = ((endChromIndex > nodeStartChr) || (endChromIndex == nodeStartChr && endBase >= nodeStartBase)) &&
((startChromIndex < nodeEndChr) || (startChromIndex == nodeEndChr && startBase <= nodeEndBase));
if (isLeaf) {
const leafNode: RPLeafNode = {
startChrom: nodeStartChr,
startBase: nodeStartBase,
endChrom: nodeEndChr,
endBase: nodeEndBase,
dataOffset: nodeDataParser.getLong(),
dataSize: nodeDataParser.getLong()
};
if (overlaps) {
leafNodes.push(leafNode);
}
} else {
const childOffset = nodeDataParser.getLong();
if (overlaps) {
leafNodes.push(... await loadLeafNodesForRPNode(bufferedLoader, littleEndian, childOffset, startChromIndex, startBase, endChromIndex, endBase));
}
}
}
return leafNodes;
}
type DecodeFunction<T> = (data: ArrayBuffer, startChromIndex: number, startBase: number, endChromIndex: number,
endBase: number, chromDict: Array<string>) => Array<T>;
type LoadFunction<T> = (data: Array<T>) => void;
/**
* Extract useful data from sections of raw big binary bed data
*
* @param data Raw bed data
* @param filterStartChromIndex starting chromosome index used for filtering
* @param filterStartBase starting base used for filtering
* @param filterEndChromIndex ending chromosome index used for filtering
* @param filterEndBase ending base used for filtering
* @param chromDict dictionary of indices used by the file to chromosome names, conveniently stored as an array.
*/
export function decodeBedData(data: ArrayBuffer, filterStartChromIndex: number, filterStartBase: number, filterEndChromIndex: number,
filterEndBase: number, chromDict: Array<string>): Array<BigBedData> {
const decodedData: Array<BigBedData> = [];
const binaryParser = new BinaryParser(data);
const minSize = 3 * 4 + 1; // Minimum # of bytes required for a bed record
while (binaryParser.remLength() >= minSize) {
const chromIndex = binaryParser.getInt();
const chrom = chromDict[chromIndex];
const startBase = binaryParser.getInt();
const endBase = binaryParser.getInt();
const rest = binaryParser.getString();
if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && endBase < filterStartBase)) {
continue;
} else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && startBase >= filterEndBase)) {
break;
}
const entry: BigBedData = {
chr: chrom,
start: startBase,
end: endBase,
}
let tokens = rest.split("\t");
if (tokens.length > 0) {
entry.name = tokens[0];
}
if (tokens.length > 1) {
entry.score = parseFloat(tokens[1]);
}
if (tokens.length > 2) {
entry.strand = tokens[2];
}
if (tokens.length > 3) {
entry.cdStart = parseInt(tokens[3]);
}
if (tokens.length > 4) {
entry.cdEnd = parseInt(tokens[4]);
}
if (tokens.length > 5 && tokens[5] !== "." && tokens[5] !== "0") {
let color: string;
if (tokens[5].includes(",")) {
color = tokens[5].startsWith("rgb") ? tokens[5] : "rgb(" + tokens[5] + ")";
} else {
color = tokens[5];
}
entry.color = color;
}
if (tokens.length > 8) {
const exonCount = parseInt(tokens[6]);
const exonSizes = tokens[7].split(',');
const exonStarts = tokens[8].split(',');
const exons: Array<BigBedExon> = [];
for (let i = 0; i < exonCount; i++) {
const eStart = startBase + parseInt(exonStarts[i]);
const eEnd = eStart + parseInt(exonSizes[i]);
exons.push({ start: eStart, end: eEnd });
}
entry.exons = exons;
}
decodedData.push(entry);
}
return decodedData;
}
/**
* Extract useful data from sections of raw big binary bed data
*
* @param data Raw bed data
* @param filterStartChromIndex starting chromosome index used for filtering
* @param filterStartBase starting base used for filtering
* @param filterEndChromIndex ending chromosome index used for filtering
* @param filterEndBase ending base used for filtering
* @param chromDict dictionary of indices used by the file to chromosome names, conveniently stored as an array.
*/
function decodeBigBedDataNarrowPeak(data: ArrayBuffer, filterStartChromIndex: number, filterStartBase: number, filterEndChromIndex: number,
filterEndBase: number, chromDict: Array<string>): Array<BigBedDataNarrowPeak> {
const decodedData: Array<BigBedDataNarrowPeak> = [];
const binaryParser = new BinaryParser(data);
const minSize = 3 * 4 + 1; // Minimum # of bytes required for a bed record
while (binaryParser.remLength() >= minSize) {
const chromIndex = binaryParser.getInt();
const chrom = chromDict[chromIndex];
const startBase = binaryParser.getInt();
const endBase = binaryParser.getInt();
const rest = binaryParser.getString();
if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && endBase < filterStartBase)) {
continue;
} else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && startBase >= filterEndBase)) {
break;
}
const entry: BigBedDataNarrowPeak = {
chr: chrom,
start: startBase,
end: endBase
}
let tokens = rest.split("\t");
if (tokens.length > 0) {
entry.name = tokens[0];
}
if (tokens.length > 1) {
entry.score = parseFloat(tokens[1]);
}
if (tokens.length > 2) {
entry.strand = tokens[2];
}
if (tokens.length > 3) {
entry.signalValue = parseFloat(tokens[3]);
}
if (tokens.length > 4) {
entry.pValue = parseFloat(tokens[4]);
}
if (tokens.length > 5) {
entry.qValue = parseFloat(tokens[5]);
}
if (tokens.length > 6) {
entry.peak = parseInt(tokens[6]);
}
decodedData.push(entry);
}
return decodedData;
}
/**
* Extract useful data from sections of raw big binary bed data
*
* @param data Raw bed data
* @param filterStartChromIndex starting chromosome index used for filtering
* @param filterStartBase starting base used for filtering
* @param filterEndChromIndex ending chromosome index used for filtering
* @param filterEndBase ending base used for filtering
* @param chromDict dictionary of indices used by the file to chromosome names, conveniently stored as an array.
*/
function decodeBigBedDataBroadPeak(data: ArrayBuffer, filterStartChromIndex: number, filterStartBase: number, filterEndChromIndex: number,
filterEndBase: number, chromDict: Array<string>): Array<BigBedDataBroadPeak> {
const decodedData: Array<BigBedDataBroadPeak> = [];
const binaryParser = new BinaryParser(data);
const minSize = 3 * 4 + 1; // Minimum # of bytes required for a bed record
while (binaryParser.remLength() >= minSize) {
const chromIndex = binaryParser.getInt();
const chrom = chromDict[chromIndex];
const startBase = binaryParser.getInt();
const endBase = binaryParser.getInt();
const rest = binaryParser.getString();
if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && endBase < filterStartBase)) {
continue;
} else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && startBase >= filterEndBase)) {
break;
}
const entry: BigBedDataBroadPeak = {
chr: chrom,
start: startBase,
end: endBase
}
let tokens = rest.split("\t");
if (tokens.length > 0) {
entry.name = tokens[0];
}
if (tokens.length > 1) {
entry.score = parseFloat(tokens[1]);
}
if (tokens.length > 2) {
entry.strand = tokens[2];
}
if (tokens.length > 3) {
entry.signalValue = parseFloat(tokens[3]);
}
if (tokens.length > 4) {
entry.pValue = parseFloat(tokens[4]);
}
if (tokens.length > 5) {
entry.qValue = parseFloat(tokens[5]);
}
decodedData.push(entry);
}
return decodedData;
}
/**
* Extract useful data from sections of raw big binary bed data
*
* @param data Raw bed data
* @param filterStartChromIndex starting chromosome index used for filtering
* @param filterStartBase starting base used for filtering
* @param filterEndChromIndex ending chromosome index used for filtering
* @param filterEndBase ending base used for filtering
* @param chromDict dictionary of indices used by the file to chromosome names, conveniently stored as an array.
*/
function decodeBigBedDataRNAElement(data: ArrayBuffer, filterStartChromIndex: number, filterStartBase: number, filterEndChromIndex: number,
filterEndBase: number, chromDict: Array<string>): Array<BigBedDataRNAElement> {
const decodedData: Array<BigBedDataRNAElement> = [];
const binaryParser = new BinaryParser(data);
const minSize = 3 * 4 + 1; // Minimum # of bytes required for a bed record
while (binaryParser.remLength() >= minSize) {
const chromIndex = binaryParser.getInt();
const chrom = chromDict[chromIndex];
const startBase = binaryParser.getInt();
const endBase = binaryParser.getInt();
const rest = binaryParser.getString();
if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && endBase < filterStartBase)) {
continue;
} else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && startBase >= filterEndBase)) {
break;
}
const entry: BigBedDataRNAElement = {
chr: chrom,
start: startBase,
end: endBase
}
let tokens = rest.split("\t");
if (tokens.length > 0) {
entry.name = tokens[0];
}
if (tokens.length > 1) {
entry.score = parseFloat(tokens[1]);
}
if (tokens.length > 2) {
entry.strand = tokens[2];
}
if (tokens.length > 3) {
entry.level = parseFloat(tokens[3]);
}
if (tokens.length > 4) {
entry.signif = parseFloat(tokens[4]);
}
if (tokens.length > 5) {
entry.score2 = parseFloat(tokens[5]);
}
decodedData.push(entry);
}
return decodedData;
}
/**
* Extract useful data from sections of raw big binary bed data
*
* @param data Raw bed data
* @param filterStartChromIndex starting chromosome index used for filtering
* @param filterStartBase starting base used for filtering
* @param filterEndChromIndex ending chromosome index used for filtering
* @param filterEndBase ending base used for filtering
* @param chromDict dictionary of indices used by the file to chromosome names, conveniently stored as an array.
*/
function decodeBigBedDataMethyl(data: ArrayBuffer, filterStartChromIndex: number, filterStartBase: number, filterEndChromIndex: number,
filterEndBase: number, chromDict: Array<string>): Array<BigBedDataMethyl> {
const decodedData: Array<BigBedDataMethyl> = [];
const binaryParser = new BinaryParser(data);
const minSize = 3 * 4 + 1; // Minimum # of bytes required for a bed record
while (binaryParser.remLength() >= minSize) {
const chromIndex = binaryParser.getInt();
const chrom = chromDict[chromIndex];
const startBase = binaryParser.getInt();
const endBase = binaryParser.getInt();
const rest = binaryParser.getString();
if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && endBase < filterStartBase)) {
continue;
} else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && startBase >= filterEndBase)) {
break;
}
const entry: BigBedDataMethyl = {
chr: chrom,
start: startBase,
end: endBase
}
let tokens = rest.split("\t");
if (tokens.length > 0) {
entry.name = tokens[0];
}
if (tokens.length > 1) {
entry.score = parseInt(tokens[1]);
}
if (tokens.length > 2) {
entry.strand = tokens[2];
}
if (tokens.length > 3) {
entry.thickStart = parseInt(tokens[3]);
}
if (tokens.length > 4) {
entry.thickEnd = parseInt(tokens[4]);
}
if (tokens.length > 5) {
entry.reserved = parseInt(tokens[5]);
}
if (tokens.length > 6) {
entry.readCount = parseInt(tokens[6]);
}
if (tokens.length > 7) {
entry.percentMeth = parseInt(tokens[7]);
}
decodedData.push(entry);
}
return decodedData;
}
/**
* Extract useful data from sections of raw big binary bed data
*
* @param data Raw bed data
* @param filterStartChromIndex starting chromosome index used for filtering
* @param filterStartBase starting base used for filtering
* @param filterEndChromIndex ending chromosome index used for filtering
* @param filterEndBase ending base used for filtering
* @param chromDict dictionary of indices used by the file to chromosome names, conveniently stored as an array.
*/
function decodeBigBedDataTssPeak(data: ArrayBuffer, filterStartChromIndex: number, filterStartBase: number, filterEndChromIndex: number,
filterEndBase: number, chromDict: Array<string>): Array<BigBedDataTssPeak> {
const decodedData: Array<BigBedDataTssPeak> = [];
const binaryParser = new BinaryParser(data);
const minSize = 3 * 4 + 1; // Minimum # of bytes required for a bed record
while (binaryParser.remLength() >= minSize) {
const chromIndex = binaryParser.getInt();
const chrom = chromDict[chromIndex];
const startBase = binaryParser.getInt();
const endBase = binaryParser.getInt();
const rest = binaryParser.getString();
if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && endBase < filterStartBase)) {
continue;
} else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && startBase >= filterEndBase)) {
break;
}
const entry: BigBedDataTssPeak = {
chr: chrom,
start: startBase,
end: endBase
}
let tokens = rest.split("\t");
if (tokens.length > 0) {
entry.name = tokens[0];
}
if (tokens.length > 1) {
entry.score = parseFloat(tokens[1]);
}
if (tokens.length > 2) {
entry.strand = tokens[2];
}
if (tokens.length > 3) {
entry.count = parseFloat(tokens[3]);
}
if (tokens.length > 4) {
entry.gene_id = tokens[4];
}
if (tokens.length > 5) {
entry.gene_name = tokens[5];
}
if (tokens.length > 6) {
entry.tss_id = tokens[6];
}
if (tokens.length > 7) {
entry.peak_cov = tokens[7];
}
decodedData.push(entry);
}
return decodedData;
}
/**
* Extract useful data from sections of raw big binary bed data
*
* @param data Raw bed data
* @param filterStartChromIndex starting chromosome index used for filtering
* @param filterStartBase starting base used for filtering
* @param filterEndChromIndex ending chromosome index used for filtering
* @param filterEndBase ending base used for filtering
* @param chromDict dictionary of indices used by the file to chromosome names, conveniently stored as an array.
*/
function decodeBigBedDataIdrPeak(data: ArrayBuffer, filterStartChromIndex: number, filterStartBase: number, filterEndChromIndex: number,
filterEndBase: number, chromDict: Array<string>): Array<BigBedDataIdrPeak> {
const decodedData: Array<BigBedDataIdrPeak> = [];
const binaryParser = new BinaryParser(data);
const minSize = 3 * 4 + 1; // Minimum # of bytes required for a bed record
while (binaryParser.remLength() >= minSize) {
const chromIndex = binaryParser.getInt();
const chrom = chromDict[chromIndex];
const startBase = binaryParser.getInt();
const endBase = binaryParser.getInt();
const rest = binaryParser.getString();
if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && endBase < filterStartBase)) {
continue;
} else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && startBase >= filterEndBase)) {
break;
}
const entry: BigBedDataIdrPeak = {
chr: chrom,
start: startBase,
end: endBase
}
let tokens = rest.split("\t");
if (tokens.length > 0) {
entry.name = tokens[0];
}
if (tokens.length > 1) {
entry.score = parseInt(tokens[1]);
}
if (tokens.length > 2) {
entry.strand = tokens[2];
}
if (tokens.length > 3) {
entry.localIDR = parseFloat(tokens[3]);
}
if (tokens.length > 4) {
entry.globalIDR = parseFloat(tokens[4]);
}
if (tokens.length > 5) {
entry.rep1_chromStart = parseInt(tokens[5]);
}
if (tokens.length > 6) {
entry.rep1_chromEnd = parseInt(tokens[6]);
}
if (tokens.length > 7) {
entry.rep1_count = parseFloat(tokens[7]);
}
if (tokens.length > 8) {
entry.rep2_chromStart = parseInt(tokens[8]);
}
if (tokens.length > 9) {
entry.rep2_chromEnd = parseInt(tokens[9]);
}
if (tokens.length > 10) {
entry.rep2_count = parseFloat(tokens[10]);
}
decodedData.push(entry);
}
return decodedData;
}
/**
* Extract useful data from sections of raw big binary bed data
*
* @param data Raw bed data
* @param filterStartChromIndex starting chromosome index used for filtering
* @param filterStartBase starting base used for filtering
* @param filterEndChromIndex ending chromosome index used for filtering
* @param filterEndBase ending base used for filtering
* @param chromDict dictionary of indices used by the file to chromosome names, conveniently stored as an array.
*/
function decodeBigBedDataIdrRankedPeak(data: ArrayBuffer, filterStartChromIndex: number, filterStartBase: number, filterEndChromIndex: number,
filterEndBase: number, chromDict: Array<string>): Array<BigBedDataIdrRankedPeak> {
const decodedData: Array<BigBedDataIdrRankedPeak> = [];
const binaryParser = new BinaryParser(data);
const minSize = 3 * 4 + 1; // Minimum # of bytes required for a bed record
while (binaryParser.remLength() >= minSize) {
const chromIndex = binaryParser.getInt();
const chrom = chromDict[chromIndex];
const startBase = binaryParser.getInt();
const endBase = binaryParser.getInt();
const rest = binaryParser.getString();
if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && endBase < filterStartBase)) {
continue;
} else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && startBase >= filterEndBase)) {
break;
}
const entry: BigBedDataIdrRankedPeak = {
chr: chrom,
start: startBase,
end: endBase
}
let tokens = rest.split("\t");
if (tokens.length > 0) {
entry.name = tokens[0];
}
if (tokens.length > 1) {
entry.score = parseInt(tokens[1]);
}
if (tokens.length > 2) {
entry.strand = tokens[2];
}
if (tokens.length > 3) {
entry.signalValue = parseFloat(tokens[3]);
}
if (tokens.length > 4) {
entry.pValue = parseFloat(tokens[4]);
}
if (tokens.length > 5) {
entry.qValue = parseFloat(tokens[5]);
}
if (tokens.length > 6) {
entry.summit = parseInt(tokens[6]);
}
if (tokens.length > 7) {
entry.localIDR = parseFloat(tokens[7]);
}
if (tokens.length > 8) {
entry.globalIDR = parseFloat(tokens[8]);
}
if (tokens.length > 9) {
entry.chromStart1 = parseInt(tokens[9]);
}
if (tokens.length > 10) {
entry.chromEnd1 = parseInt(tokens[10]);
}
if (tokens.length > 11) {
entry.signalValue1 = parseFloat(tokens[11]);
}
if (tokens.length > 12) {
entry.summit1 = parseInt(tokens[12]);
}
if (tokens.length > 13) {
entry.chromStart2 = parseInt(tokens[13]);
}
if (tokens.length > 14) {
entry.chromEnd2 = parseInt(tokens[14]);
}
if (tokens.length > 15) {
entry.signalValue2 = parseFloat(tokens[15]);
}
if (tokens.length > 16) {
entry.summit2 = parseInt(tokens[16]);
}
decodedData.push(entry);
}
return decodedData;
}
/**
* Extract useful data from sections of raw big binary unzoomed wig data
*
* @param data Raw unzoomed wig data
* @param filterStartChromIndex starting chromosome index used for filtering
* @param filterStartBase starting base used for filtering
* @param filterEndChromIndex ending chromosome index used for filtering
* @param filterEndBase ending base used for filtering
* @param chromDict dictionary of indices used by the file to chromosome names, conveniently stored as an array.
*/
function decodeWigData(data: ArrayBuffer, filterStartChromIndex: number, filterStartBase: number, filterEndChromIndex: number,
filterEndBase: number, chromDict: Array<string>): Array<BigWigData> {
const decodedData: Array<BigWigData> = [];
const binaryParser = new BinaryParser(data);
const chromIndex = binaryParser.getInt();
const chrom = chromDict[chromIndex];
let startBase = binaryParser.getInt();
let endBase = binaryParser.getInt();
const itemStep = binaryParser.getInt();
const itemSpan = binaryParser.getInt();
const type = binaryParser.getByte();
const reserved = binaryParser.getByte();
let itemCount = binaryParser.getUShort();
if (chromIndex < filterStartChromIndex || chromIndex > filterEndChromIndex) {
return decodedData;
}
while (itemCount-- > 0) {
let value: number;
if (1 === type) {
// Data is stored in Bed Graph format
startBase = binaryParser.getInt();
endBase = binaryParser.getInt();
value = binaryParser.getFloat();
} else if (2 === type) {
// Data is stored in Variable Step format
startBase = binaryParser.getInt();
value = binaryParser.getFloat();
endBase = startBase + itemSpan;
} else {
// Data is stored in Fixed Step format.
value = binaryParser.getFloat();
endBase = startBase + itemSpan;
}
if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && startBase >= filterEndBase)) {
break; // past the end of the range; exit
} else if (!(chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && endBase < filterStartBase))) {
decodedData.push({
chr: chrom,
start: startBase,
end: endBase,
value: value
}); // this is within the range (i.e. not before the first requested base); add this datapoint
}
if (1 !== type && 2 !== type) {
// data is stored in Fixed Step format
// only increment the start base once the last entry has been pushed
startBase += itemStep;
}
}
return decodedData;
}
/**
* Extract useful data from sections of raw big binary zoom data
*
* @param data Raw zoomed wig data
* @param filterStartChromIndex starting chromosome index used for filtering
* @param filterStartBase starting base used for filtering
* @param filterEndChromIndex ending chromosome index used for filtering
* @param filterEndBase ending base used for filtering
* @param chromDict dictionary of indices used by the file to chromosome names, conveniently stored as an array.
*/
function decodeZoomData(data: ArrayBuffer, filterStartChromIndex: number, filterStartBase: number, filterEndChromIndex: number,
filterEndBase: number, chromDict: Array<string>): Array<BigZoomData> {
const decodedData: Array<BigZoomData> = [];
const binaryParser = new BinaryParser(data);
const minSize = 8 * 4; // Minimum # of bytes required for a zoom record
while (binaryParser.remLength() >= minSize) {
const chromIndex = binaryParser.getInt();
const decodedZoomData: BigZoomData = {
chr: chromDict[chromIndex],
start: binaryParser.getInt(),
end: binaryParser.getInt(),
validCount: binaryParser.getInt(),
minVal: binaryParser.getFloat(),
maxVal: binaryParser.getFloat(),
sumData: binaryParser.getFloat(),
sumSquares: binaryParser.getFloat()
};
if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && decodedZoomData.end < filterStartBase)) {
continue;
} else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && decodedZoomData.start >= filterEndBase)) {
break;
}
decodedData.push(decodedZoomData);
}
return decodedData;
}