Skip to content

Instantly share code, notes, and snippets.

@bioinfornatics
Created March 1, 2012 00:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bioinfornatics/1946288 to your computer and use it in GitHub Desktop.
Save bioinfornatics/1946288 to your computer and use it in GitHub Desktop.
/**
* The bed module is a set of functions for parsing delimiter-separated text formats that resemble CSV.
* Supported formats:
* - .mat matrix files
* - .bed UCSC files
* To parse a plain .csv file, use std.csv instead.
*/
module bed;
import std.conv;
import std.stdio;
import std.csv;
import std.traits;
import std.file;
import std.array;
import std.algorithm;
import std.range;
import std.string;
import std.exception;
/**
* matrixReader
* Load a matrix from a delimited text file, one matrix row per line.
* Each line is split on separator; empty fields are dropped, so consecutive
* delimiters count as a single one, and an empty line yields an empty row.
* Params:
* T = element type; every non-empty field is converted with std.conv.to!T
* filePath = path to the file that contains the matrix
* separator = delimiter placed between columns, a tab by default
* Returns:
* A 2D array with one inner array per line of the file
* Throws:
* ErrnoException if the file cannot be opened, ConvException if a field cannot be converted to T
*/
T[][] matrixReader( T )( string filePath, string separator = "\t" ){
    File matrixFile = File( filePath, "r" );
    // An appender grows the row list geometrically, replacing manual length bookkeeping.
    auto rows = appender!(T[][])();
    foreach( line; matrixFile.byLine() ){
        // byLine reuses its buffer, so every field is converted before the next line is read.
        T[] row = line.split( separator )
                      .filter!( field => field.length != 0 )
                      .map!( to!T )
                      .array;
        rows.put( row );
    }
    return rows.data;
}
/**
* BedData3
* Record holding the columns numbered 0 to 3 in the field comments below.
* The digit in the type name is the highest column index, not the number of fields.
* NOTE(review): column meanings presumably follow the UCSC BED format — confirm.
*/
struct BedData3{
string chrom; // 0
size_t chromStart; // 1
size_t chromEnd; // 2
string name; // 3
}
/**
* BedData4
* Record holding the columns numbered 0 to 4 in the field comments below.
* The digit in the type name is the highest column index, not the number of fields.
* NOTE(review): column meanings presumably follow the UCSC BED format — confirm.
*/
struct BedData4{
string chrom; // 0
size_t chromStart; // 1
size_t chromEnd; // 2
string name; // 3
size_t score; // 4
}
/**
* BedData5
* Record holding the columns numbered 0 to 5 in the field comments below.
* The digit in the type name is the highest column index, not the number of fields.
* NOTE(review): column meanings presumably follow the UCSC BED format — confirm.
*/
struct BedData5{
string chrom; // 0
size_t chromStart; // 1
size_t chromEnd; // 2
string name; // 3
size_t score; // 4
char strand; // 5
}
/**
* BedData6
* Record holding the columns numbered 0 to 6 in the field comments below.
* The digit in the type name is the highest column index, not the number of fields.
* NOTE(review): column meanings presumably follow the UCSC BED format — confirm.
*/
struct BedData6{
string chrom; // 0
size_t chromStart; // 1
size_t chromEnd; // 2
string name; // 3
size_t score; // 4
char strand; // 5
size_t thickStart; // 6
}
/**
* BedData7
* Record holding the columns numbered 0 to 7 in the field comments below.
* The digit in the type name is the highest column index, not the number of fields.
* NOTE(review): column meanings presumably follow the UCSC BED format — confirm.
*/
struct BedData7{
string chrom; // 0
size_t chromStart; // 1
size_t chromEnd; // 2
string name; // 3
size_t score; // 4
char strand; // 5
size_t thickStart; // 6
size_t thickEnd; // 7
}
/**
* BedData8
* Record holding the columns numbered 0 to 8 in the field comments below.
* The digit in the type name is the highest column index, not the number of fields.
* itemRgb is stored as three numeric components.
* NOTE(review): column meanings presumably follow the UCSC BED format — confirm.
*/
struct BedData8{
string chrom; // 0
size_t chromStart; // 1
size_t chromEnd; // 2
string name; // 3
size_t score; // 4
char strand; // 5
size_t thickStart; // 6
size_t thickEnd; // 7
size_t[3] itemRgb; // 8
}
/**
* BedData9
* Record holding the columns numbered 0 to 9 in the field comments below.
* The digit in the type name is the highest column index, not the number of fields.
* itemRgb is stored as three numeric components.
* NOTE(review): column meanings presumably follow the UCSC BED format — confirm.
*/
struct BedData9{
string chrom; // 0
size_t chromStart; // 1
size_t chromEnd; // 2
string name; // 3
size_t score; // 4
char strand; // 5
size_t thickStart; // 6
size_t thickEnd; // 7
size_t[3] itemRgb; // 8
size_t blockCount; // 9
}
/**
* BedData10
* Record holding the columns numbered 0 to 10 in the field comments below.
* The digit in the type name is the highest column index, not the number of fields.
* itemRgb is stored as three numeric components.
* NOTE(review): column meanings presumably follow the UCSC BED format — confirm.
* NOTE(review): blockSizes is a single size_t here; if the BED column is a
* comma-separated list (as UCSC describes it), one value cannot hold it — confirm.
*/
struct BedData10{
string chrom; // 0
size_t chromStart; // 1
size_t chromEnd; // 2
string name; // 3
size_t score; // 4
char strand; // 5
size_t thickStart; // 6
size_t thickEnd; // 7
size_t[3] itemRgb; // 8
size_t blockCount; // 9
size_t blockSizes; // 10
}
/**
* BedData11
* Record holding the columns numbered 0 to 11 in the field comments below.
* The digit in the type name is the highest column index, not the number of fields.
* itemRgb is stored as three numeric components.
* NOTE(review): column meanings presumably follow the UCSC BED format — confirm.
* NOTE(review): blockSizes and blockStarts are single size_t values here; if the
* BED columns are comma-separated lists (as UCSC describes them), one value
* cannot hold them — confirm.
*/
struct BedData11{
string chrom; // 0
size_t chromStart; // 1
size_t chromEnd; // 2
string name; // 3
size_t score; // 4
char strand; // 5
size_t thickStart; // 6
size_t thickEnd; // 7
size_t[3] itemRgb; // 8
size_t blockCount; // 9
size_t blockSizes; // 10
size_t blockStarts; // 11
}
/**
* BedMetadata
* Header information of a BED file: the "browser position" and "browser hide"
* lines plus a few attributes of the "track" line.
*/
struct BedMetadata{
    string name;         // track name attribute
    string description;  // track description attribute
    size_t visibility;   // track visibility attribute; 0 is treated as "not set"
    string itemRgb;      // track itemRgb attribute
    size_t browserStart; // start coordinate of the "browser position" line
    size_t browserEnd;   // end coordinate of the "browser position" line
    string chromosome;   // chromosome of the "browser position" line
    string hide;         // argument of the "browser hide" line

    /**
    * Render the metadata as BED header lines.
    * A line is only written when the fields it needs are set:
    * - "browser position" needs chromosome, browserStart and browserEnd
    * - "browser hide" needs hide
    * - "track" carries name, description and visibility when all three are
    *   set, and itemRgb when it is set
    * String attribute values are double-quoted so that values containing
    * spaces stay in one piece, and itemRgb is written on the track line
    * itself rather than on a separate line after it.
    * Returns:
    * The header text, each line terminated by '\n'; empty when nothing is set
    */
    string toString() const {
        string result = "";
        if( chromosome != "" && browserStart != 0 && browserEnd != 0 )
            result ~= format( "browser position %s:%d-%d\n", chromosome, browserStart, browserEnd );
        if( hide != "" )
            result ~= format( "browser hide %s\n", hide );

        // Collect every track attribute first so that they all share one line.
        string[] attributes;
        if( name != "" && description != "" && visibility != 0 ){
            attributes ~= format( "name=\"%s\"", name );
            attributes ~= format( "description=\"%s\"", description );
            attributes ~= format( "visibility=%d", visibility );
        }
        if( itemRgb != "" )
            attributes ~= format( "itemRgb=\"%s\"", itemRgb );
        if( attributes.length != 0 )
            result ~= "track " ~ attributes.join( " " ) ~ "\n";
        return result;
    }
}
/**
* TrackLine
* Attributes of a "track" header line. The field names match the attribute
* keys that trackLineReader searches for in the line (name=, description=,
* type=, visibility=, color=, ...).
* NOTE(review): attribute meanings presumably follow the UCSC track line
* definition — confirm. color and colorByStrand each hold three numeric
* components.
*/
struct TrackLine{
string name;
string description;
string type;
size_t visibility;
size_t[3] color;
string itemRgb;
size_t[3] colorByStrand;
size_t useScore;
string group;
string db;
size_t offset;
size_t maxItems;
string url;
string htmlUrl;
string bigDataUrl;
}
/**
* Bed
* Container for one parsed BED file: the browser metadata, the track line
* and the list of data records. bedReader assigns metadata and trackLine
* before returning an instance.
* Params:
* T = record type stored in bedDataList
*/
struct Bed( T ){
BedMetadata metadata;
TrackLine trackLine;
T[] bedDataList;
}
/**
* Look up the value of one key=value attribute inside a track line.
* The key must sit at the start of the line or directly after a space or
* tab, so that e.g. "name=" is not matched inside another attribute name.
* A value that opens with a double quote runs to the closing quote (or to
* the end of the line if the quote is never closed); any other value runs
* to the next space or tab. No attempt is made to skip keys that appear
* inside another attribute's quoted value.
* Returns:
* A slice of line holding the value without quotes, or null when the key is absent
*/
private const(char)[] trackAttribute( const(char)[] line, string key ){
    string token = key ~ "=";
    size_t from = 0;
    while( from < line.length ){
        auto found = line[from .. $].indexOf( token );
        if( found < 0 )
            return null;
        size_t keyStart = from + found;
        size_t valueStart = keyStart + token.length;
        if( keyStart == 0 || line[keyStart - 1] == ' ' || line[keyStart - 1] == '\t' ){
            auto rest = line[valueStart .. $];
            if( rest.length != 0 && rest[0] == '"' ){
                rest = rest[1 .. $];
                auto closing = rest.indexOf( '"' );
                return closing < 0 ? rest : rest[0 .. closing];
            }
            auto stop = rest.indexOfAny( " \t" );
            return stop < 0 ? rest : rest[0 .. stop];
        }
        // Matched in the middle of a longer word: keep searching after it.
        from = valueStart;
    }
    return null;
}

/**
* Convert an "r,g,b" attribute value into three numeric components.
* When the value holds several whitespace-separated triples only the first
* one is used, because the destination fields hold a single triple.
* Throws:
* Exception if the triple does not have exactly three components,
* ConvException if a component is not a number
*/
private size_t[3] trackRgb( const(char)[] value ){
    size_t[3] rgb;
    auto triples = value.split();
    enforce( triples.length != 0, "Malformed color attribute in track line" );
    auto components = triples[0].split( "," );
    enforce( components.length == 3, "Malformed color attribute in track line" );
    foreach( i, component; components )
        rgb[i] = to!size_t( component.strip );
    return rgb;
}

/**
* Convert a visibility attribute value into its numeric form.
* The UCSC display mode names hide, dense, full, pack and squish map to
* 0, 1, 2, 3 and 4; anything else is parsed as a number.
*/
private size_t trackVisibility( const(char)[] value ){
    switch( value ){
        case "hide":   return 0;
        case "dense":  return 1;
        case "full":   return 2;
        case "pack":   return 3;
        case "squish": return 4;
        default:       return to!size_t( value );
    }
}

/**
* trackLineReader
* Parse the key=value attributes of a BED "track" line.
* Params:
* trackLine = the complete track line, e.g. track name="x" description="y" visibility=2
* Returns:
* A TrackLine whose fields are filled from the attributes present in the
* line; attributes that are absent leave the field at its default value
* Throws:
* ConvException if a numeric attribute does not hold a number,
* Exception if a color attribute is not an r,g,b triple
*/
TrackLine trackLineReader( in char[] trackLine ){
    TrackLine result;

    // Text attributes: copied with idup because the caller may reuse its line buffer.
    result.name        = trackAttribute( trackLine, "name" ).idup;
    result.description = trackAttribute( trackLine, "description" ).idup;
    result.type        = trackAttribute( trackLine, "type" ).idup;
    result.itemRgb     = trackAttribute( trackLine, "itemRgb" ).idup;
    result.group       = trackAttribute( trackLine, "group" ).idup;
    result.db          = trackAttribute( trackLine, "db" ).idup;
    result.url         = trackAttribute( trackLine, "url" ).idup;
    result.htmlUrl     = trackAttribute( trackLine, "htmlUrl" ).idup;
    result.bigDataUrl  = trackAttribute( trackLine, "bigDataUrl" ).idup;

    // Numeric attributes: only converted when present and non-empty.
    auto visibility = trackAttribute( trackLine, "visibility" );
    if( visibility.length != 0 )
        result.visibility = trackVisibility( visibility );

    auto useScore = trackAttribute( trackLine, "useScore" );
    if( useScore.length != 0 )
        result.useScore = to!size_t( useScore );

    auto offset = trackAttribute( trackLine, "offset" );
    if( offset.length != 0 )
        result.offset = to!size_t( offset );

    auto maxItems = trackAttribute( trackLine, "maxItems" );
    if( maxItems.length != 0 )
        result.maxItems = to!size_t( maxItems );

    // Color attributes.
    auto color = trackAttribute( trackLine, "color" );
    if( color.length != 0 )
        result.color = trackRgb( color );

    auto colorByStrand = trackAttribute( trackLine, "colorByStrand" );
    if( colorByStrand.length != 0 )
        result.colorByStrand = trackRgb( colorByStrand );

    return result;
}
/**
* bedReader
* Read a BED file: the optional "browser position", "browser hide" and
* "track" header lines followed by delimiter-separated data records.
* Lines that are empty or start with '#' are skipped.
* Params:
* T = record type each data line is converted to by std.csv.csvReader
* filePath = path to the BED file
* delimiter = column delimiter of the data lines, a tab by default
* Returns:
* A Bed!T holding the metadata, the track line and every data record
* Throws:
* FileException if filePath does not exist or is not a file,
* Exception if a browser line is malformed,
* ConvException if a browser coordinate is not a number,
* CSVException if a data line does not fit T
*/
auto bedReader( T = BedData3 )( in char[] filePath, char delimiter='\t' ){
    // The two-argument constructor keeps the path and the reason separate;
    // the one-argument form would treat the whole message as a file name.
    if( !filePath.exists )
        throw new FileException( filePath, "File does not exist" );
    else if( !filePath.isFile )
        throw new FileException( filePath, "Path is not a file" );

    File bedFile = File( filePath.idup, "r" );
    Bed!T bedInstance;
    enum browserPositionToken = "browser position";
    enum browserHideToken     = "browser hide";
    enum trackToken           = "track ";

    foreach( char[] line; bedFile.byLine() ){
        if( line.length == 0 || line[0] == '#' ) // empty line or comment
            continue;

        if( line.startsWith( browserPositionToken ) ){
            // Expected form: browser position <chromosome>:<start>-<end>
            auto fields = line[browserPositionToken.length .. $].split();
            enforce( fields.length != 0, "Malformed metadata line" );
            auto position = fields[0];
            auto colonIndex = position.indexOf( ':' );
            enforce( colonIndex > 0, "Malformed metadata line" );
            auto coordinates = position[colonIndex + 1 .. $];
            auto minusIndex = coordinates.indexOf( '-' );
            enforce( minusIndex > 0 && minusIndex + 1 < coordinates.length, "Malformed metadata line" );
            bedInstance.metadata.chromosome   = position[0 .. colonIndex].idup;
            bedInstance.metadata.browserStart = to!size_t( coordinates[0 .. minusIndex] );
            bedInstance.metadata.browserEnd   = to!size_t( coordinates[minusIndex + 1 .. $] );
        }
        else if( line.startsWith( browserHideToken ) ){
            enforce( line.length > browserHideToken.length + 1, "Malformed metadata line" );
            bedInstance.metadata.hide = line[browserHideToken.length + 1 .. $].idup;
        }
        else if( line.startsWith( trackToken ) ){
            bedInstance.trackLine = trackLineReader( line );
        }
        else{ // data in csv format
            // byLine reuses its buffer; parse an immutable copy so that no
            // record can end up pointing into memory that is overwritten later.
            foreach( record; csvReader!T( line.idup, delimiter ) )
                bedInstance.bedDataList ~= record;
        }
    }
    return bedInstance;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment