Created
March 1, 2012 00:50
-
-
Save bioinfornatics/1946288 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * The bed module is a set of functions for parsing delimiter-separated file formats.
 * Supported formats:
 * - .mat matrix files
 * - .bed UCSC files
 * To parse a .csv file, use std.csv.
 */
module bed; | |
import std.conv; | |
import std.stdio; | |
import std.csv; | |
import std.traits; | |
import std.file; | |
import std.array; | |
import std.algorithm; | |
import std.range; | |
import std.string; | |
import std.exception; | |
/**
 * matrixReader
 * Reads a delimited text file into a two-dimensional array.
 * Every line of the file becomes one row: the line is split on separator,
 * empty fields are discarded and each remaining field is converted to T.
 * Params:
 *     filePath  = path of the file that contains the matrix
 *     separator = delimiter placed between columns in the file (tab by default)
 * Returns:
 *     A 2D array holding one row per line of the file
 */
T[][] matrixReader( T )( string filePath, string separator = "\t" ){
    enum size_t growthStep = 10;              // rows are allocated in batches of this size
    auto source = File( filePath, "r" );
    auto rows   = new T[][]( growthStep );
    size_t used = 0;

    foreach( rawLine; source.byLine() ){
        if( used == rows.length )
            rows.length += growthStep;
        // split yields an empty field between consecutive delimiters; drop those
        auto fields   = rawLine.split( separator );
        auto nonEmpty = filter!"!a.empty"( fields );
        rows[used++]  = array( map!(to!T)( nonEmpty ) );
    }

    rows.length = used;                       // trim the slots that were never filled
    return rows;
}
/// One BED record made of columns 0 to 3; fields are declared in column order.
struct BedData3
{
    string chrom;                 /// column 0
    size_t chromStart, chromEnd;  /// columns 1 and 2
    string name;                  /// column 3
}
/// One BED record made of columns 0 to 4; fields are declared in column order.
struct BedData4
{
    string chrom;                 /// column 0
    size_t chromStart, chromEnd;  /// columns 1 and 2
    string name;                  /// column 3
    size_t score;                 /// column 4
}
/// One BED record made of columns 0 to 5; fields are declared in column order.
struct BedData5
{
    string chrom;                 /// column 0
    size_t chromStart, chromEnd;  /// columns 1 and 2
    string name;                  /// column 3
    size_t score;                 /// column 4
    char   strand;                /// column 5
}
/// One BED record made of columns 0 to 6; fields are declared in column order.
struct BedData6
{
    string chrom;                 /// column 0
    size_t chromStart, chromEnd;  /// columns 1 and 2
    string name;                  /// column 3
    size_t score;                 /// column 4
    char   strand;                /// column 5
    size_t thickStart;            /// column 6
}
/// One BED record made of columns 0 to 7; fields are declared in column order.
struct BedData7
{
    string chrom;                 /// column 0
    size_t chromStart, chromEnd;  /// columns 1 and 2
    string name;                  /// column 3
    size_t score;                 /// column 4
    char   strand;                /// column 5
    size_t thickStart, thickEnd;  /// columns 6 and 7
}
/// One BED record made of columns 0 to 8; fields are declared in column order.
struct BedData8
{
    string    chrom;                 /// column 0
    size_t    chromStart, chromEnd;  /// columns 1 and 2
    string    name;                  /// column 3
    size_t    score;                 /// column 4
    char      strand;                /// column 5
    size_t    thickStart, thickEnd;  /// columns 6 and 7
    size_t[3] itemRgb;               /// column 8, stored as three numeric components
}
/// One BED record made of columns 0 to 9; fields are declared in column order.
struct BedData9
{
    string    chrom;                 /// column 0
    size_t    chromStart, chromEnd;  /// columns 1 and 2
    string    name;                  /// column 3
    size_t    score;                 /// column 4
    char      strand;                /// column 5
    size_t    thickStart, thickEnd;  /// columns 6 and 7
    size_t[3] itemRgb;               /// column 8, stored as three numeric components
    size_t    blockCount;            /// column 9
}
/**
 * One BED record made of columns 0 to 10; fields are declared in column order.
 *
 * NOTE(review): blockSizes is a single size_t here, whereas the UCSC BED
 * description lists it as a comma-separated list - confirm this is intended.
 */
struct BedData10
{
    string    chrom;                 /// column 0
    size_t    chromStart, chromEnd;  /// columns 1 and 2
    string    name;                  /// column 3
    size_t    score;                 /// column 4
    char      strand;                /// column 5
    size_t    thickStart, thickEnd;  /// columns 6 and 7
    size_t[3] itemRgb;               /// column 8, stored as three numeric components
    size_t    blockCount;            /// column 9
    size_t    blockSizes;            /// column 10
}
/**
 * One BED record made of columns 0 to 11; fields are declared in column order.
 *
 * NOTE(review): blockSizes and blockStarts are single size_t values here,
 * whereas the UCSC BED description lists both as comma-separated lists -
 * confirm this is intended.
 */
struct BedData11
{
    string    chrom;                    /// column 0
    size_t    chromStart, chromEnd;     /// columns 1 and 2
    string    name;                     /// column 3
    size_t    score;                    /// column 4
    char      strand;                   /// column 5
    size_t    thickStart, thickEnd;     /// columns 6 and 7
    size_t[3] itemRgb;                  /// column 8, stored as three numeric components
    size_t    blockCount;               /// column 9
    size_t    blockSizes, blockStarts;  /// columns 10 and 11
}
/// Browser and track header values of a BED file, with a textual rendering.
struct BedMetadata
{
    string name, description;
    size_t visibility;
    string itemRgb;
    size_t browserStart, browserEnd;
    string chromosome, hide;

    /**
     * Renders the stored values as header text.
     *
     * Up to four pieces are emitted, in this order:
     * - a "browser position" line, only when chromosome is set and both
     *   browserStart and browserEnd are non-zero;
     * - a "browser hide" line, only when hide is set;
     * - a "track" line, only when name and description are set and
     *   visibility is non-zero;
     * - an itemRgb="..." fragment (no trailing newline), only when itemRgb is set.
     *
     * Returns:
     *     The concatenated text, or an empty string when nothing qualifies
     */
    string toString(){
        immutable bool positionKnown = !( chromosome == "" || browserStart == 0 || browserEnd == 0 );
        immutable bool trackKnown    = !( name == "" || description == "" || visibility == 0 );

        string rendered = "";
        if( positionKnown )
            rendered ~= format( "browser position %s:%d-%d\n", chromosome, browserStart, browserEnd );
        if( hide != "" )
            rendered ~= format( "browser hide %s\n", hide );
        if( trackKnown )
            rendered ~= format( "track name=%s description=%s visibility=%d\n", name, description, visibility );
        if( itemRgb != "" )
            rendered ~= format( "itemRgb=\"%s\"", itemRgb );
        return rendered;
    }
}
/// Attributes that can appear on the "track" line of a BED file; one field per attribute.
struct TrackLine
{
    string    name, description, type;
    size_t    visibility;
    size_t[3] color;                    /// three numeric components
    string    itemRgb;
    size_t[3] colorByStrand;            /// three numeric components
    size_t    useScore;
    string    group, db;
    size_t    offset, maxItems;
    string    url, htmlUrl, bigDataUrl;
}
/// A parsed BED file: its header metadata, its track line and its list of records.
struct Bed( Record )
{
    BedMetadata metadata;     /// values taken from the "browser" lines
    TrackLine   trackLine;    /// values taken from the "track" line
    Record[]    bedDataList;  /// the data records
}
/**
 * Parses a "track" line into a TrackLine.
 *
 * The line is read as a sequence of whitespace-separated key=value attributes.
 * A value is either enclosed in double quotes (it may then contain spaces) or
 * a bare word that ends at the next whitespace. Words without '=' (such as the
 * leading "track") and attributes with an unknown key are skipped.
 *
 * visibility accepts a number or one of the UCSC display-mode words
 * (hide=0, dense=1, full=2, pack=3, squish=4). color and colorByStrand are
 * read as "r,g,b"; TrackLine stores a single triple per field, so when the
 * value holds several space-separated triples only the first one is kept.
 *
 * Params:
 *     trackLine = text of the track line
 * Returns:
 *     A TrackLine with a field set for every recognised attribute found;
 *     every other field keeps its default value
 * Throws:
 *     Exception when a quoted value is not closed or a colour is not "r,g,b";
 *     ConvException when a numeric attribute cannot be converted
 */
TrackLine trackLineReader( in char[] trackLine ){
    // Characters that separate attributes.
    static bool isBlank( char c ){
        return c == ' ' || c == '\t' || c == '\r' || c == '\n';
    }

    // Converts "r,g,b" (first triple only) to its three numeric components.
    static size_t[3] toRgb( const(char)[] text ){
        auto triples = text.split();
        enforce( triples.length > 0, "Malformed track line: empty colour" );
        auto parts = triples[0].split( "," );
        enforce( parts.length == 3, "Malformed track line: colour must be r,g,b" );
        size_t[3] rgb;
        foreach( i, part; parts )
            rgb[i] = to!size_t( part.strip() );
        return rgb;
    }

    // Accepts either the numeric display mode or its UCSC name.
    static size_t toVisibility( const(char)[] text ){
        switch( text ){
            case "hide":   return 0;
            case "dense":  return 1;
            case "full":   return 2;
            case "pack":   return 3;
            case "squish": return 4;
            default:       return to!size_t( text );
        }
    }

    TrackLine result;
    immutable size_t len = trackLine.length;
    size_t pos = 0;

    while( pos < len ){
        // skip the blanks that precede the next word
        while( pos < len && isBlank( trackLine[pos] ) )
            pos++;
        if( pos >= len )
            break;

        // read the key: everything up to '=' or the next blank
        immutable size_t keyStart = pos;
        while( pos < len && trackLine[pos] != '=' && !isBlank( trackLine[pos] ) )
            pos++;
        if( pos >= len || trackLine[pos] != '=' )
            continue;                           // bare word without a value, e.g. "track"
        const(char)[] key = trackLine[keyStart .. pos];
        pos++;                                  // step over '='

        // read the value, quoted or bare
        const(char)[] value;
        if( pos < len && trackLine[pos] == '"' ){
            immutable size_t valueStart = ++pos;
            while( pos < len && trackLine[pos] != '"' )
                pos++;
            enforce( pos < len, "Malformed track line: unterminated quote" );
            value = trackLine[valueStart .. pos];
            pos++;                              // step over the closing quote
        }
        else{
            immutable size_t valueStart = pos;
            while( pos < len && !isBlank( trackLine[pos] ) )
                pos++;
            value = trackLine[valueStart .. pos];
        }

        // idup: the caller may reuse the buffer behind trackLine
        switch( key ){
            case "name":          result.name          = value.idup;           break;
            case "description":   result.description   = value.idup;           break;
            case "type":          result.type          = value.idup;           break;
            case "visibility":    result.visibility    = toVisibility( value ); break;
            case "color":         result.color         = toRgb( value );       break;
            case "itemRgb":       result.itemRgb       = value.idup;           break;
            case "colorByStrand": result.colorByStrand = toRgb( value );       break;
            case "useScore":      result.useScore      = to!size_t( value );   break;
            case "group":         result.group         = value.idup;           break;
            case "db":            result.db            = value.idup;           break;
            case "offset":        result.offset        = to!size_t( value );   break;
            case "maxItems":      result.maxItems      = to!size_t( value );   break;
            case "url":           result.url           = value.idup;           break;
            case "htmlUrl":       result.htmlUrl       = value.idup;           break;
            case "bigDataUrl":    result.bigDataUrl    = value.idup;           break;
            default:                                                            break; // unknown attribute
        }
    }
    return result;
}
/**
 * Reads a BED file.
 *
 * Lines are handled as follows:
 * - empty lines and lines starting with '#' are skipped;
 * - "browser position chrom:start-end" fills metadata.chromosome,
 *   metadata.browserStart and metadata.browserEnd;
 * - "browser hide ..." stores the text after the keyword in metadata.hide;
 * - a line starting with "track " is handed to trackLineReader;
 * - any other line is parsed with csvReader into one or more T records that
 *   are appended to bedDataList.
 * A trailing '\r' left by byLine is removed before a line is examined.
 *
 * Params:
 *     T         = record type built from each data line (BedData3 by default)
 *     filePath  = path of the BED file
 *     delimiter = column delimiter of the data lines (tab by default)
 * Returns:
 *     A Bed!T holding the metadata, the track line and the records
 * Throws:
 *     FileException when filePath does not exist or is not a file;
 *     Exception when a "browser" line is malformed;
 *     ConvException / CSVException when a value cannot be converted
 */
auto bedReader( T = BedData3 )( in char[] filePath, char delimiter='\t' ){
    // two-argument form: (file name, message)
    if( !filePath.exists )
        throw new FileException( filePath, "file does not exist" );
    else if( !filePath.isFile )
        throw new FileException( filePath, "is not a file" );

    File bedFile = File( to!string(filePath), "r" );
    Bed!T bedInstance;
    const string browserToken1 = "browser position";
    const string browserToken2 = "browser hide";
    const string trackToken    = "track ";

    foreach( char[] line; bedFile.byLine() ){
        // byLine strips '\n' only
        char[] row = line;
        if( row.length > 0 && row[$ - 1] == '\r' )
            row = row[0 .. $ - 1];

        if( row.empty || row.startsWith( '#' ) )      // empty line or comment
            continue;
        else if( row.startsWith( browserToken1 ) ){
            // first word after the keyword is "chrom:start-end"
            auto words = row[browserToken1.length .. $].split();
            enforce( words.length > 0, "Malformed metadata line" );
            auto position = words[0];

            // indexOf gives code-unit offsets, which is what slicing needs
            immutable ptrdiff_t colonIndex = position.indexOf( ':' );
            enforce( colonIndex > 0, "Malformed metadata line" );
            auto span = position[colonIndex + 1 .. $];
            immutable ptrdiff_t minusIndex = span.indexOf( '-' );
            enforce( minusIndex > 0 && cast(size_t) minusIndex + 1 < span.length,
                     "Malformed metadata line" );

            bedInstance.metadata.chromosome   = position[0 .. colonIndex].idup;
            bedInstance.metadata.browserStart = to!size_t( span[0 .. minusIndex] );
            bedInstance.metadata.browserEnd   = to!size_t( span[minusIndex + 1 .. $] );
        }
        else if( row.startsWith( browserToken2 ) ){
            if( row.length > browserToken2.length + 1 )
                bedInstance.metadata.hide = row[browserToken2.length + 1 .. $].idup;
            else
                throw new Exception("Malformed metadata line");
        }
        else if( row.startsWith( trackToken ) ){
            bedInstance.trackLine = trackLineReader( row );
        }
        else{ // data in csv format
            // byLine reuses its buffer and csvReader is lazy: parse a private
            // copy and materialise every record before the next iteration
            foreach( record; csvReader!T( row.idup, delimiter ) )
                bedInstance.bedDataList ~= record;
        }
    }
    return bedInstance;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment