Skip to content

Instantly share code, notes, and snippets.

@jimhester
Created January 23, 2013 20:55
Show Gist options
  • Save jimhester/4613075 to your computer and use it in GitHub Desktop.
Save jimhester/4613075 to your computer and use it in GitHub Desktop.
Reading gff/gtf files with Rcpp
#include <fstream>
#include <string>
#include <Rcpp.h>
#include <map>
#include <set>
using namespace Rcpp;
using namespace std;
vector<string> inline split_string(const string &source, const char *delimiter = " ", bool keep_empty = false) {
vector<string> results;
size_t prev = 0;
size_t next = 0;
while ((next = source.find_first_of(delimiter, prev)) != string::npos) {
if (keep_empty || (next - prev != 0)) {
results.push_back(source.substr(prev, next - prev));
}
prev = next + 1;
}
if (prev < source.size()) {
results.push_back(source.substr(prev));
}
return results;
}
const uint FIELD_SIZE=8;
const string FIELDS[FIELD_SIZE] = {"seqid", "source", "type", "start", "end", "score", "strand", "phase"};
// [[Rcpp::export]]
Rcpp::DataFrame gff2df(std::string file, const char *attribute_sep = "=") {
CharacterVector records;
vector< vector < string > > column_strings(FIELD_SIZE);
vector< map< string, string > > attribute_list;
set< string > attribute_types;
ifstream in(file.c_str());
string rec;
int count=0;
while(getline(in,rec)){
if(rec.at(0) != '#'){ //not a comment line
vector< string > strings = split_string(rec,"\t");
for(uint i = 0;i<strings.size()-1;++i){
column_strings[i].push_back(strings.at(i));
}
vector< string > attribute_strings = split_string(strings.at(strings.size()-1), ";");
map< string, string> line_attributes;
for(uint i = 0;i< attribute_strings.size();++i){
vector< string > pair = split_string(attribute_strings.at(i), attribute_sep);
line_attributes[pair.at(0)] = pair.at(1);
if(attribute_types.find(pair.at(0)) == attribute_types.end()){
attribute_types.insert(pair.at(0));
}
}
attribute_list.push_back(line_attributes);
}
else if(rec.find("##FASTA") != string::npos){
break;
}
count++;
}
Rcpp::DataFrame result;
for(uint i = 0;i<FIELD_SIZE;++i){
result[FIELDS[i]]=column_strings.at(i);
}
for(set< string >::const_iterator it = attribute_types.begin(); it != attribute_types.end(); ++it){
vector< string > column_data;
for(vector< map<string, string > >::const_iterator vec_it = attribute_list.begin();vec_it != attribute_list.end();++vec_it){
if(vec_it->count(*it) == 1){
column_data.push_back(vec_it->at(*it));
}
else{
column_data.push_back("NA");
}
}
result[*it]=column_data;
}
return(result);
}
require(Rcpp)
require(GenomicRanges)
sourceCpp('read_gff.cpp')
read_gff = function(file, grouping='feature_id', attribute_sep='=') {
data = data.frame(lapply(gff2df(file, attribute_sep), all_numeric, what="vector"))
if(!grouping %in% colnames(data)){
stop(paste(grouping, "does not exist in", file, ", valid columns are", paste(colnames(data), collapse=" ")))
}
stopifnot(grouping %in% colnames(data))
lapply(split(data, factor(data$seqid)), function(df) {
split(GRanges( ranges=IRanges( start=df$start, end=df$end),
seqnames=df$seqid,
strand=df$strand,
mcols = df[, !colnames(df) %in% c("seqid", "strand", "start", "end") ]
), df[,grouping], drop=TRUE)
})
}
#from Hmisc library
all_numeric = function (x, what = c("test", "vector"), extras = c(".", "NA")) {
what <- match.arg(what)
old <- options(warn = -1)
on.exit(options(old))
x <- sub("[[:space:]]+$", "", x)
x <- sub("^[[:space:]]+", "", x)
xs <- x[! x %in% c("", extras)]
isnum <- !any(is.na(as.numeric(xs)))
if (what == "test"){
isnum
}
else if (isnum){
as.numeric(x)
}
else {
x
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment