Skip to content

Instantly share code, notes, and snippets.

@brentp
Created February 10, 2009 05:19
Show Gist options
  • Save brentp/61251 to your computer and use it in GitHub Desktop.
Save brentp/61251 to your computer and use it in GitHub Desktop.
diff --git a/src/extended/gff3_out_stream.c b/src/extended/gff3_out_stream.c
index 6dbbc23..f813e8f 100644
--- a/src/extended/gff3_out_stream.c
+++ b/src/extended/gff3_out_stream.c
@@ -18,6 +18,7 @@
#include "extended/gff3_out_stream.h"
#include "extended/gff3_visitor.h"
#include "extended/node_stream_rep.h"
+#include "core/cstr_table.h"
struct GtGFF3OutStream {
const GtNodeStream parent_instance;
@@ -76,3 +77,11 @@ void gt_gff3_out_stream_set_fasta_width(GtNodeStream *gs,
gt_assert(gff3_out_stream);
gt_gff3_visitor_set_fasta_width(gff3_out_stream->gff3_visitor, fasta_width);
}
+
+/* Tell the GFF3 visitor behind <gs> to retain the ID attributes. */
+void gt_gff3_out_stream_retain_id_attributes(GtNodeStream *gs)
+{
+  GtGFF3OutStream *out_stream = gff3_out_stream_cast(gs);
+  gt_assert(out_stream);
+  gt_gff3_visitor_retain_id_attributes(out_stream->gff3_visitor);
+}
diff --git a/src/extended/gff3_out_stream.h b/src/extended/gff3_out_stream.h
index adc3eba..e004c5d 100644
--- a/src/extended/gff3_out_stream.h
+++ b/src/extended/gff3_out_stream.h
@@ -28,5 +28,7 @@ const GtNodeStreamClass* gt_gff3_out_stream_class(void);
GtNodeStream* gt_gff3_out_stream_new(GtNodeStream*, GtGenFile*);
void gt_gff3_out_stream_set_fasta_width(GtNodeStream*,
unsigned long);
+void gt_gff3_out_stream_retain_id_attributes(
+ GtNodeStream *);
#endif
diff --git a/src/extended/gff3_visitor.c b/src/extended/gff3_visitor.c
index 8246680..02432f7 100644
--- a/src/extended/gff3_visitor.c
+++ b/src/extended/gff3_visitor.c
@@ -23,21 +23,26 @@
#include "core/ma.h"
#include "core/unused_api.h"
#include "core/string_distri.h"
+#include "core/cstr_table.h"
+#include "core/str_api.h"
#include "extended/genome_node.h"
#include "extended/gff3_output.h"
#include "extended/gff3_parser.h"
#include "extended/gff3_visitor.h"
#include "extended/node_visitor_rep.h"
+#include <stdbool.h>
struct GtGFF3Visitor {
const GtNodeVisitor parent_instance;
bool version_string_shown,
+ retain_ids, /* if set, store_ids() de-duplicates IDs via gt_used_ids */
fasta_directive_shown;
GtStringDistri *id_counter;
GtHashmap *gt_feature_node_to_id_array,
*gt_feature_node_to_unique_id_str;
unsigned long fasta_width;
GtGenFile *outfp;
+ GtCstrTable *gt_used_ids; /* IDs already recorded by make_id_unique() */
};
typedef struct {
@@ -190,6 +195,30 @@ static GtStr* create_unique_id(GtGFF3Visitor *gff3_visitor, GtFeatureNode *gf)
return id;
}
+/* Append ".<suffix>" to <id> in place, e.g. "name" becomes "name.1". */
+static void make_unique_id_string(GtStr *id, unsigned long suffix)
+{
+  gt_str_append_char(id, '.');
+  gt_str_append_ulong(id, suffix);
+}
+
+/* Ensure <id> is not yet in the table of used IDs, rewriting it in place to
+   <original>.1, <original>.2, ... until it is free; then record it there. */
+static void make_id_unique(GtGFF3Visitor *gff3_visitor, GtStr *id)
+{
+  GtCstrTable *used_ids = gff3_visitor->gt_used_ids;
+  /* private copy: gt_str_get(id) points into a buffer the loop rewrites */
+  GtStr *base_id = gt_str_new_cstr(gt_str_get(id));
+  unsigned long i = 0;
+  while (gt_cstr_table_get(used_ids, gt_str_get(id))) {
+    /* TODO: add warning */
+    gt_str_set(id, gt_str_get(base_id)); /* restart from the original ID */
+    make_unique_id_string(id, ++i);
+  }
+  gt_cstr_table_add(used_ids, gt_str_get(id));
+  gt_str_delete(base_id);
+}
+
static int store_ids(GtGenomeNode *gn, void *data, GtError *err)
{
GtGFF3Visitor *gff3_visitor = (GtGFF3Visitor*) data;
@@ -197,32 +226,48 @@ static int store_ids(GtGenomeNode *gn, void *data, GtError *err)
AddIDInfo add_id_info;
int had_err = 0;
GtStr *id;
+ const char *id_string;
+ bool has_id = false;
gt_error_check(err);
gt_assert(gn && gf && gff3_visitor);
+ id_string = gt_feature_node_get_attribute(gf, "ID");
- if (gt_genome_node_has_children(gn) || gt_feature_node_is_multi(gf)) {
- if (gt_feature_node_is_multi(gf)) {
- id = gt_hashmap_get(gff3_visitor->gt_feature_node_to_unique_id_str,
- gt_feature_node_get_multi_representative(gf));
- if (!id) { /* the representative does not have its own id */
- id = create_unique_id(gff3_visitor,
- gt_feature_node_get_multi_representative(gf));
- }
+ if (id_string) {
+ id = gt_str_new_cstr(id_string);
+ has_id = true;
+ }
+ else {
+ /* no ID, but it is a multi-feature or has children. can this happen? */
+ if (gt_feature_node_is_multi(gf) || gt_genome_node_has_children(gn) ) {
if (gt_feature_node_get_multi_representative(gf) != gf) {
- gt_hashmap_add(gff3_visitor->gt_feature_node_to_unique_id_str, gf,
- gt_str_ref(id));
+ id = gt_hashmap_get(gff3_visitor->gt_feature_node_to_unique_id_str,
+ gt_feature_node_get_multi_representative(gf));
+ has_id = true;
}
}
- else
- id = create_unique_id(gff3_visitor, gf);
+ /* it doesn't have an ID and it is not a child feature */
+ else if ( ! gt_feature_node_get_attribute(gf, "Parent")) {
+ id = create_unique_id(gff3_visitor, gf);
+ has_id = true;
+ }
+
+ }
+ if (has_id) {
+ if (gff3_visitor->retain_ids) {
+ make_id_unique(gff3_visitor, id);
+ }
+ gt_hashmap_add(gff3_visitor->gt_feature_node_to_unique_id_str, gf,
+ gt_str_ref(id));
/* for each child -> store the parent feature in the hash map */
add_id_info.gt_feature_node_to_id_array =
- gff3_visitor->gt_feature_node_to_id_array,
+ gff3_visitor->gt_feature_node_to_id_array,
add_id_info.id = gt_str_get(id);
had_err = gt_genome_node_traverse_direct_children(gn, &add_id_info, add_id,
err);
+ /* Q: needed? */
+ gt_str_delete(id);
}
return had_err;
}
@@ -262,8 +307,9 @@ static int gff3_visitor_feature_node(GtNodeVisitor *gv, GtFeatureNode *fn,
/* show terminator, if the feature has children (otherwise it is clear that
the feature is complete, because no ID attribute has been shown) */
- if (gt_genome_node_has_children((GtGenomeNode*) fn))
- gt_genfile_xprintf(gff3_visitor->outfp, "%s\n", GFF_TERMINATOR);
+ /* if (gt_genome_node_has_children((GtGenomeNode*) fn)) */
+ gt_genfile_xprintf(gff3_visitor->outfp, "%s\n", GFF_TERMINATOR);
+ /* printf("%s\n", gt_feature_node_get_attribute(fn, "ID")); */
return had_err;
}
@@ -329,9 +375,18 @@ GtNodeVisitor* gt_gff3_visitor_new(GtGenFile *outfp)
HASH_DIRECT, NULL, (GtFree) gt_str_delete);
gff3_visitor->fasta_width = 0;
gff3_visitor->outfp = outfp;
+ /* if retain_ids is set to true, then gt_used_ids is used */
+ gff3_visitor->gt_used_ids = gt_cstr_table_new();
+ gff3_visitor->retain_ids = false;
return gv;
}
+void gt_gff3_visitor_retain_id_attributes(GtNodeVisitor *gv)
+{
+  GtGFF3Visitor *visitor = gff3_visitor_cast(gv);
+  visitor->retain_ids = true; /* store_ids() then de-duplicates IDs */
+}
+
void gt_gff3_visitor_set_fasta_width(GtNodeVisitor *gv,
unsigned long fasta_width)
{
diff --git a/src/extended/gff3_visitor.h b/src/extended/gff3_visitor.h
index 1939565..530371c 100644
--- a/src/extended/gff3_visitor.h
+++ b/src/extended/gff3_visitor.h
@@ -27,5 +27,6 @@ const GtNodeVisitorClass* gt_gff3_visitor_class(void);
GtNodeVisitor* gt_gff3_visitor_new(GtGenFile*);
void gt_gff3_visitor_set_fasta_width(GtNodeVisitor*,
unsigned long);
+void gt_gff3_visitor_retain_id_attributes(GtNodeVisitor *);
#endif
diff --git a/src/tools/gt_gff3.c b/src/tools/gt_gff3.c
index 74a1457..f498678 100644
--- a/src/tools/gt_gff3.c
+++ b/src/tools/gt_gff3.c
@@ -35,6 +35,7 @@
typedef struct {
bool sort,
checkids,
+ retainids,
mergefeat,
addintrons,
verbose,
@@ -91,6 +92,14 @@ static GtOptionParser* gt_gff3_option_parser_new(void *tool_arguments)
"parsing", &arguments->tidy, false);
gt_option_parser_add_option(op, option);
+ /* -retainids: flag is forwarded to the GFF3 output stream by the runner */
+ option = gt_option_new_bool("retainids",
+ "when available, use the original IDs provided "
+ "in the source file\n"
+ "(memory consumption is O(file_size))",
+ &arguments->retainids, true);
+ gt_option_parser_add_option(op, option);
+
/* -checkids */
option = gt_option_new_bool("checkids",
"make sure the ID attributes are unique "
@@ -188,6 +197,7 @@ static int gt_gff3_runner(int argc, const char **argv, int parsed_args,
gt_gff3_in_stream_show_progress_bar((GtGFF3InStream*) gff3_in_stream);
if (arguments->checkids)
gt_gff3_in_stream_check_id_attributes((GtGFF3InStream*) gff3_in_stream);
+
last_stream = gff3_in_stream;
/* set different type checker if necessary */
@@ -244,6 +254,9 @@ static int gt_gff3_runner(int argc, const char **argv, int parsed_args,
gt_gff3_out_stream_set_fasta_width(gff3_out_stream, arguments->width);
}
+ if (!had_err && arguments->retainids)
+ gt_gff3_out_stream_retain_id_attributes(gff3_out_stream);
+
/* pull the features through the stream and free them afterwards */
if (!had_err) {
while (!(had_err = gt_node_stream_next(gff3_out_stream, &gn, err)) &&
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment