Skip to content

Instantly share code, notes, and snippets.

@brentp
Created February 12, 2009 17:50
Show Gist options
  • Save brentp/62770 to your computer and use it in GitHub Desktop.
Save brentp/62770 to your computer and use it in GitHub Desktop.
diff --git a/src/extended/gff3_out_stream.c b/src/extended/gff3_out_stream.c
index 6dbbc23..f813e8f 100644
--- a/src/extended/gff3_out_stream.c
+++ b/src/extended/gff3_out_stream.c
@@ -18,6 +18,7 @@
#include "extended/gff3_out_stream.h"
#include "extended/gff3_visitor.h"
#include "extended/node_stream_rep.h"
+#include "core/cstr_table.h"
struct GtGFF3OutStream {
const GtNodeStream parent_instance;
@@ -76,3 +77,11 @@ void gt_gff3_out_stream_set_fasta_width(GtNodeStream *gs,
gt_assert(gff3_out_stream);
gt_gff3_visitor_set_fasta_width(gff3_out_stream->gff3_visitor, fasta_width);
}
+
+/* Ask the GFF3 visitor of <gs> to retain ID attributes. */
+void gt_gff3_out_stream_retain_id_attributes(GtNodeStream *gs)
+{
+ GtGFF3OutStream *out_stream = gff3_out_stream_cast(gs);
+ gt_assert(out_stream);
+ gt_gff3_visitor_retain_id_attributes(out_stream->gff3_visitor);
+}
diff --git a/src/extended/gff3_out_stream.h b/src/extended/gff3_out_stream.h
index adc3eba..e004c5d 100644
--- a/src/extended/gff3_out_stream.h
+++ b/src/extended/gff3_out_stream.h
@@ -28,5 +28,7 @@ const GtNodeStreamClass* gt_gff3_out_stream_class(void);
GtNodeStream* gt_gff3_out_stream_new(GtNodeStream*, GtGenFile*);
void gt_gff3_out_stream_set_fasta_width(GtNodeStream*,
unsigned long);
+void gt_gff3_out_stream_retain_id_attributes(
+ GtNodeStream *);
#endif
diff --git a/src/extended/gff3_visitor.c b/src/extended/gff3_visitor.c
index 8246680..1a26b72 100644
--- a/src/extended/gff3_visitor.c
+++ b/src/extended/gff3_visitor.c
@@ -23,21 +23,27 @@
#include "core/ma.h"
#include "core/unused_api.h"
#include "core/string_distri.h"
+#include "core/cstr_table.h"
+#include "core/str_api.h"
+#include "core/warning_api.h"
#include "extended/genome_node.h"
#include "extended/gff3_output.h"
#include "extended/gff3_parser.h"
#include "extended/gff3_visitor.h"
#include "extended/node_visitor_rep.h"
+#include <stdbool.h>
struct GtGFF3Visitor {
const GtNodeVisitor parent_instance;
bool version_string_shown,
+ retain_ids, /* set by gt_gff3_visitor_retain_id_attributes() */
fasta_directive_shown;
GtStringDistri *id_counter;
GtHashmap *gt_feature_node_to_id_array,
*gt_feature_node_to_unique_id_str;
unsigned long fasta_width;
GtGenFile *outfp;
+ GtCstrTable *gt_used_ids; /* ID strings recorded by make_id_unique() */
};
typedef struct {
@@ -71,6 +77,7 @@ static void gff3_visitor_free(GtNodeVisitor *gv)
gt_string_distri_delete(gff3_visitor->id_counter);
gt_hashmap_delete(gff3_visitor->gt_feature_node_to_id_array);
gt_hashmap_delete(gff3_visitor->gt_feature_node_to_unique_id_str);
+ gt_cstr_table_delete(gff3_visitor->gt_used_ids);
}
static int gff3_visitor_comment_node(GtNodeVisitor *gv, GtCommentNode *cn,
@@ -190,18 +197,66 @@ static GtStr* create_unique_id(GtGFF3Visitor *gff3_visitor, GtFeatureNode *gf)
return id;
}
+/* Append '.' and <counter> to <current_id> in place: name => name.1 */
+static void make_unique_id_string(GtStr *current_id, unsigned long counter)
+{
+ gt_str_append_char(current_id, '.');
+ gt_str_append_ulong(current_id, counter);
+}
+
+/* Put "<id>.<i>" into <buf>; true if gt_cstr_table_get() finds none in <tab>. */
+static bool id_string_is_unique(GtStr *id, GtStr *buf, GtCstrTable *tab,
+ unsigned long i)
+{
+ gt_str_set(buf, gt_str_get(id));
+ make_unique_id_string(buf, i);
+ return !gt_cstr_table_get(tab, gt_str_get(buf));
+}
+/* Record <id> in the table of used IDs. An <id> already in the table is
+   first changed (with a warning) to "<id>.<n>", smallest unused n >= 1. */
+static void make_id_unique(GtGFF3Visitor *gff3_visitor, GtStr *id)
+{
+ GtCstrTable *used_ids = gff3_visitor->gt_used_ids;
+
+ if (gt_cstr_table_get(used_ids, gt_str_get(id))) {
+ unsigned long suffix = 1;
+ GtStr *candidate = gt_str_new();
+ /* after the loop <candidate> holds the unused "<id>.<suffix>" */
+ while (!id_string_is_unique(id, candidate, used_ids, suffix))
+ suffix++;
+ gt_warning("feature ID \"%s\" not unique: changing to %s", gt_str_get(id),
+ gt_str_get(candidate));
+ gt_str_set(id, gt_str_get(candidate));
+ gt_str_delete(candidate);
+ }
+ gt_cstr_table_add(used_ids, gt_str_get(id));
+}
+
static int store_ids(GtGenomeNode *gn, void *data, GtError *err)
{
GtGFF3Visitor *gff3_visitor = (GtGFF3Visitor*) data;
GtFeatureNode *gf = (GtFeatureNode*) gn;
AddIDInfo add_id_info;
int had_err = 0;
- GtStr *id;
+ bool has_id = false;
+ const char *id_string = gt_feature_node_get_attribute(gf, "ID");
+ GtStr *id; /* = gt_str_new_cstr(id_string); */
+ bool retain_ids = gff3_visitor->retain_ids;
gt_error_check(err);
gt_assert(gn && gf && gff3_visitor);
- if (gt_genome_node_has_children(gn) || gt_feature_node_is_multi(gf)) {
+ if (retain_ids && id_string) {
+ id = gt_str_new_cstr(id_string);
+ if (!gt_feature_node_is_multi(gf) ||
+ (gt_feature_node_is_multi(gf)
+ && gt_feature_node_get_multi_representative(gf) == gf)) {
+ make_id_unique(gff3_visitor, id);
+ }
+ gt_hashmap_add(gff3_visitor->gt_feature_node_to_unique_id_str, gf, id);
+ has_id = true;
+ }
+ else if (gt_genome_node_has_children(gn) || gt_feature_node_is_multi(gf)) {
if (gt_feature_node_is_multi(gf)) {
id = gt_hashmap_get(gff3_visitor->gt_feature_node_to_unique_id_str,
gt_feature_node_get_multi_representative(gf));
@@ -209,14 +264,18 @@ static int store_ids(GtGenomeNode *gn, void *data, GtError *err)
id = create_unique_id(gff3_visitor,
gt_feature_node_get_multi_representative(gf));
}
+
if (gt_feature_node_get_multi_representative(gf) != gf) {
gt_hashmap_add(gff3_visitor->gt_feature_node_to_unique_id_str, gf,
gt_str_ref(id));
}
}
- else
+ else {
id = create_unique_id(gff3_visitor, gf);
-
+ }
+ has_id = true;
+ }
+ if (has_id) {
/* for each child -> store the parent feature in the hash map */
add_id_info.gt_feature_node_to_id_array =
gff3_visitor->gt_feature_node_to_id_array,
@@ -224,6 +283,7 @@ static int store_ids(GtGenomeNode *gn, void *data, GtError *err)
had_err = gt_genome_node_traverse_direct_children(gn, &add_id_info, add_id,
err);
}
+ /* gt_str_delete(id); */
return had_err;
}
@@ -329,9 +389,18 @@ GtNodeVisitor* gt_gff3_visitor_new(GtGenFile *outfp)
HASH_DIRECT, NULL, (GtFree) gt_str_delete);
gff3_visitor->fasta_width = 0;
gff3_visitor->outfp = outfp;
+ /* if retain_ids is set to true, then gt_used_ids is used */
+ gff3_visitor->gt_used_ids = gt_cstr_table_new();
+ gff3_visitor->retain_ids = false;
return gv;
}
+void gt_gff3_visitor_retain_id_attributes(GtNodeVisitor *gv)
+{
+ GtGFF3Visitor *visitor = gff3_visitor_cast(gv);
+ visitor->retain_ids = true; /* store_ids() then reuses "ID" attributes */
+}
+
void gt_gff3_visitor_set_fasta_width(GtNodeVisitor *gv,
unsigned long fasta_width)
{
diff --git a/src/extended/gff3_visitor.h b/src/extended/gff3_visitor.h
index 1939565..530371c 100644
--- a/src/extended/gff3_visitor.h
+++ b/src/extended/gff3_visitor.h
@@ -27,5 +27,6 @@ const GtNodeVisitorClass* gt_gff3_visitor_class(void);
GtNodeVisitor* gt_gff3_visitor_new(GtGenFile*);
void gt_gff3_visitor_set_fasta_width(GtNodeVisitor*,
unsigned long);
+void gt_gff3_visitor_retain_id_attributes(GtNodeVisitor *);
#endif
diff --git a/src/tools/gt_gff3.c b/src/tools/gt_gff3.c
index 02e4e35..cc8f7ff 100644
--- a/src/tools/gt_gff3.c
+++ b/src/tools/gt_gff3.c
@@ -35,6 +35,7 @@
typedef struct {
bool sort,
checkids,
+ retainids,
mergefeat,
addintrons,
verbose,
@@ -91,6 +92,14 @@ static GtOptionParser* gt_gff3_option_parser_new(void *tool_arguments)
"parsing", &arguments->tidy, false);
gt_option_parser_add_option(op, option);
+ /* -retainids: reuse the ID attributes found in the input */
+ option = gt_option_new_bool("retainids",
+ "when available, use the original IDs provided "
+ "in the source file\n"
+ "(memory consumption is O(file_size))",
+ &arguments->retainids, false);
+ gt_option_parser_add_option(op, option);
+
/* -checkids */
option = gt_option_new_bool("checkids",
"make sure the ID attributes are unique "
@@ -187,6 +196,7 @@ static int gt_gff3_runner(int argc, const char **argv, int parsed_args,
gt_gff3_in_stream_show_progress_bar((GtGFF3InStream*) gff3_in_stream);
if (arguments->checkids)
gt_gff3_in_stream_check_id_attributes((GtGFF3InStream*) gff3_in_stream);
+
last_stream = gff3_in_stream;
/* set different type checker if necessary */
@@ -243,6 +253,9 @@ static int gt_gff3_runner(int argc, const char **argv, int parsed_args,
gt_gff3_out_stream_set_fasta_width(gff3_out_stream, arguments->width);
}
+ if (!had_err && arguments->retainids)
+ gt_gff3_out_stream_retain_id_attributes(gff3_out_stream);
+
/* pull the features through the stream and free them afterwards */
if (!had_err)
had_err = gt_node_stream_pull(gff3_out_stream, err);
diff --git a/testdata/multi_feature_simple_retain.gff3 b/testdata/multi_feature_simple_retain.gff3
new file mode 100644
index 0000000..292dfff
--- /dev/null
+++ b/testdata/multi_feature_simple_retain.gff3
@@ -0,0 +1,6 @@
+##gff-version 3
+##sequence-region ctg123 1 1497228
+ctg123 . gene 1000 9000 . + . ID=gene1
+ctg123 . CDS 1201 1500 . + 0 ID=CDS1;Parent=gene1
+ctg123 . CDS 3000 3902 . + 0 ID=CDS1;Parent=gene1
+###
diff --git a/testdata/png_test_2.gff3 b/testdata/png_test_2.gff3
index 5217d18..2aae61d 100644
--- a/testdata/png_test_2.gff3
+++ b/testdata/png_test_2.gff3
@@ -10,3 +10,4 @@ ctg123 . exon 1050 1500 . . . Parent=mRNA00001,mRNA00002
ctg123 . exon 3000 3902 . . . Parent=mRNA00001,mRNA00003
ctg123 . exon 5000 5500 . . . Parent=mRNA00001,mRNA00002,mRNA00003
ctg123 . exon 7000 9000 . . . Parent=mRNA00001,mRNA00002,mRNA00003
+###
diff --git a/testdata/png_test_2_out.gff3 b/testdata/png_test_2_out.gff3
new file mode 100644
index 0000000..0b20a6b
--- /dev/null
+++ b/testdata/png_test_2_out.gff3
@@ -0,0 +1,13 @@
+##gff-version 3
+##sequence-region ctg123 1 10000
+ctg123 . gene 1000 9000 . . . ID=gene00001
+ctg123 . TF_binding_site 1000 1012 . . . Parent=gene00001
+ctg123 . mRNA 1050 9000 . . . ID=mRNA00001;Parent=gene00001
+ctg123 . mRNA 1050 9000 . . . ID=mRNA00002;Parent=gene00001
+ctg123 . mRNA 1300 9000 . . . ID=mRNA00003;Parent=gene00001
+ctg123 . exon 1050 1500 . . . Parent=mRNA00001,mRNA00002
+ctg123 . exon 3000 3902 . . . Parent=mRNA00001,mRNA00003
+ctg123 . exon 5000 5500 . . . Parent=mRNA00001,mRNA00002,mRNA00003
+ctg123 . exon 7000 9000 . . . Parent=mRNA00001,mRNA00002,mRNA00003
+ctg123 . exon 1300 1500 . . . Parent=mRNA00003
+###
diff --git a/testsuite/gt_gff3_include.rb b/testsuite/gt_gff3_include.rb
index 3525dc2..9faa602 100644
--- a/testsuite/gt_gff3_include.rb
+++ b/testsuite/gt_gff3_include.rb
@@ -612,6 +612,20 @@ Test do
run "diff #{$last_stdout} #{$testdata}two_fasta_seqs.gff3"
end
+Name "gt gff3 (-retainids)"
+Keywords "gt_gff3 retainids"
+Test do
+ run_test "#{$bin}gt gff3 -retainids #{$testdata}png_test_2.gff3"
+ run "diff #{$last_stdout} #{$testdata}png_test_2_out.gff3"
+end
+
+Name "gt gff3 multi-feature (-retainids)"
+Keywords "gt_gff3 multi-feature retainids"
+Test do
+ run_test "#{$bin}gt gff3 -retainids #{$testdata}multi_feature_simple.gff3"
+ run "diff #{$last_stdout} #{$testdata}multi_feature_simple_retain.gff3"
+end
+
Name "gt gff3 simple multi-feature (round-trip)"
Keywords "gt_gff3 multi-feature"
Test do
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment