Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created April 4, 2024 10:59
Show Gist options
  • Save lindenb/a20692a5f34d9da187cc834ad08c1896 to your computer and use it in GitHub Desktop.
Save lindenb/a20692a5f34d9da187cc834ad08c1896 to your computer and use it in GitHub Desktop.
Tool to Identify Gene, Regulatory Role, and Function at Integration Sites https://www.biostars.org/p/9591769/
# Convert the "gene" records of a GTF stream into RDF/XML <bio:Gene> elements.
#
# Input  : tab-separated lines; column 3 is the feature type, columns 1/4/5 are
#          printed as chrom/start/end, and column 9 holds ';'-separated
#          'key "value"' attributes.
# Output : one <bio:Gene> element per "gene" line that carries a gene_id.
# BUILD  : awk variable (e.g. set with -v BUILD=...) printed as <bio:build>.

# Escape the characters that are special in XML text and attribute values.
function xml_escape(s) {
	# "\\&" yields a literal '&' in the replacement; a bare '&' would mean
	# "the matched text". '&' must be handled first.
	gsub(/&/, "\\&amp;", s);
	gsub(/</, "\\&lt;", s);
	gsub(/>/, "\\&gt;", s);
	gsub(/"/, "\\&quot;", s);
	return s;
}

BEGIN {
	FS = "\t";
}

($3 == "gene") {
	gene_id = "";
	gene_name = "";
	gene_biotype = "";

	n_attr = split($9, attrs, /[ ]*[;][ ]*/);
	for (i = 1; i <= n_attr; ++i) {
		# Split on the FIRST space only, so that a value which itself
		# contains spaces is kept whole instead of being truncated.
		sp = index(attrs[i], " ");
		if (sp == 0) {
			key = attrs[i];
			value = "";
		} else {
			key = substr(attrs[i], 1, sp - 1);
			value = substr(attrs[i], sp + 1);
			sub(/^[ ]+/, "", value);
		}
		# Drop the double quotes surrounding the value.
		gsub(/"/, "", value);

		if (key == "gene_id") gene_id = value;
		else if (key == "gene_name") gene_name = value;
		else if (key == "gene_biotype") gene_biotype = value;
	}

	# A gene without an identifier cannot be given an rdf:about URI.
	if (gene_id == "") next;

	printf("<bio:Gene rdf:about=\"%s\">\n", xml_escape(gene_id));
	printf("\t<bio:gene_id>%s</bio:gene_id>\n", xml_escape(gene_id));
	# gene_name and gene_biotype are emitted only when present.
	if (gene_name != "") printf("\t<bio:gene_name>%s</bio:gene_name>\n", xml_escape(gene_name));
	if (gene_biotype != "") printf("\t<bio:gene_biotype>%s</bio:gene_biotype>\n", xml_escape(gene_biotype));
	printf("\t<bio:location>\n");
	printf("\t\t<bio:Location>\n");
	printf("\t\t\t<bio:build>%s</bio:build>\n", xml_escape(BUILD));
	printf("\t\t\t<bio:chrom>%s</bio:chrom>\n", xml_escape($1));
	printf("\t\t\t<bio:start rdf:datatype=\"http://www.w3.org/2001/XMLSchema#int\">%s</bio:start>\n", $4);
	printf("\t\t\t<bio:end rdf:datatype=\"http://www.w3.org/2001/XMLSchema#int\">%s</bio:end>\n", $5);
	printf("\t\t</bio:Location>\n");
	printf("\t</bio:location>\n");
	printf("</bio:Gene>\n");
}
# Recipes rely on bash-only syntax (the $$'\t' quoting), so force bash.
SHELL := /bin/bash
# Fail a pipeline when ANY stage fails, so that a broken download
# (wget | gunzip | ... > target) is reported instead of silently producing an
# empty target. Honoured by GNU Make >= 3.82; ignored by older versions.
.SHELLFLAGS := -o pipefail -c
# Remove a target whose recipe failed, so a half-written file is never
# mistaken for an up-to-date one on the next run.
.DELETE_ON_ERROR:

# Directory receiving every generated file.
OUTDIR := TMP
# Genome build: used in the GTF download URL and written into the RDF output.
BUILD := GRCh38
# Path to the Apache Jena `arq` SPARQL command-line tool. This default is a
# placeholder: override it, e.g. `make ARQ=/opt/apache-jena-4.8.0/bin/arq`.
ARQ ?= /path/to/apache-jena-4.8.0/bin/arq

# Default goal: build the RDF database, then run the SPARQL query against it.
# `all` names an action, not a file, hence .PHONY.
.PHONY: all
all: $(OUTDIR)/database.rdf query.01.sparql
	$(ARQ) --data=$< --query=$(word 2,$^)
# Assemble the final RDF/XML document: an <rdf:RDF> header declaring the
# bio/rdf/rdfs namespaces, the concatenated fragment files, and the closing tag.
$(OUTDIR)/database.rdf: $(OUTDIR)/go.rdf $(OUTDIR)/gtf.rdf
	mkdir -p $(@D)
	echo '<?xml version="1.0" encoding="UTF-8"?><rdf:RDF xmlns:bio="https://www.biostars.org/#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xml:base="https://www.biostars.org/">' > $@
	cat $^ >> $@ && echo "</rdf:RDF>" >> $@
# Build the Gene Ontology part of the database from two NCBI Gene tables:
#   .tmp1 : columns 2,3,6 of gene2go (rows whose column 1 is 9606), sorted on column 1
#   .tmp2 : columns 2,3 of gene2ensembl (rows whose column 1 is 9606), sorted on column 1
#   .tmp3 : join of .tmp1 and .tmp2 on their first column
# From .tmp3, fields 2,3 become <bio:Term> elements (GO id + label) and fields
# 2,4 become bio:has_go_term links from the resource named by field 4.
$(OUTDIR)/go.rdf:
	mkdir -p $(dir $@)
	wget -O - "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz" | gunzip -c |\
		awk -F '\t' '$$1==9606' | cut -f 2,3,6 | sort -T $(dir $@) -t $$'\t' -k1,1 > $(addsuffix .tmp1,$@)
	wget -O - "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2ensembl.gz" | gunzip -c |\
		awk -F '\t' '$$1==9606' | cut -f 2,3 | sort -T $(dir $@) -t $$'\t' -k1,1 > $(addsuffix .tmp2,$@)
	join -t $$'\t' -1 1 -2 1 $(addsuffix .tmp1,$@) $(addsuffix .tmp2,$@) > $(addsuffix .tmp3,$@)
	# First write TRUNCATES the target ('>'), so a leftover file from an earlier run is never appended to.
	# The label is XML-escaped because it is free text placed inside an element.
	cut -f 2,3 $(addsuffix .tmp3,$@) | sort -T $(dir $@) | uniq |\
		awk -F '\t' '{GO=$$1;gsub(/:/,"_",GO);L=$$2;gsub(/&/,"\\&amp;",L);gsub(/</,"\\&lt;",L);gsub(/>/,"\\&gt;",L); printf("<bio:Term rdf:about=\"%s\"><bio:go_id>%s</bio:go_id><rdfs:label>%s</rdfs:label></bio:Term>\n",GO,$$1,L);}' > $@
	# De-duplicate the (GO id, gene) pairs: the join can repeat the same pair many times.
	cut -f 2,4 $(addsuffix .tmp3,$@) | sort -T $(dir $@) | uniq |\
		awk -F '\t' '{GO=$$1;gsub(/:/,"_",GO); printf("<rdf:Description rdf:about=\"%s\"><bio:has_go_term rdf:resource=\"%s\"/></rdf:Description>\n",$$2,GO);}' >> $@
	rm $(addsuffix .tmp1,$@) $(addsuffix .tmp2,$@) $(addsuffix .tmp3,$@)
# Stream the Ensembl release-111 GTF for $(BUILD), keep only the lines whose
# first field is "1", and turn them into RDF with the awk script listed as
# the prerequisite. BUILD is forwarded to that script as an awk variable.
$(OUTDIR)/gtf.rdf : gtf2rdf.awk
	mkdir -p $(@D)
	wget -O - "https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.$(BUILD).111.chr.gtf.gz" |\
		gunzip -c |\
		awk '($$1=="1")' |\
		awk -vBUILD=$(BUILD) -f $< > $@
# List every gene on chromosome "1" whose [start, end] interval contains
# position 20746689, with its location and, when available, its GO terms.
PREFIX bio: <https://www.biostars.org/#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT
    ?build
    ?chrom
    ?start
    ?end
    ?gene_id
    ?gene_name
    ?gene_biotype
    ?go_id
    ?go_label
WHERE {
    ?gene bio:gene_id ?gene_id ;
          bio:location ?loc .

    ?loc a bio:Location ;
         bio:build ?build ;
         bio:chrom ?chrom ;
         bio:start ?start ;
         bio:end ?end .

    # gene_name and gene_biotype are only written when the source record has
    # them, so they must be OPTIONAL: otherwise an unnamed gene overlapping
    # the position would be silently left out of the result.
    OPTIONAL { ?gene bio:gene_name ?gene_name . }
    OPTIONAL { ?gene bio:gene_biotype ?gene_biotype . }

    # GO annotation is optional: a gene without GO terms is still reported.
    OPTIONAL {
        ?gene bio:has_go_term ?go .
        ?go bio:go_id ?go_id .
        ?go rdfs:label ?go_label .
    }

    FILTER( ?chrom = "1" && ?start <= 20746689 && ?end >= 20746689 )
}
We can make this file beautiful and searchable if this error is corrected: No tabs found in this TSV file in line 0.
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| build | chrom | start | end | gene_id | gene_name | gene_biotype | go_id | go_label |
================================================================================================================================================================================================================================================
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0006334" | "nucleosome assembly" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0000786" | "nucleosome" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0005634" | "nucleus" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0070828" | "heterochromatin organization" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0031491" | "nucleosome binding" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0006355" | "regulation of DNA-templated transcription" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0003677" | "DNA binding" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0005694" | "chromosome" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0016607" | "nuclear speck" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0042127" | "regulation of cell population proliferation" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0005515" | "protein binding" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0097298" | "regulation of nucleus size" |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0071456" | "cellular response to hypoxia" |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment