Created
September 3, 2019 00:25
-
-
Save markziemann/3fc0c90e59c508c66067681a6c6dc3a1 to your computer and use it in GitHub Desktop.
Create a library of gene sets based on protein domains
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Create a GMT file of gene sets classified by protein domain (InterPro).
#
# First obtain the input data from Ensembl BioMart:
#   Go to https://www.ensembl.org/biomart/martview/
#   Select the human database
#   Select the following attributes, in this order, and export as TSV:
#     1. Gene stable ID
#     2. Interpro ID
#     3. Interpro Short Description
#     4. Interpro Description
#     5. HGNC symbol
#
# Usage:
#   <this script> [INPUT_TSV [OUTPUT_GMT]]
#     INPUT_TSV   BioMart export with a header row (default: mart_export.txt)
#     OUTPUT_GMT  file to write                    (default: ipr.gmt)
#
# Environment:
#   MAX_SETS  maximum number of InterPro IDs to emit, taken in sorted ID
#             order; 0 means no limit (default: 5).
#             NOTE(review): the default of 5 preserves the original `head -5`,
#             which looks like a debugging leftover -- confirm, and run with
#             MAX_SETS=0 to build the complete library.
#
# Output format, one line per InterPro ID:
#   <Interpro Description> TAB <Interpro ID> TAB <symbol> [TAB <symbol> ...]

# errexit is deliberately left off: failures are checked explicitly and the
# status of main becomes the exit status of the script.
set -uo pipefail

DAT=${1:-mart_export.txt}
OUT=${2:-ipr.gmt}
MAX_SETS=${MAX_SETS:-5}

#######################################
# Print GMT lines for a BioMart export.
# Arguments:
#   $1 - path to the tab-separated BioMart export (first line is a header)
#   $2 - maximum number of gene sets to print; 0 means unlimited
# Outputs:
#   GMT lines on stdout, ordered by InterPro ID; the symbols within a set are
#   unique and byte-wise sorted.
# Returns:
#   non-zero if any stage of the pipeline fails
#######################################
build_gmt() {
  local dat=$1
  local max_sets=$2

  # Stage 1: keep only (InterPro ID, description, symbol). Rows lacking an
  #          InterPro ID or a symbol cannot contribute to a gene set.
  # Stage 2: order by ID then symbol and drop duplicate (ID, symbol) pairs.
  #          LC_ALL=C makes ordering and de-duplication locale-independent.
  # Stage 3: fold each run of rows sharing an ID into one GMT line. Values
  #          are passed to printf as arguments, so descriptions containing
  #          characters such as '#', '&' or '\' are emitted verbatim.
  awk -F'\t' 'NR > 1 && $2 != "" && $5 != "" { print $2 "\t" $4 "\t" $5 }' \
    "$dat" \
    | LC_ALL=C sort -t$'\t' -k1,1 -k3,3 -u \
    | awk -F'\t' -v max="$max_sets" '
        BEGIN { current = ""; sets = 0; in_set = 0; full = 0 }
        # Once the limit is reached, drain the remaining input rather than
        # exiting, so the upstream sort is not killed by a closed pipe.
        full { next }
        # Appending "" forces a string comparison of the IDs.
        ($1 "") != current {
          if (in_set) printf "\n"
          in_set = 0
          if (max > 0 && sets >= max) { full = 1; next }
          current = $1 ""
          sets++
          in_set = 1
          printf "%s\t%s", $2, current
        }
        { printf "\t%s", $3 }
        END { if (in_set) printf "\n" }
      '
}

#######################################
# Validate the configuration and write the GMT file.
# Globals:
#   DAT (read), OUT (read), MAX_SETS (read)
# Returns:
#   0 on success, 1 if the input is unreadable, 2 if MAX_SETS is invalid,
#   otherwise the failure status of the pipeline
#######################################
main() {
  if [[ ! -r "$DAT" ]]; then
    printf 'error: cannot read input file: %s\n' "$DAT" >&2
    return 1
  fi
  if [[ ! "$MAX_SETS" =~ ^[0-9]+$ ]]; then
    printf 'error: MAX_SETS must be a non-negative integer, got: %s\n' \
      "$MAX_SETS" >&2
    return 2
  fi
  build_gmt "$DAT" "$MAX_SETS" > "$OUT"
}

main
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment