Created
May 5, 2016 17:41
-
-
Save lell/b1d2dfed459cc710e2ea170d51130f58 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [rsids, chromStart, chromEnd, chrom] = gene2snps(gene, varargin) | |
%GENE2SNPS find all SNPs lying within a given human gene. | |
% | |
% [rsids, chromStart, chromEnd, chrom] = GENE2SNPS(GENE, ...), returns the | |
% RSIDs of all SNPs lying within the human gene given by GENE, | |
% according to reference genome hg38 and SNP database snp146. The | |
% returned variable RSIDS is a cell array of strings specifying the | |
% SNPs. Bounds for the chromosome locations of the returned SNPs are | |
% returned in the variables CHROMSTART and CHROMEND. The chromosome on | |
% which the gene lies is returned in the variable CHROM. | |
% | |
% Inputs: | |
% GENE string naming the given gene (e.g., 'brca1' or 'apoe'). | |
% | |
% The following optional input arguments can be provided using | |
% MATLAB's 'param'/'value' syntax: | |
% | |
% 'Flank', flank real scalar in the interval [0, 1] specifying how | |
% far to search on either side of the gene for flanking | |
% SNPs. If FLANK is positive, the region in which SNPs are | |
% searched for will be increased by FLANK times the length | |
% of the given gene (in basepairs), extending from both ends | |
% of the gene. Default is zero (no flanking SNPs included). | |
% | |
% 'Outfile', filename string specifying that the RSIDs should, in | |
% addition to being returned, also be printed to the file | |
% given by FILENAME as a newline delimited list. | |
% | |
% Outputs: | |
% RSIDS cell array of strings specifying the SNPs in the given gene | |
% and (optionally) the flanking region. | |
% | |
% CHROMSTART integer scalar indicating the position (in basepairs) of | |
% the leftmost SNP. | |
% | |
% CHROMEND integer scalar indicating the position (in basepairs) of | |
% the rightmost SNP. | |
% | |
% CHROM string naming the chromosome on which the given gene is found | |
% in humans. | |
% | |
% Example: | |
% | |
% > %%% %%% %%% | |
% > rsids = gene2snps('apoe') | |
% rsids = | |
% | |
% Columns 1 through 6 | |
% | |
% 'rs539159437' 'rs767814822' 'rs758379972' 'rs766215051' | |
% | |
% Columns 7 through 9 | |
% | |
% 'rs750782549' 'rs751150968' 'rs577386185' 'rs780699141' | |
% | |
% ... | |
% ... | |
% | |
% Copyright (c) 2016, Lloyd T. Elliott. All rights reserved. | |
% | |
% Redistribution and use in source and binary forms, with or without | |
% modification, are permitted provided that the following conditions are met: | |
% | |
% 1. Redistributions of source code must retain the above copyright notice, this | |
% list of conditions and the following disclaimer. | |
% | |
% 2. Redistributions in binary form must reproduce the above copyright notice, | |
% this list of conditions and the following disclaimer in the documentation | |
% and/or other materials provided with the distribution. | |
% | |
% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
% DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
% FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
% DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
% SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
% CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
% OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
% OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
parser=inputParser; | |
default_flank=0.0; | |
default_outfile=[]; | |
addRequired(parser,'gene',@isstr); | |
addOptional(parser,'Flank',default_flank,@isnumeric); | |
addOptional(parser,'Outfile',default_outfile,@isstr); | |
parse(parser,gene,varargin{:}); | |
flank=parser.Results.Flank; | |
outfile=parser.Results.Outfile; | |
gene = upper(gene); | |
command = sprintf('mysql --user=anonymous --host=ensembldb.ensembl.org --port=3306 -A -D homo_sapiens_core_84_38 -e ''select gene_id from gene_attrib where value = "%s"''|cat', gene); | |
[~, output] = system(command); | |
cols = regexp(output, '\s+', 'split'); | |
if length(cols) ~= 3 | |
error(sprintf('gene2snps: Couldn''t find gene_id for gene %s.\n', gene)); | |
end | |
gene_id = cols{2}; | |
command = sprintf('mysql --user=anonymous --host=ensembldb.ensembl.org --port=3306 -A -D homo_sapiens_core_84_38 -e ''select stable_id from gene where gene_id="%s"'' |cat', gene_id); | |
[~, output] = system(command); | |
cols = regexp(output, '\s+', 'split'); | |
if length(cols) ~= 3 | |
error(sprintf('gene2snps: Couldn''t find ensg for gene %s.\n', gene)); | |
end | |
ensg = cols{2}; | |
command = sprintf('mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg38 -e "select chromStart, chromEnd, chrom from knownCanonical where protein like ''%s%%''" |cat', ensg); | |
[~, output] = system(command); | |
cols = regexp(output, '\s+', 'split'); | |
if length(cols) == 3 | |
error(sprintf('gene2snps: Couldn''t find ensg ''%s'' in ucsc table browser.', ensg)); | |
elseif length(cols)-1 == 6 | |
chromStart = str2num(cols{4}); | |
chromEnd = str2num(cols{5}); | |
chrom = cols{6}; | |
else | |
chromStart = Inf; | |
chromEnd = -Inf; | |
chrom = cols{6}; | |
for i = 1:((length(cols)-1)/3) | |
a = str2num(cols{i*3 + 1}); | |
b = str2num(cols{i*3 + 2}); | |
chrom0 = cols{i*3 + 3}; | |
if ~strcmp(chrom, chrom0) | |
error(sprintf('gene2snps: Found the same ensgs on multiple chromosome (%s and %s).', chrom, chrom0)); | |
end | |
if a < chromStart | |
chromStart = a; | |
end | |
if b > chromEnd | |
chromEnd = b; | |
end | |
end | |
end | |
len = chromEnd - chromStart; | |
flank = round(len * flank); | |
command = sprintf('mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg38 -e "select name from snp146 where chromStart >= %d and chromEnd <= %d and chrom = ''%s''" | cat', chromStart - flank, chromEnd + flank, chrom); | |
[~, output] = system(command); | |
rsids = regexp(output, '\s+', 'split'); | |
rsids = rsids(2:(end-1)); | |
if ~isempty(outfile) | |
fd = fopen(outfile, 'w'); | |
for i = 1:length(rsids) | |
fprintf(fd, '%s\n', rsids{i}); | |
end | |
fclose(fd); | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment