Skip to content

Instantly share code, notes, and snippets.

@lell
Created May 5, 2016 17:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lell/b1d2dfed459cc710e2ea170d51130f58 to your computer and use it in GitHub Desktop.
Save lell/b1d2dfed459cc710e2ea170d51130f58 to your computer and use it in GitHub Desktop.
function [rsids, chromStart, chromEnd, chrom] = gene2snps(gene, varargin)
%GENE2SNPS find all SNPs lying within a given human gene.
%
% [rsids, chromStart, chromEnd, chrom] = GENE2SNPS(GENE, ...), returns the
% RSIDs of all SNPs lying within the human gene given by GENE,
% according to reference genome hg38 and SNP database snp146. The
% returned variable RSIDS is a cell array of strings specifying the
% SNPs. Bounds for the chromosome locations of the returned SNPs are
% returned in the variables CHROMSTART and CHROMEND. The chromosome on
% which the gene lies is returned in the variable CHROM.
%
% Inputs:
% GENE string naming the given gene (e.g., 'brca1' or 'apoe').
%
% The following optional input arguments can be provided using
% MATLAB's 'param'/'value' syntax:
%
% 'Flank', flank real scalar in the interval [0, 1] specifying how
% far to search on either side of the gene for flanking
% SNPs. If FLANK is positive, the region in which SNPs are
% searched for will be increased by FLANK times the length
% of the given gene (in basepairs), extending from both ends
% of the gene. Default is zero (no flanking SNPs included).
%
% 'Outfile', filename string specifying that the RSIDs should, in
% addition to being returned, also be printed to the file
% given by FILENAME as a newline delimited list.
%
% Outputs:
% RSIDS cell array of strings specifying the SNPs in the given gene
% and (optionally) the flanking region.
%
% CHROMSTART integer scalar indicating the position (in basepairs) of
% the leftmost SNP.
%
% CHROMEND integer scalar indicating the position (in basepairs) of
% the rightmost SNP.
%
% CHROM string naming the chromosome on which the given gene is found
% in humans.
%
% Example:
%
% > %%% %%% %%%
% > rsids = gene2snps('apoe')
% rsids =
%
% Columns 1 through 6
%
% 'rs539159437' 'rs767814822' 'rs758379972' 'rs766215051'
%
% Columns 7 through 9
%
% 'rs750782549' 'rs751150968' 'rs577386185' 'rs780699141'
%
% ...
% ...
%
% Copyright (c) 2016, Lloyd T. Elliott. All rights reserved.
%
% Redistribution and use in source and binary forms, with or without
% modification, are permitted provided that the following conditions are met:
%
% 1. Redistributions of source code must retain the above copyright notice, this
% list of conditions and the following disclaimer.
%
% 2. Redistributions in binary form must reproduce the above copyright notice,
% this list of conditions and the following disclaimer in the documentation
% and/or other materials provided with the distribution.
%
% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
% DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
% FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
% DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
% SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
% CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
% OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
% OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
parser=inputParser;
default_flank=0.0;
default_outfile=[];
addRequired(parser,'gene',@isstr);
addOptional(parser,'Flank',default_flank,@isnumeric);
addOptional(parser,'Outfile',default_outfile,@isstr);
parse(parser,gene,varargin{:});
flank=parser.Results.Flank;
outfile=parser.Results.Outfile;
gene = upper(gene);
command = sprintf('mysql --user=anonymous --host=ensembldb.ensembl.org --port=3306 -A -D homo_sapiens_core_84_38 -e ''select gene_id from gene_attrib where value = "%s"''|cat', gene);
[~, output] = system(command);
cols = regexp(output, '\s+', 'split');
if length(cols) ~= 3
error(sprintf('gene2snps: Couldn''t find gene_id for gene %s.\n', gene));
end
gene_id = cols{2};
command = sprintf('mysql --user=anonymous --host=ensembldb.ensembl.org --port=3306 -A -D homo_sapiens_core_84_38 -e ''select stable_id from gene where gene_id="%s"'' |cat', gene_id);
[~, output] = system(command);
cols = regexp(output, '\s+', 'split');
if length(cols) ~= 3
error(sprintf('gene2snps: Couldn''t find ensg for gene %s.\n', gene));
end
ensg = cols{2};
command = sprintf('mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg38 -e "select chromStart, chromEnd, chrom from knownCanonical where protein like ''%s%%''" |cat', ensg);
[~, output] = system(command);
cols = regexp(output, '\s+', 'split');
if length(cols) == 3
error(sprintf('gene2snps: Couldn''t find ensg ''%s'' in ucsc table browser.', ensg));
elseif length(cols)-1 == 6
chromStart = str2num(cols{4});
chromEnd = str2num(cols{5});
chrom = cols{6};
else
chromStart = Inf;
chromEnd = -Inf;
chrom = cols{6};
for i = 1:((length(cols)-1)/3)
a = str2num(cols{i*3 + 1});
b = str2num(cols{i*3 + 2});
chrom0 = cols{i*3 + 3};
if ~strcmp(chrom, chrom0)
error(sprintf('gene2snps: Found the same ensgs on multiple chromosome (%s and %s).', chrom, chrom0));
end
if a < chromStart
chromStart = a;
end
if b > chromEnd
chromEnd = b;
end
end
end
len = chromEnd - chromStart;
flank = round(len * flank);
command = sprintf('mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg38 -e "select name from snp146 where chromStart >= %d and chromEnd <= %d and chrom = ''%s''" | cat', chromStart - flank, chromEnd + flank, chrom);
[~, output] = system(command);
rsids = regexp(output, '\s+', 'split');
rsids = rsids(2:(end-1));
if ~isempty(outfile)
fd = fopen(outfile, 'w');
for i = 1:length(rsids)
fprintf(fd, '%s\n', rsids{i});
end
fclose(fd);
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment