Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created June 9, 2021 08:37
Show Gist options
  • Save lindenb/00267989d5c14289ab5b20d2bae68b9f to your computer and use it in GitHub Desktop.
Save lindenb/00267989d5c14289ab5b20d2bae68b9f to your computer and use it in GitHub Desktop.
Fast GATK: calling a small region for a large number of Bams with GATK HaplotypeCaller
/*
The MIT License (MIT)
Copyright (c) 2021 Pierre Lindenbaum
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
//package com.github.lindenb.jvarkit.tools.gatk;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import org.broadinstitute.hellbender.*;
import java.util.*;
import htsjdk.samtools.util.*;
import java.io.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineArgumentParser;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
public class FastGatk extends Main {
private static final Logger LOG = LogManager.getLogger(Main.class);
@Argument(fullName = "base-dir", shortName = "D", doc = "base workfing directory",optional=false)
private File baseDir=null;
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "File to which variants should be written")
private File finalVcf = null;
@Argument(fullName = StandardArgumentDefinitions.VERBOSITY_NAME, shortName = StandardArgumentDefinitions.VERBOSITY_NAME, doc = "Control verbosity of logging.", common = true, optional = true)
public Log.LogLevel VERBOSITY = Log.LogLevel.INFO;
@Argument(fullName = StandardArgumentDefinitions.REFERENCE_LONG_NAME, shortName = StandardArgumentDefinitions.REFERENCE_SHORT_NAME, doc = "Indexed fasta reference", common = true, optional = false)
private File reference=null;
@Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, doc = "A file containing the path to the bams", common = true, optional = false)
private File bamsIn= null;
@Argument(fullName = "minimum-mapq", shortName = "Q", doc = "minimum mapping quality", common = true, optional = true)
private int mapq= 10;
@Argument(fullName = "dbsnp", shortName = "dbsnp", doc = "dbsnp file", common = true, optional = true)
private File dbsnpFile= null;
@Argument(fullName = "force", shortName = "force", doc = "do not redo if file(s) exist", common = true, optional = true)
private boolean forceRedo =false;
@Argument(fullName = "batch-size", shortName = "batch-size", doc = "batch size", common = true, optional = true)
private int batchSize=50;
@Argument(fullName = StandardArgumentDefinitions.INTERVALS_LONG_NAME, shortName = StandardArgumentDefinitions.INTERVALS_SHORT_NAME, doc = "interval chr:start-end", common = true, optional = false)
private String interval= null;
//private File tmpDirectory = null;
/** return md5 of a string */
public static final String md5(final String in) {
final MessageDigest _md5;
try {
_md5 = java.security.MessageDigest.getInstance("MD5");
} catch (final NoSuchAlgorithmException e) {
throw new RuntimeException("MD5 algorithm not found", e);
}
_md5.reset();
_md5.update(in.getBytes());
String s = new java.math.BigInteger(1, _md5.digest()).toString(16);
if (s.length() != 32) {
final String zeros = "00000000000000000000000000000000";
s = zeros.substring(0, 32 - s.length()) + s;
}
return s;
}
private void execute(final List<String> argv) throws Exception {
final String[] args = argv.toArray(new String[argv.size()]);
LOG.info(getCommandLineName()+":Executing: gatk "+ String.join(" ",argv));
final CommandLineProgram program =
this.setupConfigAndExtractProgram(args,
this.getPackageList(),
this.getClassList(),
this.getCommandLineName()
);
final Object result = Main.runCommandLineProgram(program, args);
if(result==null) return;
if(Boolean.TRUE.equals(result)) return;
LOG.warn("Returned "+ result.getClass());
LOG.error("Result is "+ result);
final Throwable err= (result instanceof Throwable?Throwable.class.cast(result):null);
if(err!=null) {
throw new RuntimeException(err);
}
else
{
throw new RuntimeException("Failure");
}
}
private File haplotypeCaller(final String bam) throws Exception {
LOG.info(bam);
final String outfname = md5(bam+":"+this.interval)+".g.vcf.gz";
final File out = new File(this.baseDir,outfname);
final File outtbi = new File(this.baseDir,outfname+".tbi");
if(!this.forceRedo && out.exists() && outtbi.exists()) {
LOG.info("File already exists:"+out+" for "+bam);
return out;
}
final List<String> cmd= new ArrayList<>();
cmd.add("HaplotypeCaller");
cmd.add("-R");
cmd.add(this.reference.toString());
cmd.add("-I");
cmd.add(bam);
cmd.add("-L");
cmd.add(this.interval);
cmd.add("-ERC");
cmd.add("GVCF");
cmd.add("-O");
cmd.add(out.toString());
cmd.add("--tmp-dir");
cmd.add(this.baseDir.toString());
if(this.dbsnpFile!=null) {
cmd.add("--dbsnp");
cmd.add(this.dbsnpFile.toString());
}
if(this.mapq>0) {
cmd.add("--minimum-mapping-quality");
cmd.add(String.valueOf(this.mapq));
}
execute(cmd);
IOUtil.assertFileIsReadable(out);
IOUtil.assertFileIsReadable(outtbi);
return out;
}
private File dbImport(final List<File> gvcfs) throws Exception {
final String outfname = md5("database:"+this.interval);
final File database = new File(this.baseDir,outfname+".db");
final File dbFlag = new File(this.baseDir,outfname+".db.flag");
if(!this.forceRedo && database.exists() && dbFlag.exists() &&
IOUtil.slurp(dbFlag).equals("0")
) {
LOG.info("Database was already processed:" + database);
return database;
}
if(database.exists() && database.isDirectory()) {
LOG.warn("Deleting old "+database);
IOUtil.deleteDirectoryTree(database);
}
final List<String> cmd = new ArrayList<>();
cmd.add("GenomicsDBImport");
cmd.add("--genomicsdb-workspace-path");
cmd.add(database.toString());
cmd.add("-L");
cmd.add(this.interval);
cmd.add("--tmp-dir");
cmd.add(this.baseDir.toString());
if(this.batchSize>0) {
cmd.add("--batch-size");
cmd.add(String.valueOf(this.batchSize));
if(gvcfs.size()/this.batchSize>100) {
// Use the consolidate flag if more than a hundred batches were used.
cmd.add("--consolidate");
}
}
for(final File gvcf : gvcfs) {
cmd.add("-V");
cmd.add(gvcf.toString());
}
execute(cmd);
IOUtil.assertDirectoryIsReadable(database);
try(PrintWriter pw = new PrintWriter(dbFlag)) {
pw.print("0");
pw.flush();
}
return database;
}
private File genotype(final File database) throws Exception {
final List<String> cmd = new ArrayList<>();
cmd.add("GenotypeGVCFs");
cmd.add("-V");
cmd.add("gendb://"+database.toString());
cmd.add("-O");
cmd.add(this.finalVcf.toString());
cmd.add("-R");
cmd.add(this.reference.toString());
cmd.add("--tmp-dir");
cmd.add(this.baseDir.toString());
if(this.dbsnpFile!=null) {
cmd.add("--dbsnp");
cmd.add(this.dbsnpFile.toString());
}
execute(cmd);
IOUtil.assertFileIsReadable(this.finalVcf);
return this.finalVcf;
}
private int doWork(final String[] args) {
try {
final CommandLineArgumentParser cmdLineParser = new CommandLineArgumentParser(this);
final boolean ret = cmdLineParser.parseArguments(System.err, args);
if (!ret) {
System.err.println(cmdLineParser.usage(false, false));
return -1;
}
//create workdir
if(!this.baseDir.exists() && !this.baseDir.mkdir()) {
LOG.error("Cannot create "+this.baseDir);
return -1;
}
final List<String> bams = IOUtil.slurpLines(this.bamsIn);
if (bams.isEmpty()) {
LOG.error("No Bam was provided");
return -1;
}
final List<File> gvcfs = new ArrayList<>(bams.size());
for(String bam:bams) {
gvcfs.add(haplotypeCaller(bam));
}
final File database = dbImport(gvcfs);
genotype(database);
LOG.info("OK "+ this.finalVcf);
return 0;
}
catch(final Throwable err) {
err.printStackTrace();
return -1;
}
}
@Override
protected String getCommandLineName() {
return this.getClass().getSimpleName();
}
public static void main(final String[] args) {
int ret= new FastGatk().doWork(args);
System.exit(ret);
}
}
#
# Script
#
# real 10m33.301s
# user 2m52.326s
# sys 0m15.430s
# Wrapper
#
# real 4m44.807s
# user 1m38.720s
# sys 0m6.059s
#
all: result1.vcf.gz result2.vcf.gz
result2.vcf.gz: standalone.sh jeter.list
time bash ./standalone.sh
result1.vcf.gz : fastgatk.jar jeter.list
rm -rf TMP1
time java -Djava.io.tmpdir=TMP1 -cp ${GATKJAR}:$< FastGatk \
-I jeter.list --base-dir TMP1 --output $@ \
--reference /home/lindenbaum-p/src/jvarkit/src/test/resources/rotavirus_rf.fa \
-L RF01 2> log.err
jeter.list:
find /home/lindenbaum-p/src/jvarkit/src/test/resources/ -type f -name "S*.bam" > jeter.list
fastgatk.jar: FastGatk.java
rm -rf TMP
mkdir TMP
javac -d TMP -cp ${GATKJAR} -sourcepath . $<
jar cvf $@ -C TMP .
rm -rf TMP
clean:
rm -rf fastgatk.jar TMP1 TMP2 result1.vcf.gz result1.vcf.gz.tbi result2.vcf.gz result2.vcf.gz.tbi
@lindenb
Copy link
Author

lindenb commented Jun 9, 2021

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment