Skip to content

Instantly share code, notes, and snippets.

@larssono
Last active August 29, 2015 14:10
Show Gist options
  • Save larssono/7bcb7ef8dc488630f9ab to your computer and use it in GitHub Desktop.
Save larssono/7bcb7ef8dc488630f9ab to your computer and use it in GitHub Desktop.
import synapseclient
from synapseclient import File, Activity, Wiki
# Open the Synapse session used by every syn.store() call below.
# No credentials are passed here -- presumably they come from a local
# Synapse config/cached login; confirm before running unattended.
syn = synapseclient.login()
# ---------------------------------------------------------------------------
# Upload parameters. Everything below is example data for ONE file and is
# meant to be replaced per upload.
# ---------------------------------------------------------------------------

# Synapse ID of the folder the files are uploaded into.
DKFZ_FOLDER = 'syn2898426'

# Workflow name/version; used in the file name and stored as annotations.
WORKFLOW = 'oicr-sga'
WORKFLOW_VERSION = '1-0-0'

# Markdown body of the wiki page attached to the uploaded file.
# Written with double quotes because the text itself contains single quotes
# ('ANALYSIS_TYPE', EGA's, ...), which terminated the literal early before.
# NOTE(review): this text names the SangerPancancerCgpCnIndelSnvStr workflow
# v1.0.1, while WORKFLOW/WORKFLOW_VERSION above say oicr-sga 1-0-0 and the
# provenance record says DKFZ -- confirm which pipeline this template is for.
DESCRIPTION = (
    "This is the variant calling for specimen "
    "669a4076-13de-42dc-895c-85d040422042 from donor "
    "05506f4c-e701-4a9d-ae06-97f066aade43. "
    "The results consist of one or more VCF files plus optional tar.gz "
    "files that contain additional file types. "
    "This uses the SangerPancancerCgpCnIndelSnvStr workflow, version 1.0.1 "
    "available at "
    "https://s3.amazonaws.com/oicr.workflow.bundles/released-bundles/Workflow_Bundle_SangerPancancerCgpCnIndelSnvStr_1.0.1_SeqWare_1.1.0-alpha.5.zip. "
    "This workflow can be created from source, see "
    "https://github.com/ICGC-TCGA-PanCancer/SeqWare-CGP-SomaticCore. "
    "For a complete change log see "
    "https://github.com/testproject/workflow-test-cancer/blob/1.0.0/workflow-test-cancer/CHANGELOG.md. "
    "Note the 'ANALYSIS_TYPE' is 'REFERENCE_ASSEMBLY' but a better term to "
    "describe this analysis is 'SEQUENCE_VARIATION' as defined by the EGA's "
    "SRA 1.5 schema. "
    "Please note the reference used for alignment was hs37d, see "
    "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/README_human_reference_20110707 "
    "for more information. "
    "Briefly this is the integrated reference sequence from the GRCh37 "
    "primary assembly (chromosomal plus unlocalized and unplaced contigs), "
    "the rCRS mitochondrial sequence (AC:NC_012920), Human herpesvirus 4 "
    "type 1 (AC:NC_007605) and the concatenated decoy sequences "
    "(hs37d5cs.fa.gz). "
    "Variant calls may not be present for all contigs in this reference."
)

# Wiki page title. The donor UUID was previously truncated by one character
# ("...aade4"); it now matches the donor ID given in DESCRIPTION.
TITLE = 'TCGA/ICGC PanCancer Donor-Level Variant Calling for Participant 05506f4c-e701-4a9d-ae06-97f066aade43'

# Components of the uploaded file's name.
sample_id = '7d7205e8-d864-11e3-be46-bd5eb93a18bb'
date = '20140718'
call_type = 'somatic'
#Create Provenance log
# The Activity records what went into the calls (`used`) and what code
# produced them (`executed`). Keyword names must be spelled exactly
# `description` / `executed`; the earlier misspellings were rejected by
# Activity() as unexpected keyword arguments.
provenance = Activity(
    name='DKFZ variant calling v.1.1.0',
    description='Variant calling for indels, SNVs, and copy numbers based on the DKFZ pipeline',
    used=[
        'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz',  #BRIAN is this the right reference?
        'ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/sv/breakpoint_assemblies.fasta',
        'http://tcga-data.nci.nih.gov/docs/GAF/GAF_bundle_Feb2011/outputs/TCGA.hg18.Feb2011.gaf',
        'https://gtrepo-dkfz.annailabs.com/cghub/data/analysis/download/1af1586c-05e9-11e4-86b9-9541c49f5d8e',  #GNOS download path for tumor bam #BRIAN will change for each file
        'https://gtrepo-dkfz.annailabs.com/cghub/data/analysis/download/1aeaa38c-05e9-11e4-86b9-9541c49f5d8e',  #GNOS download path for normal bam #BRIAN will change for each file
    ],  #BRIAN any other components that should be explicitly called out?
    executed=[
        'https://github.com/SeqWare/public-workflows/tree/vcf-1.1.0/workflow-DKFZ-bundle',  #BRIAN check on the URL
        # The empty-string placeholder that used to follow was dropped: it is
        # neither a URL nor a Synapse ID. Add real entries here once known.
        #Brian I was unable to find any of the executables that are being run e.g. varscan
    ],
)
# Store the record itself (the old line referenced an undefined name `prov`).
provenance = syn.store(provenance)
#BRIAN Add for loop over the file you want to add :
# `suffix` carries its own leading dot, so it is appended directly after the
# last field instead of being joined with another '.' (which gave '..').
suffix = '.snv_mnv.vcf.gz'

# Local file to upload:
#   <sample_id>.<workflow>_<version>.<date>.<call_type><suffix>
# The format string now has one placeholder per argument (it had five for
# six) and references WORKFLOW_VERSION (was misspelled WORLFLOW_VERSION).
path = '/path/to/file/%s.%s_%s.%s.%s%s' % (
    sample_id, WORKFLOW, WORKFLOW_VERSION, date, call_type, suffix)

# Display name of the entity in Synapse.
# NOTE(review): unlike `path`, this omits the workflow version -- kept as in
# the original; confirm whether the name should mirror the file's basename.
name = '%s.%s.%s.%s%s' % (sample_id, WORKFLOW, date, call_type, suffix)
#Add metadata to files to be uploaded
# Build the File entity under DKFZ_FOLDER; the attributes assigned below are
# the metadata stored together with the file by syn.store().
f = File(path, name=name, parentId=DKFZ_FOLDER)

# -- What the file contains
f.dataType = 'SNV'  #BRIAN change to correct type i.e. SNV, MNV, indel, structural_variation, CNV
f.fileType = 'vcf'  #ditto
f.call_type = call_type
f.reference_build = 'hs37d'

# -- Variant-calling workflow that produced it
f.variant_workflow = WORKFLOW
f.variant_workflow_version = WORKFLOW_VERSION

# -- Sequencing / study context
f.center_name = 'DKFZ'
f.platform = "Illumina HiSeq 2000"
f.sequence_source = "WXS"
f.study = 'PCAWG 2.0'
f.project_code = 'LAML-US'
# The closing quote was missing here, which made the whole script a
# SyntaxError.
# NOTE(review): this differs from project_code ('LAML-US') above -- confirm
# which value is right for this sample.
f.dcc_project_code = 'CESC-US'

# -- Identifiers
f.sample_id = sample_id
f.submitter_donor_id = '0809ba8b-4ab6-4f43-934c-c1ccbc014a7e'
f.original_analysis_id = '3b2c5881-e2a9-4ae9-9abd-bafec7c045f1'

# md5_for_file() returns a hashlib md5 object; store its hex digest so the
# annotation is a plain string rather than an object.
f.file_md5 = synapseclient.utils.md5_for_file(path).hexdigest()

# -- Alignment workflow that produced the input BAMs
f.alignment_workflow_name = 'Workflow_Bundle_BWA (UCSC Implementation)'
f.alignment_workflow_source_url = 'https://github.com/kellrott/tcga_realign'
f.alignment_workflow_version = '2.6.0'
f.alignment_workflow_bundle_url = 'https://s3.amazonaws.com/oicr.workflow.bundles/released-bundles/Workflow_Bundle_BWA_2.6.0_SeqWare_1.0.15.zip'

#Store metadata and file to Synapse
# Passing the Activity links the provenance record to this file version.
f = syn.store(f, activity=provenance)
#Add Description
# Wiki() takes keyword arguments only, so title/owner/markdown are named
# explicitly; the previous positional call raised a TypeError.
# `owner=f` attaches the page to the file entity that was just stored.
wiki = Wiki(title=TITLE, owner=f, markdown=DESCRIPTION)
wiki = syn.store(wiki)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment