import synapseclient
from synapseclient import File, Activity, Wiki
syn = synapseclient.login()

# Synapse folder that receives the uploaded variant files.
DKFZ_FOLDER = 'syn2898426'
WORKFLOW = 'oicr-sga'

# NOTE(review): several references in this text ("available at", "see",
# "For a complete change log see", "see for more information") are not
# followed by a URL -- the links appear to be missing and should be filled in.
# Double-quoted because the text itself contains single quotes.
DESCRIPTION = (
    "This is the variant calling for specimen "
    "669a4076-13de-42dc-895c-85d040422042 from donor "
    "05506f4c-e701-4a9d-ae06-97f066aade43. The results consist of one or "
    "more VCF files plus optional tar.gz files that contain additional "
    "file types. This uses the SangerPancancerCgpCnIndelSnvStr workflow, "
    "version 1.0.1 available at This workflow can be created from source, "
    "see For a complete change log see Note the 'ANALYSIS_TYPE' is "
    "'REFERENCE_ASSEMBLY' but a better term to describe this analysis is "
    "'SEQUENCE_VARIATION' as defined by the EGA's SRA 1.5 schema. Please "
    "note the reference used for alignment was hs37d, see for more "
    "information. Briefly this is the integrated reference sequence from "
    "the GRCh37 primary assembly (chromosomal plus unlocalized and "
    "unplaced contigs), the rCRS mitochondrial sequence (AC:NC_012920), "
    "Human herpesvirus 4 type 1 (AC:NC_007605) and the concatenated decoy "
    "sequences (hs37d5cs.fa.gz). Variant calls may not be present for all "
    "contigs in this reference."
)
# The donor id was truncated by one character; it now matches DESCRIPTION.
TITLE = ('TCGA/ICGC PanCancer Donor-Level Variant Calling for Participant '
         '05506f4c-e701-4a9d-ae06-97f066aade43')
date = '20140718'

# Create Provenance log
# TODO(BRIAN): every entry in `used` / `executed` below is an empty
# placeholder and has to be replaced with the real URL before running.
provenance = Activity(
    name='DKFZ variant calling v.1.1.0',
    description='Variant calling for indels, SNVs, and copy numbers based on the DKFZ pipeline',
    used=[
        '',  # BRIAN is this the right reference?
        '',  # GNOS download path for tumor bam #BRIAN will change for each file
        '',  # GNOS download path for normal bam #BRIAN will change for each file
    ],  # BRIAN any other components that should be explicitly called out?
    executed=[
        '',  # BRIAN check on the URL
        '',  # Brian I was unable to find any of the executables that are being run e.g. varscan
    ],
)
# NOTE(review): an incomplete `prov =` assignment used to follow here. The
# activity is persisted together with the file by
# `syn.store(..., activity=...)` below, so no separate store call is made.


def upload_variant_file(sample_id, call_type, workflow_version,
                        suffix='.snv_mnv.vcf.gz', activity=provenance):
    """Annotate one variant-call file, store it in Synapse and attach a wiki.

    The per-file values that the flat script referenced without ever
    defining them (sample id, call type, workflow version) are explicit
    parameters here.

    Args:
        sample_id: Sample identifier; first component of the file name and
            stored as the ``sample_id`` annotation.
        call_type: Call type component of the file name; stored as the
            ``call_type`` annotation.
        workflow_version: Version of ``WORKFLOW``; part of the local path
            and stored as ``variant_workflow_version``.
        suffix: File-name suffix including its leading dot.
        activity: Provenance ``Activity`` to record with the stored file.

    Returns:
        The stored Synapse ``File`` entity.
    """
    # `suffix` already starts with '.', so it is appended without another
    # separator. The path previously had five placeholders for six values
    # (TypeError) and the name produced a doubled dot.
    path = '/path/to/file/%s.%s_%s.%s.%s%s' % (
        sample_id, WORKFLOW, workflow_version, date, call_type, suffix)
    name = '%s.%s.%s.%s%s' % (sample_id, WORKFLOW, date, call_type, suffix)

    # Add metadata to files to be uploaded
    f = File(path, name=name, parentId=DKFZ_FOLDER)
    f.dataType = 'SNV'  # BRIAN change to correct type i.e. SNV, MNV, indel, structural_variation, CNV
    f.fileType = 'vcf'  # ditto
    f.variant_workflow = WORKFLOW
    f.variant_workflow_version = workflow_version
    f.call_type = call_type
    f.reference_build = 'hs37d'
    f.center_name = 'DKFZ'
    f.platform = "Illumina HiSeq 2000"
    f.project_code = 'LAML-US'
    # md5_for_file returns a hash object; the annotation needs the hex string.
    f.file_md5 = synapseclient.utils.md5_for_file(path).hexdigest()
    # NOTE(review): the attribute that received 'PCAWG 2.0' was lost in the
    # original line; `study` is an assumption -- confirm before uploading.
    f.study = 'PCAWG 2.0'
    f.original_analysis_id = '3b2c5881-e2a9-4ae9-9abd-bafec7c045f1'
    f.dcc_project_code = 'CESC-US'
    f.sample_id = sample_id
    f.submitter_donor_id = '0809ba8b-4ab6-4f43-934c-c1ccbc014a7e'
    f.alignment_workflow_name = 'Workflow_Bundle_BWA (UCSC Implementation)'
    f.alignment_workflow_bundle_url = ''

    # Store metadata and file to Synapse
    f = syn.store(f, activity=activity)

    # Add Description (Wiki takes keyword arguments, not positionals)
    wiki = Wiki(title=TITLE, owner=f, markdown=DESCRIPTION)
    syn.store(wiki)
    return f


# TODO(BRIAN): add a for loop over the files you want to add, calling
# upload_variant_file(sample_id, call_type, workflow_version) for each one.
