Last active
August 29, 2015 14:10
-
-
Save larssono/7bcb7ef8dc488630f9ab to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import synapseclient
from synapseclient import File, Activity, Wiki

# Open the Synapse session used by every store() call in this script.
# login() is called without arguments, so credentials are not supplied here.
# NOTE(review): presumably they come from a cached login / config file -- confirm.
syn = synapseclient.login()
# Synapse ID of the folder that the uploaded file is parented to.
DKFZ_FOLDER = 'syn2898426'

# Workflow name/version; used in the uploaded file's path and annotations.
WORKFLOW = 'oicr-sga'
WORKFLOW_VERSION = '1-0-0'

# Markdown body of the wiki page attached to the uploaded file.
# The text contains apostrophes, so it is written with double quotes; the
# pieces are joined by implicit string concatenation into one paragraph.
# NOTE(review): the text names the SangerPancancerCgpCnIndelSnvStr workflow,
# which does not match WORKFLOW above -- confirm which one is intended.
DESCRIPTION = (
    "This is the variant calling for specimen 669a4076-13de-42dc-895c-85d040422042 from donor 05506f4c-e701-4a9d-ae06-97f066aade43. "
    "The results consist of one or more VCF files plus optional tar.gz files that contain additional file types. "
    "This uses the SangerPancancerCgpCnIndelSnvStr workflow, version 1.0.1 available at https://s3.amazonaws.com/oicr.workflow.bundles/released-bundles/Workflow_Bundle_SangerPancancerCgpCnIndelSnvStr_1.0.1_SeqWare_1.1.0-alpha.5.zip. "
    "This workflow can be created from source, see https://github.com/ICGC-TCGA-PanCancer/SeqWare-CGP-SomaticCore. "
    "For a complete change log see https://github.com/testproject/workflow-test-cancer/blob/1.0.0/workflow-test-cancer/CHANGELOG.md. "
    "Note the 'ANALYSIS_TYPE' is 'REFERENCE_ASSEMBLY' but a better term to describe this analysis is 'SEQUENCE_VARIATION' as defined by the EGA's SRA 1.5 schema. "
    "Please note the reference used for alignment was hs37d, see ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/README_human_reference_20110707 for more information. "
    "Briefly this is the integrated reference sequence from the GRCh37 primary assembly (chromosomal plus unlocalized and unplaced contigs), the rCRS mitochondrial sequence (AC:NC_012920), Human herpesvirus 4 type 1 (AC:NC_007605) and the concatenated decoy sequences (hs37d5cs.fa.gz). "
    "Variant calls may not be present for all contigs in this reference."
)

# Wiki page title. The participant ID is the donor UUID from DESCRIPTION
# (the last character had been cut off).
TITLE = 'TCGA/ICGC PanCancer Donor-Level Variant Calling for Participant 05506f4c-e701-4a9d-ae06-97f066aade43'

# Per-file values; used in the uploaded file's name and annotations.
sample_id = '7d7205e8-d864-11e3-be46-bd5eb93a18bb'
date = '20140718'
call_type = 'somatic'
# Create the provenance log: an Activity listing the inputs that were used and
# the code that was executed, then store it in Synapse.
provenance = Activity(
    name='DKFZ variant calling v.1.1.0',
    description='Variant calling for indels, SNVs, and copy numbers based on the DKFZ pipeline',
    used=[
        'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz',  # BRIAN is this the right reference?
        'ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/sv/breakpoint_assemblies.fasta',
        'http://tcga-data.nci.nih.gov/docs/GAF/GAF_bundle_Feb2011/outputs/TCGA.hg18.Feb2011.gaf',
        'https://gtrepo-dkfz.annailabs.com/cghub/data/analysis/download/1af1586c-05e9-11e4-86b9-9541c49f5d8e',  # GNOS download path for tumor bam; BRIAN will change for each file
        'https://gtrepo-dkfz.annailabs.com/cghub/data/analysis/download/1aeaa38c-05e9-11e4-86b9-9541c49f5d8e',  # GNOS download path for normal bam; BRIAN will change for each file
    ],  # BRIAN any other components that should be explicitly called out?
    executed=[
        'https://github.com/SeqWare/public-workflows/tree/vcf-1.1.0/workflow-DKFZ-bundle',  # BRIAN check on the URL
        # TODO(BRIAN): add the executables that are being run (e.g. varscan); none
        # could be found. The empty-string placeholder is left out because it is
        # neither a URL nor a Synapse ID.
    ],
)
# Store the same object that was just built and keep the stored copy under
# the name later passed as `activity=` when the file is stored.
provenance = syn.store(provenance)
# BRIAN: add a for loop over the files you want to add; a single file is shown.
suffix = '.snv_mnv.vcf.gz'
# `suffix` already begins with '.', so it is appended with no extra separator.
# Local path: <sample_id>.<workflow>_<version>.<date>.<call_type><suffix>
path = '/path/to/file/%s.%s_%s.%s.%s%s' % (sample_id, WORKFLOW, WORKFLOW_VERSION, date, call_type, suffix)
# Synapse display name: same pieces as the path, without the workflow version.
name = '%s.%s.%s.%s%s' % (sample_id, WORKFLOW, date, call_type, suffix)
# Add metadata to the file to be uploaded. Each attribute set on `f` below is
# a per-file value; BRIAN, adjust them for every file that is uploaded.
f = File(path, name=name, parentId=DKFZ_FOLDER)
f.dataType = 'SNV'  # BRIAN change to correct type i.e. SNV, MNV, indel, structural_variation, CNV
f.fileType = 'vcf'  # ditto
f.variant_workflow = WORKFLOW
f.variant_workflow_version = WORKFLOW_VERSION
f.call_type = call_type
f.reference_build = 'hs37d'
f.center_name = 'DKFZ'
f.platform = "Illumina HiSeq 2000"
f.sequence_source = "WXS"
f.project_code = 'LAML-US'
# md5_for_file() returns a hash object; store its hex digest string.
f.file_md5 = synapseclient.utils.md5_for_file(path).hexdigest()
f.study = 'PCAWG 2.0'
f.original_analysis_id = '3b2c5881-e2a9-4ae9-9abd-bafec7c045f1'
# NOTE(review): differs from project_code ('LAML-US') above -- confirm which is right.
f.dcc_project_code = 'CESC-US'
f.sample_id = sample_id
f.submitter_donor_id = '0809ba8b-4ab6-4f43-934c-c1ccbc014a7e'
f.alignment_workflow_name = 'Workflow_Bundle_BWA (UCSC Implementation)'
f.alignment_workflow_source_url = 'https://github.com/kellrott/tcga_realign'
f.alignment_workflow_version = '2.6.0'
f.alignment_workflow_bundle_url = 'https://s3.amazonaws.com/oicr.workflow.bundles/released-bundles/Workflow_Bundle_BWA_2.6.0_SeqWare_1.0.15.zip'
# Store the file and its metadata in Synapse, linked to the provenance record.
f = syn.store(f, activity=provenance)

# Add the description as a wiki page owned by the stored file.
# Wiki takes keyword arguments, so title/owner/markdown are passed by name.
wiki = synapseclient.Wiki(title=TITLE, owner=f, markdown=DESCRIPTION)
wiki = syn.store(wiki)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment