Liang-Bo Wang ccwang002

## check_possible_ensembl_releases.py
"""Find the possible Ensembl releases of the given IDs.

The script uses Ensembl Tark APIs to subset the possible Ensembl releases
that cover all the given Ensembl IDs. Usually it can pinpoint the right release
using less than 30 IDs. Feeding more IDs may exceed the API call rate limit.

Known issues:
* The API doesn't handle ENSGR (chrY PAR genes)
"""
import argparse

## table1_driver_capture_genes_chol.csv

          
            Gene
            Cancer
            Tumor suppressor or oncogene prediction (by 20/20+)
            Decision
            Tissue Frequency
            Pancan Frequency
            Consensus Score
            Correlation adusted score
            Novel
            Rescue Notes
            Note about previous publication

            
              ARID1A
              CHOL
              
              official
              11.76%
              6.69%
              2.5
              1.80
              0
              
              Found in 28297679

            
              BAP1
              CHOL
              tsg
              official
              17.65%
              2.14%
              3.5
              2.80
              0
              
              Found in 28297679

            
              EPHA2
              CHOL
              tsg
              official
              11.76%
              1.58%
              2.5
              2.50
              0
              
              0

            
              IDH1
              CHOL
              oncogene
              official
              14.71%
              5.56%
              4.5
              3.80
              0
              
              Found in 28297679

            
              PBRM1
              CHOL
              tsg
              official
              17.65%
              3.73%
              3.5
              2.32
              0
              
              0

## gen_barcode.py
from itertools import combinations, product


def gen_pos_sets_to_sub(barcode, max_sub=1):
    """
    Generate all the possible position combinations (sets) within
    the given maximal number of substitutions.

    Examples:

## Snakefile
from pathlib import Path
from snakemake.remote.GS import RemoteProvider as GSRemoteProvider
GS = GSRemoteProvider()


GS_PREFIX = "lbwang-playground/snakemake_rnaseq"
GENOME_FA =  GS.remote(f"{GS_PREFIX}/griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/chr22_ERCC92.fa")
GENOME_GTF = GS.remote(f"{GS_PREFIX}/griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/genes_chr22_ERCC92.gtf")
HISAT2_INDEX_PREFIX = "hisat2_index/chr22_ERCC92"
FULL_HISAT2_INDEX_PREFIX = "dinglab/lbwang/snakemake_demo/hisat2_index/chr22_ERCC92"

## Snakefile
# The Snakefile that loads raw data and genome reference locally
GENOME_FA = "griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/chr22_ERCC92.fa"
GENOME_GTF = "griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/genes_chr22_ERCC92.gtf"
HISAT2_INDEX_PREFIX = "hisat2_index/chr22_ERCC92"

SAMPLES, *_ = glob_wildcards('griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz')

from pathlib import Path


## remove_unlink_files_and_empty_dirs.py
import pandas as pd
import os
from pathlib import Path

# Export Zotero library as CSV
ZOTERO_LIBRARY_PTH = '/Users/liang/Desktop/My Library.csv'
REFERENCES_ROOT = Path('/Users/liang/Dropbox/References/')


df = pd.read_csv(ZOTERO_LIBRARY_PTH)

## keybase.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                ccwang002
                / keybase.md
            
            
              Created
              August 16, 2016 21:58
            
          
    Keybase proof

I hereby claim:

I am ccwang002 on github.
I am liang2 (https://keybase.io/liang2) on keybase.
I have a public key ASA_ZHu4l91A5bKbiWrZkL-zyJD9mvEUNQpsaW3LjmvcqQo

To claim this, I am signing this object:

  
## tz_convert.py
from datetime import datetime
from pytz import timezone  # pip install pytz


# Set up the remote time
remote_tz = timezone('US/Pacific')                           # US Pacific time, for example
remote_dt = remote_tz.localize(datetime(2015, 5, 1, 14, 0))  # May 1, 2015 2:00 PM PDT (daylight saving time is in effect in May)

# Set up the Taipei local time zone
tpe = timezone('Asia/Taipei')

## bogus_script.py
import numpy as np
rs = np.random.RandomState(seed=5566)
n_conditions = 10

# Here we simulate a complex computation, for example, an analogy of the
# magnitude of a gradient descent step, which is expected to be strictly
# positive. But from the results we find that it is sometimes negative, so we
# wish to find out when and under what conditions our program produces bogus
# output.
#
# This is a case for using pdb with a conditional breakpoint

## 0_Background.md

      
              7 files
            
          
              5 forks
            
          
              0 comments
            
          
              14 stars
            
          
                ccwang002
                / 0_Background.md
            
            
              Last active
              July 31, 2023 02:58
            
              
                Lab Coding Instructions for Beginners
              
          
    [TOC]
Lab Guide for Coding Beginners

亮亮（@ccwang002）| Mar, 2015 | CC 3.0 BY license
如果內容有誤，你可以用任何管道發訊息轟炸我，或用底下的 gist comment 留言。
學習方式

每個檔案都會是一個主題，主題底下會列出一些資源。資源的最後會有一個學習目標，方便讓你評估自己學到什麼程度。學習目標會給一個明確的任務，我盡量讓它能跟（宅宅的）日常生活結合。通常只要完成前一、二個目標就行了，這也不是功課所以不一定要給我看。如果你不介意給我看，我會分享我主觀的建議，但大部份的任務是沒有絕對的正確答案。只要能解決問題都是好方法。
	"""Find the possible Ensembl releases of the given IDs.

	The script uses Ensembl Tark APIs to subset the possible Ensembl releases
	that cover all the given Ensembl IDs. Usually it can pinpoint the right release
	using less than 30 IDs. Feeding more IDs may exceed the API call rate limit.

	Known issues:
	* The API doesn't handle ENSGR (chrY PAR genes)
	"""
	import argparse
Gene	Cancer	Tumor suppressor or oncogene prediction (by 20/20+)	Decision	Tissue Frequency	Pancan Frequency	Consensus Score	Correlation adusted score	Novel	Rescue Notes	Note about previous publication
ARID1A	CHOL		official	11.76%	6.69%	2.5	1.80	0		Found in 28297679
BAP1	CHOL	tsg	official	17.65%	2.14%	3.5	2.80	0		Found in 28297679
EPHA2	CHOL	tsg	official	11.76%	1.58%	2.5	2.50	0		0
IDH1	CHOL	oncogene	official	14.71%	5.56%	4.5	3.80	0		Found in 28297679
PBRM1	CHOL	tsg	official	17.65%	3.73%	3.5	2.32	0		0
	from itertools import combinations, product


	def gen_pos_sets_to_sub(barcode, max_sub=1):
	"""
	Generate all the possible position combinations (sets) within
	the given maximal number of substitutions.

	Examples:
	from pathlib import Path
	from snakemake.remote.GS import RemoteProvider as GSRemoteProvider
	GS = GSRemoteProvider()


	GS_PREFIX = "lbwang-playground/snakemake_rnaseq"
	GENOME_FA = GS.remote(f"{GS_PREFIX}/griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/chr22_ERCC92.fa")
	GENOME_GTF = GS.remote(f"{GS_PREFIX}/griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/genes_chr22_ERCC92.gtf")
	HISAT2_INDEX_PREFIX = "hisat2_index/chr22_ERCC92"
	FULL_HISAT2_INDEX_PREFIX = "dinglab/lbwang/snakemake_demo/hisat2_index/chr22_ERCC92"
	# The Snakefile that loads raw data and genome reference locally
	GENOME_FA = "griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/chr22_ERCC92.fa"
	GENOME_GTF = "griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/genes_chr22_ERCC92.gtf"
	HISAT2_INDEX_PREFIX = "hisat2_index/chr22_ERCC92"

	SAMPLES, *_ = glob_wildcards('griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz')

	from pathlib import Path
	import pandas as pd
	import os
	from pathlib import Path

	# Export Zotero library as CSV
	ZOTERO_LIBRARY_PTH = '/Users/liang/Desktop/My Library.csv'
	REFERENCES_ROOT = Path('/Users/liang/Dropbox/References/')


	df = pd.read_csv(ZOTERO_LIBRARY_PTH)
	from datetime import datetime
	from pytz import timezone # pip install pytz


	# Set up the remote time
	remote_tz = timezone('US/Pacific') # US Pacific time, for example
	remote_dt = remote_tz.localize(datetime(2015, 5, 1, 14, 0)) # May 1, 2015 2:00 PM PDT (daylight saving time is in effect in May)

	# Set up the Taipei local time zone
	tpe = timezone('Asia/Taipei')
	import numpy as np
	rs = np.random.RandomState(seed=5566)
	n_conditions = 10

	# Here we simulate a complex computation, for example, an analogy of the
	# magnitude of a gradient descent step, which is expected to be strictly
	# positive. But from the results we find that it is sometimes negative, so we
	# wish to find out when and under what conditions our program produces bogus
	# output.
	#
	# This is a case for using pdb with a conditional breakpoint