Skip to content

Instantly share code, notes, and snippets.

@leipzig
Created December 8, 2023 07:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save leipzig/639442de640427e883db722a9f8273d0 to your computer and use it in GitHub Desktop.
Save leipzig/639442de640427e883db722a9f8273d0 to your computer and use it in GitHub Desktop.
import tiledb
import tiledbvcf
import numpy as np
cloud_uri = "tiledb://TileDB-Inc/vcf-1kg-dragen-v376"
# This is the config I need to pass
cloud_config = tiledb.Config()
cloud_cfg = tiledbvcf.ReadConfig(tiledb_config=cloud_config)
# Open the VCF dataset in read mode passing the config
ds = tiledbvcf.Dataset(uri=cloud_uri, mode="r", cfg=cloud_cfg)
seenit = {}
newkeys_list = []
samples = ds.samples()
# 500 randomly sampled samples no replacment
random_samples = np.random.choice(samples, 500, replace=False)
for sample in random_samples:
# read and return Pandas dataframe
df = ds.read(
samples=[sample],
regions=["chr21:1-50000000"],
attrs=["contig", "pos_start", "alleles", "fmt_GT"],
)
# add a new column with chr:pos:ref:alt
df["chrposrefalt"] = (
df["contig"].astype(str)
+ ":"
+ df["pos_start"].astype(str)
+ ":"
+ df["alleles"].str[0]
+ ":"
+ df["alleles"].str[1]
)
# count how many new keys we have
newkeys = len(df[~df["chrposrefalt"].isin(seenit)])
# add the new keys to the seenit dict
seenit.update(dict.fromkeys(df["chrposrefalt"].values))
# append newkeys to the list
newkeys_list.append(newkeys)
print(newkeys_list)
#do this in ggplot
#df<-data.frame(x=1:500,y=listofnumbers)
#plot these on the y axis and their corresponding x axis values on the x axis, y axis in log scale, add a smooth line
#label the x axis sample and the y axis number of novel variants
#ggplot(df,aes(x=x,y=y))+geom_point()+scale_y_log10()+geom_smooth(method="loess")+xlab("Sample")+ylab("Number of novel variants")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment