Skip to content

Instantly share code, notes, and snippets.

@simon-anders
Last active January 14, 2024 19:11
Show Gist options
  • Save simon-anders/1963b2d1e94f8665b69a91253900dd89 to your computer and use it in GitHub Desktop.
Save simon-anders/1963b2d1e94f8665b69a91253900dd89 to your computer and use it in GitHub Desktop.
import gzip
import random

import scipy.sparse
# Print the local sequence context of 100 randomly drawn entries of the CpG
# matrix, with every "CG" dinucleotide upper-cased and everything else
# lower-cased, so one can eyeball how the stored indices line up with the
# reference sequence.

# Load the FASTA file for chromosome 10 from GRCm38 into a single bytes object.
with gzip.open("data/Mus_musculus.GRCm38.dna.chromosome.10.fa.gz", "rb") as f:
    firstline = f.readline()
    # The first line must be the FASTA header. Raise explicitly instead of
    # using `assert`, so the check is not stripped when running with -O.
    if not firstline.startswith(b">"):
        raise ValueError("expected a FASTA header line starting with '>'")
    # Remaining lines are sequence; drop the line breaks and concatenate.
    chrom_seq = b"".join(line.rstrip() for line in f)

# This file comes from papagei:mnt/raid/scnmt_data/CpG_filtered.
# Converting to COO gives direct access to the row index of every stored
# entry via `cpg.row`.
cpg = scipy.sparse.load_npz("data/CpG_10.npz").tocoo()

for _ in range(100):
    # NOTE(review): rows are presumably genomic positions on chromosome 10;
    # whether they are 0- or 1-based is what this printout helps to check.
    p = random.choice(cpg.row)
    # Five-base window around p. Clamp the start at 0 so that a position
    # closer than two bases to the sequence start does not turn into a
    # negative index that slices from the end of the chromosome.
    s = chrom_seq[max(p - 2, 0) : p + 3]
    s = s.lower().replace(b"cg", b"CG")
    print(s)
b'aCGga'
b'CGaca'
b'CGaga'
b'aCGct'
b'CGaca'
b'tCGag'
b'tCGtc'
b'CGttt'
b'CGact'
b'CGtgt'
b'CGtga'
b'tCGag'
b'aCGga'
b'aCGtg'
b'CGtga'
b'CGaag'
b'aCGga'
b'aCGct'
b'aCGgt'
b'CGtct'
b'aCGtt'
b'CGtcc'
b'CGtga'
b'CGtcc'
b'aCGtg'
b'tCGat'
b'tCGtc'
b'CGaaa'
b'aCGtg'
b'tCGga'
b'CGtaa'
b'CGacc'
b'tCGct'
b'tCGag'
b'CGtag'
b'CGtgg'
b'aCGcc'
b'tCGaa'
b'CGatg'
b'CGaca'
b'CGtag'
b'CGtgg'
b'CGacc'
b'tCGgc'
b'CGatg'
b'CGtta'
b'tCGta'
b'tCGtg'
b'CGtct'
b'tCGtt'
b'CGttc'
b'CGtgg'
b'CGatc'
b'aCGct'
b'CGaaa'
b'aCGta'
b'CGtgt'
b'tCGtc'
b'aCGgt'
b'aCGga'
b'CGttt'
b'CGtgc'
b'aCGtg'
b'aCGct'
b'CGacc'
b'CGata'
b'CGttt'
b'CGtat'
b'CGtgg'
b'aCGtg'
b'CGact'
b'CGagg'
b'CGaag'
b'aCGag'
b'CGagg'
b'CGtgg'
b'CGtaa'
b'CGtgt'
b'aCGta'
b'CGtct'
b'CGtct'
b'CGaat'
b'CGtgt'
b'tCGgg'
b'tCGct'
b'tCGaa'
b'CGttt'
b'CGatg'
b'tCGaa'
b'CGtat'
b'tCGtc'
b'aCGaa'
b'tCGga'
b'CGagc'
b'CGacc'
b'tCGag'
b'aCGta'
b'aCGtt'
b'aCGtg'
b'CGtat'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment