Skip to content

Instantly share code, notes, and snippets.

@tice0-2
Created May 16, 2019 17:34
Show Gist options
  • Save tice0-2/9a5f9be98caeb27569c58cd20b2e204b to your computer and use it in GitHub Desktop.
Save tice0-2/9a5f9be98caeb27569c58cd20b2e204b to your computer and use it in GitHub Desktop.
import numpy as np
import math
from collections import Counter
# NOTE: this code assumes that all bases are uppercase (A, C, G, T).
def kmers(coll, k):
    """Yield every contiguous window of length ``k`` from ``coll``, left to right.

    Args:
        coll: A sliceable object with a length (e.g. a ``str`` of bases).
        k: Window length; must be at least 1.

    Yields:
        ``coll[i:i + k]`` for each start index ``i`` from 0 through
        ``len(coll) - k``. Nothing is yielded when ``coll`` is shorter
        than ``k``.

    Raises:
        ValueError: If ``k`` is less than 1. Because this is a generator, the
            error surfaces when iteration starts, not when ``kmers`` is called.
    """
    if k < 1:
        # A non-positive k would otherwise produce empty or ragged slices
        # (k == 0 yields len(coll) + 1 empty windows) instead of real k-mers.
        raise ValueError("k must be a positive integer, got %r" % (k,))
    for start in range(len(coll) - k + 1):
        yield coll[start:start + k]
def encode(kmer):
    """Encode a k-mer as an integer by reading it as a base-4 number.

    Each base maps to a digit (A=0, C=1, T=2, G=3) and the base at index
    ``i`` contributes ``digit * 4 ** i``, so the first character is the least
    significant digit. Distinct k-mers of the same length therefore get
    distinct codes in ``range(4 ** len(kmer))``.

    Args:
        kmer: Iterable of upper-case base characters drawn from ``ACTG``.

    Returns:
        The integer code; 0 for an empty k-mer.

    Raises:
        KeyError: If ``kmer`` contains a character other than A, C, T or G.
    """
    digits = {'A': 0, 'C': 1, 'T': 2, 'G': 3}
    code = 0
    for i, base in enumerate(kmer):
        # Scale the digit by its place value. Adding the two instead would make
        # distinct k-mers collide (e.g. "AC" and "CA") and map "A" to 1.
        code += digits[base] * 4 ** i
    return code
def mk_vec(coll, k):
    """Build the k-mer count vector of a sequence.

    Args:
        coll: Sequence to scan (e.g. a ``str`` of bases). Its length-``k``
            windows must be hashable, since they are used as ``Counter`` keys.
        k: Window length, passed through to ``kmers``.

    Returns:
        A ``Counter`` mapping each window produced by ``kmers(coll, k)`` to the
        number of times it occurs.
    """
    # Counter tallies an iterable directly, so no manual increment loop is needed.
    return Counter(kmers(coll, k))
def euclidean_dist(lhs, rhs):
    """Return the Euclidean (L2) distance between two sparse count vectors.

    Args:
        lhs: Mapping from key (e.g. a k-mer) to a numeric count.
        rhs: Mapping of the same kind.

    Returns:
        The square root of the sum of squared per-key differences, as a
        ``float``. A key absent from one mapping counts as 0 there.
    """
    keys = set(lhs) | set(rhs)
    # .get(key, 0) rather than indexing: plain dicts would raise KeyError on a
    # key present in only one operand, while Counter results are unchanged.
    squared_total = sum(
        (lhs.get(key, 0) - rhs.get(key, 0)) ** 2 for key in keys
    )
    return math.sqrt(squared_total)
def manhattan_dist(lhs, rhs):
    """Return the Manhattan (L1) distance between two sparse count vectors.

    Args:
        lhs: Mapping from key (e.g. a k-mer) to a numeric count.
        rhs: Mapping of the same kind.

    Returns:
        The sum of absolute per-key differences. A key absent from one mapping
        counts as 0 there; two empty mappings give 0.
    """
    keys = set(lhs) | set(rhs)
    # .get(key, 0) rather than indexing: plain dicts would raise KeyError on a
    # key present in only one operand, while Counter results are unchanged.
    return sum(abs(lhs.get(key, 0) - rhs.get(key, 0)) for key in keys)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment