Skip to content

Instantly share code, notes, and snippets.

View dustinboswell's full-sized avatar

Dustin Boswell dustinboswell

View GitHub Profile
/// <summary>
/// Renders a fraction as a ten-segment progress bar made of filled (🔵) and
/// empty (⚪) circles, filled circles first.
/// </summary>
/// <param name="percentage">
/// Fraction to display. Values outside [0.0, 1.0] are clamped; NaN is shown as an empty bar.
/// </param>
/// <returns>A string containing exactly ten circles.</returns>
private static string GetPercentageRounds(double percentage) {
    const int totalDots = 10;
    const string filled = "🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵";
    const string empty = "⚪⚪⚪⚪⚪⚪⚪⚪⚪⚪";

    // Math.Clamp returns NaN for a NaN input, and casting NaN to int is
    // unspecified, so map it to an empty bar up front.
    if (double.IsNaN(percentage)) {
        percentage = 0.0;
    }
    percentage = Math.Clamp(percentage, 0.0, 1.0);

    // Math.Ceiling returns a double, so the cast to int must be explicit.
    int numFilled = (int)Math.Ceiling(percentage * totalDots); // A value from 0 to 10 inclusive.

    // 🔵 (U+1F535) is a surrogate pair (two UTF-16 chars) while ⚪ (U+26AA) is a
    // single char, so each run is sliced separately with its own chars-per-glyph
    // stride instead of indexing one combined string by glyph count.
    int charsPerFilled = filled.Length / totalDots;
    int charsPerEmpty = empty.Length / totalDots;
    return filled.Substring(0, numFilled * charsPerFilled)
        + empty.Substring(0, (totalDots - numFilled) * charsPerEmpty);
}
@dustinboswell
dustinboswell / shingleprints.py
Last active December 2, 2021 20:15
Computing shingleprints for a document
def min_max_hashes(text, window=60):
    """Return the smallest and largest shingle hash of ``text``.

    Every ``window``-character substring (shingle) of ``text`` is hashed with
    ``murmurhash``. A text shorter than ``window`` is treated as a single
    shingle covering the whole text, rather than producing an empty hash list
    on which ``min()`` would raise ``ValueError``.

    Args:
        text: Sliceable text to fingerprint.
        window: Shingle length in characters.

    Returns:
        A two-element list ``[min_hash, max_hash]``.
    """
    # Always use at least one start offset so short texts still yield a shingle.
    shingle_count = max(1, len(text) - window + 1)
    hashes = [murmurhash(text[i:i + window]) for i in range(shingle_count)]
    return [min(hashes), max(hashes)]
# Builds shingleprints for a document by combining the min/max shingle hashes
# of its first half with those of its second half.
# NOTE(review): only part of this function is visible here -- the returned
# list is cut off after its second element.
def shingleprints(text):
# NOTE(review): under Python 3, len(text)/2 is a float and cannot be used as a
# slice index; `//` is presumably intended -- confirm the target Python version.
min1, max1 = min_max_hashes(text[0:len(text)/2])
min2, max2 = min_max_hashes(text[len(text)/2:])
# combine pairs, using your favorite hash-value combiner
return [hash_combine(min1, min2),
hash_combine(min1, max2),
@dustinboswell
dustinboswell / dedup_results.py
Last active November 6, 2020 17:58
Deduplicating a result set using shingleprints
# Shingleprints of every document that has been kept so far.
seen_shingleprints = set()
for doc in search_results:
    # Keep a document only when none of its shingleprints has been seen yet;
    # a single shared shingleprint marks it as a duplicate and it is dropped.
    if seen_shingleprints.isdisjoint(doc.shingleprints):
        final_results.append(doc)
        seen_shingleprints.update(doc.shingleprints)
@dustinboswell
dustinboswell / minhash.py
Last active December 2, 2021 19:55
Rough code for comparing document similarity with MinHash
def minhash(text, window=25):  # assume len(text) > 50
    """Return a sketch of ``text``: its 20 smallest distinct shingle hashes.

    Every ``window``-character substring (shingle) of ``text`` is hashed with
    ``murmurhash``. Hashes are de-duplicated *before* the smallest 20 are
    selected, so a shingle that repeats in the text cannot crowd distinct
    values out of the sketch.

    Args:
        text: Sliceable text to sketch.
        window: Shingle length in characters.

    Returns:
        A set of at most 20 hash values. It is empty when ``text`` is shorter
        than ``window``, and smaller than 20 when the text has fewer than 20
        distinct shingle hashes.
    """
    distinct_hashes = {
        murmurhash(text[i:i + window]) for i in range(len(text) - window + 1)
    }
    return set(sorted(distinct_hashes)[:20])
def similarity(text1, text2):
    """Estimate how similar two texts are from their ``minhash`` sketches.

    Args:
        text1: First text; its sketch size is the denominator.
        text2: Second text.

    Returns:
        The number of sketch hashes shared by both texts divided by the size
        of ``text1``'s sketch, or ``0.0`` when ``text1``'s sketch is empty.
    """
    hashes1 = minhash(text1)
    hashes2 = minhash(text2)
    # An empty sketch shares nothing with anything; returning 0.0 also avoids
    # dividing by zero.
    if not hashes1:
        return 0.0
    return len(hashes1 & hashes2) / len(hashes1)
# Sample text: the numbers one through eighteen spelled out, space-separated.
A = (
    "one two three four five six seven eight nine "
    "ten eleven twelve thirteen fourteen fifteen "
    "sixteen seventeen eighteen"
)