This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def minhash(text, window=25, num_hashes=20):
    """Return a bottom-k MinHash sketch of ``text``.

    Every ``window``-character substring (shingle) of ``text`` is hashed
    with ``murmurhash`` and the ``num_hashes`` smallest *distinct* hash
    values are kept.

    Args:
        text: The string to sketch. The original author assumes
            ``len(text) > 50``.
        window: Shingle width in characters.
        num_hashes: Maximum number of hash values kept in the sketch.

    Returns:
        A set of at most ``num_hashes`` hash values. The set is empty when
        ``text`` is shorter than ``window`` (no full-width shingle exists).
    """
    import heapq

    # De-duplicate *before* selecting the smallest values: repeated shingles
    # would otherwise occupy several of the k slots and shrink the sketch.
    distinct_hashes = {
        murmurhash(text[i:i + window])
        for i in range(len(text) - window + 1)
    }
    return set(heapq.nsmallest(num_hashes, distinct_hashes))
def similarity(text1, text2):
    """Estimate how much of ``text1``'s MinHash sketch is shared with ``text2``.

    Both texts are sketched with ``minhash`` and the result is the fraction
    of ``text1``'s sketch values that also occur in ``text2``'s sketch.
    The measure is asymmetric: it is normalised by the size of the first
    sketch only.

    Args:
        text1: Reference text; its sketch size is the denominator.
        text2: Text compared against ``text1``.

    Returns:
        A float in ``[0.0, 1.0]``. Returns ``0.0`` when ``text1`` produces
        an empty sketch, since there is nothing to compare.
    """
    hashes1 = minhash(text1)
    hashes2 = minhash(text2)
    if not hashes1:
        # An empty sketch would otherwise raise ZeroDivisionError.
        return 0.0
    # float() forces true division even where ``/`` on ints truncates.
    return float(len(hashes1 & hashes2)) / len(hashes1)
# Sample text: the numbers one through eighteen, spelled out and
# separated by single spaces.
A = (
    "one two three four five six seven eight nine ten eleven twelve "
    "thirteen fourteen fifteen sixteen seventeen eighteen"
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Drop near-duplicate search results: a result is kept only when none of
# its shingleprints has been contributed by an earlier kept result.
seen_shingleprints = set()
for doc in search_results:
    if seen_shingleprints.isdisjoint(doc.shingleprints):
        # Nothing in common with what was kept so far: keep this result and
        # remember its shingleprints so later look-alikes are skipped.
        final_results.append(doc)
        seen_shingleprints.update(doc.shingleprints)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def min_max_hashes(text, window=60):
    """Return the smallest and largest shingle hash of ``text``.

    Every ``window``-character substring of ``text`` is hashed with
    ``murmurhash``. When ``text`` is shorter than ``window`` the whole text
    is hashed as a single shingle, so the result is always defined.

    Args:
        text: The string to fingerprint.
        window: Shingle width in characters.

    Returns:
        A two-element list ``[min_hash, max_hash]``.
    """
    # Clamp to 0 so a short text still yields one start offset; the slice
    # ``text[0:window]`` then simply covers the entire text. Without this,
    # min()/max() would raise ValueError on an empty list.
    last_start = max(len(text) - window, 0)
    hashes = [murmurhash(text[i:i + window]) for i in range(last_start + 1)]
    return [min(hashes), max(hashes)]
def shingleprints(text): | |
min1, max1 = min_max_hashes(text[0:len(text)/2]) | |
min2, max2 = min_max_hashes(text[len(text)/2:]) | |
# combine pairs, using your favorite hash-value combiner | |
return [hash_combine(min1, min2), | |
hash_combine(min1, max2), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Renders a fraction as a ten-dot progress bar: filled dots first,
/// empty dots after.
/// </summary>
/// <param name="percentage">
/// Fraction completed. Values outside [0, 1] are clamped; NaN is treated as 0.
/// </param>
/// <returns>
/// A string of exactly ten dots, of which <c>percentage * 10</c> (rounded up)
/// are filled.
/// </returns>
private static string GetPercentageRounds(double percentage) {
    const int totalDots = 10;
    const string filledDot = "🔵";
    const string emptyDot = "⚪";
    const string dots = "🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵⚪⚪⚪⚪⚪⚪⚪⚪⚪⚪";

    // Math.Clamp passes NaN through, and casting NaN to int is undefined.
    if (double.IsNaN(percentage)) {
        percentage = 0.0;
    }
    percentage = Math.Clamp(percentage, 0.0, 1.0);

    // Math.Ceiling returns double, so an explicit cast is required.
    int numFilled = (int)Math.Ceiling(percentage * totalDots); // 0 to 10 inclusive.

    // Substring indexes UTF-16 code units, not visible glyphs. The filled dot
    // is a surrogate pair (two chars) while the empty dot is a single char,
    // so offsets are scaled by each dot's actual Length.
    int start = (totalDots - numFilled) * filledDot.Length;
    int length = numFilled * filledDot.Length
               + (totalDots - numFilled) * emptyDot.Length;
    return dots.Substring(start, length);
}