Created
January 17, 2021 04:37
-
-
Save argonism/3d187959b420c5ee3a3dfcea07577d2d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# p: posting list | |
# p must be like [docID_1, docID_2, docID_3, ...] | |
def intersect(p1, p2): | |
answer = [] | |
p1_idx = 0 | |
p2_idx = 0 | |
while len(p1) > p1_idx and len(p2) > p2_idx: | |
if p1[p1_idx] == p2[p2_idx]: | |
answer.append(p1[p1_idx]) | |
p1_idx += 1 | |
p2_idx += 1 | |
else: | |
if p1[p1_idx] < p2[p2_idx]: | |
p1_idx += 1 | |
else: | |
p2_idx += 1 | |
return answer | |
reverse_index = { | |
'python': {3: [1, 2, 3]}, | |
'ruby': {6: [1, 3, 4, 6, 7, 11]}, | |
'perl': {5: [1, 4, 5, 6, 12]} | |
} | |
def intersect2(terms): | |
# 文書頻度で昇順ソート | |
def sort_by_increasing_frequency(terms): | |
sorted_terms = sorted(terms, key=lambda term: list(reverse_index[term].keys())[0]) | |
return sorted_terms | |
# 用語のポスティングリストを返す | |
def postings(term): | |
return list(reverse_index[term].values())[0] | |
# 最初の要素 | |
def first(terms): | |
return terms[0] | |
# 最初以外の要素 | |
def rest(terms): | |
return terms[1:] | |
terms = sort_by_increasing_frequency(terms) | |
result = postings(first(terms)) | |
terms = rest(terms) | |
while not terms == [] and not result == []: | |
result = intersect(result, postings(first(terms))) | |
terms = rest(terms) | |
return result | |
if __name__ == "__main__": | |
test_p1 = [1, 2, 3, 6, 7, 20, 44, 32, 59] | |
test_p2 = [1, 2, 5, 7, 8, 10, 20, 45, 59] | |
answer = intersect(test_p1, test_p2) | |
print(answer) | |
# | |
# expected to be [1, 2, 7, 20, 59] | |
# | |
terms = ['python', 'ruby', 'perl'] | |
match_postings = intersect2(terms) | |
print(match_postings) | |
# | |
# expected to be [1] | |
# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment