Skip to content

Instantly share code, notes, and snippets.

@argonism
Created January 17, 2021 04:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save argonism/3d187959b420c5ee3a3dfcea07577d2d to your computer and use it in GitHub Desktop.
Save argonism/3d187959b420c5ee3a3dfcea07577d2d to your computer and use it in GitHub Desktop.
# p: posting list
# p must be like [docID_1, docID_2, docID_3, ...]
def intersect(p1, p2):
answer = []
p1_idx = 0
p2_idx = 0
while len(p1) > p1_idx and len(p2) > p2_idx:
if p1[p1_idx] == p2[p2_idx]:
answer.append(p1[p1_idx])
p1_idx += 1
p2_idx += 1
else:
if p1[p1_idx] < p2[p2_idx]:
p1_idx += 1
else:
p2_idx += 1
return answer
reverse_index = {
'python': {3: [1, 2, 3]},
'ruby': {6: [1, 3, 4, 6, 7, 11]},
'perl': {5: [1, 4, 5, 6, 12]}
}
def intersect2(terms):
# 文書頻度で昇順ソート
def sort_by_increasing_frequency(terms):
sorted_terms = sorted(terms, key=lambda term: list(reverse_index[term].keys())[0])
return sorted_terms
# 用語のポスティングリストを返す
def postings(term):
return list(reverse_index[term].values())[0]
# 最初の要素
def first(terms):
return terms[0]
# 最初以外の要素
def rest(terms):
return terms[1:]
terms = sort_by_increasing_frequency(terms)
result = postings(first(terms))
terms = rest(terms)
while not terms == [] and not result == []:
result = intersect(result, postings(first(terms)))
terms = rest(terms)
return result
if __name__ == "__main__":
test_p1 = [1, 2, 3, 6, 7, 20, 44, 32, 59]
test_p2 = [1, 2, 5, 7, 8, 10, 20, 45, 59]
answer = intersect(test_p1, test_p2)
print(answer)
#
# expected to be [1, 2, 7, 20, 59]
#
terms = ['python', 'ruby', 'perl']
match_postings = intersect2(terms)
print(match_postings)
#
# expected to be [1]
#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment