Skip to content

Instantly share code, notes, and snippets.

@ZanSara
Last active December 10, 2021 11:02
Show Gist options
  • Save ZanSara/bb3a5e57129a5b0718dcf8b0666b6e76 to your computer and use it in GitHub Desktop.
Add links to Wikipedia to crawled pages
import os
from time import sleep
from pprint import pprint
import requests
from pathlib import Path
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack.utils import launch_es
from haystack.nodes import DensePassageRetriever
URL_PREFIX = "https://en.wikipedia.org/wiki/"
def compute_meta(filename):
    """Reconstruct Wikipedia metadata (article URL and page title) from a
    crawled file name, and validate the URL with an HTTP GET.

    The crawler mangled article names when saving files (e.g. dropping
    apostrophes, flattening commas to double underscores, prefixing a
    number); this reverses that mangling step by step.

    :param filename: mangled file name of a crawled page.
    :return: ``{"name": filename, "url": ..., "title": ...}`` when the
        reconstructed URL resolves (HTTP status < 400), otherwise just
        ``{"name": filename}``.
    """
    url = filename
    # Parenthesised qualifiers: "_city_" -> "(city)" etc.
    for unit in ["city", "town", "country", "Keeling"]:
        url = url.replace(f"_{unit}_", f"({unit})")
    # Genitives: "_s_" was originally "'s_"
    url = url.replace("_s_", "'s_")
    # Commas were flattened to double underscores
    url = url.replace("__", ",_")
    # Strip the leading numeric prefix (everything before the first "_")
    url = "_".join(url.split("_")[1:])
    # Strip the file extension
    url = ".".join(url.split(".")[:-1])
    # Hard-coded exception the generic rules above cannot recover
    if url == "N_Djamena":
        url = "N'Djamena"
    # Human-readable page title
    title = url.replace("_", " ") + " (Wikipedia)"
    url = URL_PREFIX + url
    # Validate that the reconstructed URL actually exists.
    # FIX: requests has no default timeout, so a stalled request would
    # hang the whole run forever; bound it explicitly.
    response = requests.get(url, timeout=30)
    print(".", end="", flush=True)
    #print(title.ljust(36, " "), url.ljust(36, " "), response.status_code)
    if response.status_code >= 400:
        return {"name": filename}
    return {"name": filename, "url": url, "title": title}
def main():
    """Attach Wikipedia URL/title metadata to every stored document,
    rewrite the index, and recompute dense-passage embeddings.

    Assumes an Elasticsearch instance is already running on localhost
    with the crawled pages stored in the "document" index.
    """
    # Connect to the existing "document" index.
    store = ElasticsearchDocumentStore(
        host="localhost", username="", password="", index="document"
    )

    print("\nUpdating docs:")
    refreshed = []
    documents = store.get_all_documents()
    # Sort by the first word of the page name (the token right after the
    # numeric file-name prefix) so passages of the same page are adjacent.
    documents.sort(key=lambda doc: doc.meta["name"].split("_")[1])

    current_meta = {"name": ""}
    for document in documents:
        # compute_meta() performs an HTTP request, so only recompute when
        # the file name changes; consecutive passages reuse the result.
        if current_meta["name"] != document.meta["name"]:
            print("\n", document.meta["name"], end="", flush=True)
            current_meta = compute_meta(document.meta["name"])
        document.meta = current_meta
        refreshed.append(document)
        print(".", end="", flush=True)

    # Replace the whole index with the re-tagged documents.
    print("\nDeleting all docs...")
    store.delete_documents()
    print("Writing docs...")
    store.write_documents(refreshed)

    # Recompute DPR embeddings so retrieval reflects the new metadata
    # (embed_title=True folds the new titles into the vectors).
    retriever = DensePassageRetriever(
        document_store=store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=2,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True,
    )
    store.update_embeddings(retriever)
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment