Created
October 13, 2022 13:07
-
-
Save bloodbare/8381bfd0b11d94f2e8592b1101e695b5 to your computer and use it in GitHub Desktop.
NucliaDB PloneConf 2022 Examples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Helper targets for running the NucliaDB examples locally.
#
# API key passed to the cloud-backed run. Override it on the command line:
#   make nucliadb-cloud NUCLIA_KEY=<your key>
NUCLIA_KEY ?= XXX

# None of these targets produce a file with their own name.
.PHONY: clear-oss clear-cloud nucliadb-cloud nucliadb-oss

# Remove the data directories written by `nucliadb-oss`.
clear-oss:
	rm -rf blob
	rm -rf main
	rm -rf node

# Remove the data directories written by `nucliadb-cloud`.
clear-cloud:
	rm -rf blobnuclia
	rm -rf mainnuclia
	rm -rf nodenuclia

# Run NucliaDB with a zone and an API key, using the *nuclia directories.
nucliadb-cloud:
	nucliadb --zone europe-1 --maindb mainnuclia --blob blobnuclia --node nodenuclia --log INFO --key $(NUCLIA_KEY)

# Run NucliaDB without a zone or API key, using the plain directories.
nucliadb-oss:
	nucliadb --maindb main --blob blob --node node --log INFO
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Query Uploaded conference talks on local NucliaDB | |
# http://localhost:8080/widget
# Widget will only show fulltext search | |
from sentence_transformers import SentenceTransformer # type: ignore | |
def show_result(data):
    """Print the semantic and fuzzy hits contained in a search response.

    Args:
        data: Decoded JSON body of a search request. The ``sentences`` and
            ``paragraphs`` entries are each read as a mapping holding a
            ``results`` list whose items expose ``field`` and ``text``.

    A section that is absent or ``None`` is printed as an empty section
    instead of raising ``AttributeError``.
    """
    # `or {}` / `or []` cover both a missing key and an explicit null value.
    sentences = (data.get("sentences") or {}).get("results") or []
    paragraphs = (data.get("paragraphs") or {}).get("results") or []

    print(" - SEMANTIC")
    for sentence in sentences:
        print(f"{sentence.get('field')}: {sentence.get('text')}")

    print(" - FUZZY")
    for paragraph in paragraphs:
        print(f"{paragraph.get('field')}: {paragraph.get('text')}")
# Sentence-embedding model used to turn each query into a vector.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

QUERY1 = "How to migrate lots of pages super fast"
QUERY1V = model.encode([QUERY1])[0]
QUERY2 = "moving from a draw to a project"
QUERY2V = model.encode([QUERY2])[0]
QUERY3 = "Compatible framework with actual API"
QUERY3V = model.encode([QUERY3])[0]

# Create NucliaDB Client
from nucliadb_client.client import NucliaDBClient

client = NucliaDBClient(host="localhost", grpc=8030, http=8080, train=8031)
kb = client.get_kb(slug="ploneconf2022")
if kb is None:
    # Fail with a clear message instead of an AttributeError on `kb.kbid`.
    raise SystemExit("Knowledge box 'ploneconf2022' not found; upload the talks first")

import requests

SEARCH_URL = f"http://localhost:8080/api/v1/kb/{kb.kbid}/search"

# Run the same search request for every (text, vector) pair and print the hits.
for query, query_vector in (
    (QUERY1, QUERY1V),
    (QUERY2, QUERY2V),
    (QUERY3, QUERY3V),
):
    resp = requests.post(
        SEARCH_URL,
        json={
            "vector": query_vector.tolist(),
            "query": query,
            "min_score": 0.2,
            "page_size": 1,
        },
        headers={"X-NUCLIADB-ROLES": "READER"},
        timeout=30,
    )
    # Surface HTTP errors here rather than while parsing an error payload.
    resp.raise_for_status()
    print(query)
    show_result(resp.json())
    print(" ---------- ")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to upload a file to NucliaDB using Nuclia.cloud service | |
# NucliaDB needs to be running with an API key | |
# You can download the example file at: | |
# https://www.quintagroup.com/cms/plone/plone_brochure.pdf/@@download/file/Plone_brochure.pdf | |
# Create NucliaDB Client | |
from nucliadb_client.client import NucliaDBClient | |
import requests | |
import base64 | |
client = NucliaDBClient(host="localhost", grpc=8030, http=8080, train=8031)

# Reuse the knowledge box when it already exists, otherwise create it.
kb = client.get_kb(slug="ploneconf2022_nuclia")
if kb is None:
    kb = client.create_kb(slug="ploneconf2022_nuclia", title="Plone Conference 2022")

FILENAME = "Plone_brochure.pdf"

# Read the file through a context manager so the handle is always closed.
with open(FILENAME, "rb") as pdf_file:
    data = pdf_file.read()

resp = requests.post(
    f"http://localhost:8080/api/v1/kb/{kb.kbid}/upload",
    data=data,
    headers={
        # The file name travels base64-encoded; decode to str for the header.
        "X-FILENAME": base64.b64encode(FILENAME.encode()).decode(),
        "X-NUCLIADB-ROLES": "WRITER",
    },
    timeout=60,
)
# Do not report success when the upload was rejected.
resp.raise_for_status()
print("Done created resources")
# Search on widget: http://localhost:8080/widget | |
# quines bases de dades usa el programari | |
# SQL injection |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Upload plone conference talks to NucliaDB | |
from typing import Any, Dict, List | |
import requests | |
# Load a simple vectorizer: a sentence-embedding model, referenced below as `model`.
from sentence_transformers import SentenceTransformer  # type: ignore

VECTORIZER_NAME = "paraphrase-MiniLM-L6-v2"
model = SentenceTransformer(VECTORIZER_NAME)
print("Loaded vectorizer")
# Create NucliaDB Client | |
from nucliadb_client.client import NucliaDBClient | |
from nucliadb.models import ( | |
CreateResourcePayload, | |
InputMetadata, | |
TextField, | |
UserMetadata, | |
Classification, | |
TextFormat, | |
) | |
from devtools import debug | |
from nucliadb.models.labels import LabelSet, Label, LabelSetKind | |
from nucliadb_protos.resources_pb2 import FieldType | |
from nucliadb_protos.utils_pb2 import Vector | |
from bs4 import BeautifulSoup # type: ignore | |
client = NucliaDBClient(host="localhost", grpc=8030, http=8080, train=8031)

# Reuse the knowledge box when it already exists, otherwise create it.
kb = client.get_kb(slug="ploneconf2022")
if kb is None:
    kb = client.create_kb(slug="ploneconf2022", title="Plone Conference 2022")

# Plone Conf audiences: fetch the vocabulary and turn each titled item into a label.
audiences_resp = requests.get(
    "https://2022.ploneconf.org/++api++/@vocabularies/ploneconf.core.vocabularies.slot_audiences"
)
audiences_resp.raise_for_status()
AUDIENCES = audiences_resp.json()
labels = [
    Label(title=item.get("title"))
    for item in AUDIENCES.get("items", [])
    if item.get("title")
]
ls_payload = LabelSet(title="Audiencies", kind=[LabelSetKind.RESOURCES], labels=labels)
debug(ls_payload)

resp = requests.post(
    f"http://localhost:8080/api/v1/kb/{kb.kbid}/labelset/audiencies",
    headers={"X-NUCLIADB-ROLES": "WRITER"},
    json=ls_payload.dict(),
)
# Explicit check instead of `assert`, which is stripped under `python -O`.
if resp.status_code != 200:
    raise RuntimeError(
        f"Labelset upload failed with HTTP {resp.status_code}: {resp.text}"
    )
print("Uploaded Ontology")
input("Enter to continue")
# Get all talks
DEBUG = False
POSTS: List[Dict[str, Any]] = requests.get(
    "https://2022.ploneconf.org/++api++/@talks"
).json()

# Number of talks actually uploaded; talks without a body are skipped below.
uploaded = 0

for post in POSTS:
    title = post.get("title")

    # Fetch the full talk through the REST API variant of its public URL.
    talk_payload = requests.get(
        post.get("@id").replace(
            "https://2022.ploneconf.org/", "https://2022.ploneconf.org/++api++/"
        )
    ).json()
    if talk_payload is None:
        continue
    body_text = talk_payload.get("text", {})
    if body_text is None:
        continue
    body = body_text.get("data")
    if body is None:
        continue

    payload = CreateResourcePayload()
    payload.title = title
    payload.icon = "ploneconf/talk"
    payload.metadata = InputMetadata()
    payload.metadata.language = "en"
    payload.slug = talk_payload.get("id")
    payload.usermetadata = UserMetadata()
    # A talk may carry no audience at all; treat a missing/null value as empty.
    for audience in talk_payload.get("session_audience") or []:
        payload.usermetadata.classifications.append(
            Classification(labelset="audiencies", label=audience.get("title"))
        )

    field = TextField(body=body)
    field.format = TextFormat.HTML
    payload.texts["body"] = field

    if DEBUG:
        debug(payload)
        input("Enter to continue")

    # Only title is automated indexing
    resource = kb.create_resource(payload)

    # Now add index information: the plain text extracted from the HTML body.
    tree = BeautifulSoup(body, features="html.parser")
    good_text = tree.get_text().replace("\n", " \n ")
    resource.add_text("body", FieldType.TEXT, good_text)  # type: ignore

    # Embed the cleaned text that was indexed above, not the raw HTML markup,
    # so the vector and its offsets describe the same string.
    embeddings = model.encode([title, good_text])

    # Title
    vector = Vector(
        start=0,
        end=len(title),
        start_paragraph=0,
        end_paragraph=len(title),
    )
    vector.vector.extend(embeddings[0])
    resource.add_vectors(
        "title",
        FieldType.GENERIC,  # type: ignore
        [vector],
    )

    # Body
    vector = Vector(
        start=0,
        end=len(good_text),
        start_paragraph=0,
        end_paragraph=len(good_text),
    )
    vector.vector.extend(embeddings[1])
    resource.add_vectors(
        "body",
        FieldType.TEXT,  # type: ignore
        [vector],
    )

    resource.sync_commit()
    uploaded += 1
    print(f"Uploaded {title}")

# Report the talks that were really created, not the size of the listing.
print(f"Done created {uploaded} resources")
# Search at http://localhost:8080/widget for Victor |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sentence_transformers | |
nucliadb-client | |
requests | |
nucliadb | |
bs4 | |
devtools |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment