Skip to content

Instantly share code, notes, and snippets.

@bloodbare
Created October 13, 2022 13:07
Show Gist options
  • Save bloodbare/8381bfd0b11d94f2e8592b1101e695b5 to your computer and use it in GitHub Desktop.
Save bloodbare/8381bfd0b11d94f2e8592b1101e695b5 to your computer and use it in GitHub Desktop.
NucliaDB PloneConf 2022 Examples
# Helper targets for the NucliaDB PloneConf 2022 demo.
# None of these targets produce a file with their own name, so declare them
# phony: make then always runs the recipe, even if a same-named file exists.
.PHONY: clear-oss clear-cloud nucliadb-cloud nucliadb-oss

# Remove the data directories used by the `nucliadb-oss` target.
clear-oss:
	rm -rf blob
	rm -rf main
	rm -rf node

# Remove the data directories used by the `nucliadb-cloud` target.
clear-cloud:
	rm -rf blobnuclia
	rm -rf mainnuclia
	rm -rf nodenuclia

# Start NucliaDB with a zone and key, storing data in the *nuclia directories.
# NOTE(review): XXX looks like a placeholder for a real API key -- replace it
# before running.
nucliadb-cloud:
	nucliadb --zone europe-1 --maindb mainnuclia --blob blobnuclia --node nodenuclia --log INFO --key XXX

# Start NucliaDB without a zone/key, storing data in main/, blob/ and node/.
nucliadb-oss:
	nucliadb --maindb main --blob blob --node node --log INFO
# Query Uploaded conference talks on local NucliaDB
# http://localhost:8080/widget
# Widget will only show fulltext search
from sentence_transformers import SentenceTransformer # type: ignore
def show_result(data):
    """Print the semantic and fuzzy hits contained in a search response.

    Args:
        data: Decoded JSON body returned by the ``/search`` endpoint. Entries
            of ``data["sentences"]["results"]`` are printed under the
            SEMANTIC heading and entries of ``data["paragraphs"]["results"]``
            under the FUZZY heading, one ``field: text`` line per entry.
            A missing or null section is treated as having no results.
    """

    def _results(section):
        # Tolerate absent/null sections instead of failing on None.get(...).
        return (data.get(section) or {}).get("results") or []

    print(" - SEMANTIC")
    for sentence in _results("sentences"):
        print(f"{sentence.get('field')}: {sentence.get('text')}")
    print(" - FUZZY")
    for paragraph in _results("paragraphs"):
        print(f"{paragraph.get('field')}: {paragraph.get('text')}")
from nucliadb_client.client import NucliaDBClient
import requests

# Sentence-embedding model used to turn each query string into a vector.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Each QUERYn is sent both as plain text ("query") and as its embedding
# ("vector"), so one request yields both kinds of hits printed by show_result.
QUERY1 = "How to migrate lots of pages super fast"
QUERY1V = model.encode([QUERY1])[0]
QUERY2 = "moving from a draw to a project"
QUERY2V = model.encode([QUERY2])[0]
QUERY3 = "Compatible framework with actual API"
QUERY3V = model.encode([QUERY3])[0]

# Create NucliaDB Client
client = NucliaDBClient(host="localhost", grpc=8030, http=8080, train=8031)
kb = client.get_kb(slug="ploneconf2022")
if kb is None:
    # Without this guard the script would die on `kb.kbid` with an opaque
    # AttributeError; the knowledge box is created by the talk-upload script.
    raise SystemExit("Knowledge box 'ploneconf2022' not found; upload the talks first")

search_url = f"http://localhost:8080/api/v1/kb/{kb.kbid}/search"

# Run the same search request for every (text, vector) pair.
for query, vector in ((QUERY1, QUERY1V), (QUERY2, QUERY2V), (QUERY3, QUERY3V)):
    resp = requests.post(
        search_url,
        json={
            "vector": vector.tolist(),
            "query": query,
            "min_score": 0.2,
            "page_size": 1,
        },
        headers={"X-NUCLIADB-ROLES": "READER"},
    )
    # Surface HTTP errors here rather than as a confusing failure while
    # printing an error payload.
    resp.raise_for_status()
    print(query)
    show_result(resp.json())
    print(" ---------- ")
# Script to upload a file to NucliaDB using Nuclia.cloud service
# NucliaDB needs to be running with an API key
# You can download the example file at:
# https://www.quintagroup.com/cms/plone/plone_brochure.pdf/@@download/file/Plone_brochure.pdf
# Create NucliaDB Client
from nucliadb_client.client import NucliaDBClient
import requests
import base64
client = NucliaDBClient(host="localhost", grpc=8030, http=8080, train=8031)
# Reuse the knowledge box if it already exists, otherwise create it.
kb = client.get_kb(slug="ploneconf2022_nuclia")
if kb is None:
    kb = client.create_kb(slug="ploneconf2022_nuclia", title="Plone Conference 2022")

# Read the PDF inside a context manager so the file handle is closed promptly.
with open("Plone_brochure.pdf", "rb") as pdf_file:
    data = pdf_file.read()

# The raw bytes are the request body; the file name travels base64-encoded
# in the X-FILENAME header.
resp = requests.post(
    f"http://localhost:8080/api/v1/kb/{kb.kbid}/upload",
    data=data,
    headers={
        "X-FILENAME": base64.b64encode("Plone_brochure.pdf".encode()),
        "X-NUCLIADB-ROLES": "WRITER",
    },
)
# Fail loudly instead of reporting success for a rejected upload.
resp.raise_for_status()
print("Done created resources")
# Search on widget: http://localhost:8080/widget
# Example queries to try:
#   quines bases de dades usa el programari   (Catalan: "which databases does the software use")
#   SQL injection
# Upload plone conference talks to NucliaDB
from typing import Any, Dict, List
import requests
# LOAD a simple Vectorizer
from sentence_transformers import SentenceTransformer # type: ignore
# Module-level embedding model; used further down to encode each talk's
# title and body into the vectors attached to the uploaded resource.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
print("Loaded vectorizer")
# Create NucliaDB Client
from nucliadb_client.client import NucliaDBClient
from nucliadb.models import (
CreateResourcePayload,
InputMetadata,
TextField,
UserMetadata,
Classification,
TextFormat,
)
from devtools import debug
from nucliadb.models.labels import LabelSet, Label, LabelSetKind
from nucliadb_protos.resources_pb2 import FieldType
from nucliadb_protos.utils_pb2 import Vector
from bs4 import BeautifulSoup # type: ignore
client = NucliaDBClient(host="localhost", grpc=8030, http=8080, train=8031)
# Reuse the knowledge box if it already exists, otherwise create it.
kb = client.get_kb(slug="ploneconf2022")
if kb is None:
    kb = client.create_kb(slug="ploneconf2022", title="Plone Conference 2022")

# Plone Conf audiences: fetch the vocabulary and turn every titled item
# into a label of the "audiencies" label set.
AUDIENCES = requests.get(
    "https://2022.ploneconf.org/++api++/@vocabularies/ploneconf.core.vocabularies.slot_audiences"
).json()
labels = [
    Label(title=item.get("title"))
    for item in AUDIENCES.get("items", [])
    if item.get("title")
]
ls_payload = LabelSet(title="Audiencies", kind=[LabelSetKind.RESOURCES], labels=labels)
debug(ls_payload)
resp = requests.post(
    f"http://localhost:8080/api/v1/kb/{kb.kbid}/labelset/audiencies",
    headers={"X-NUCLIADB-ROLES": "WRITER"},
    json=ls_payload.dict(),
)
# Explicit check instead of `assert`: assertions are stripped under
# `python -O`, which would let a failed upload pass silently.
if resp.status_code != 200:
    raise RuntimeError(
        f"Uploading label set failed: HTTP {resp.status_code} {resp.text}"
    )
print("Uploaded Ontology")
# Pause so the presenter can inspect the label set before talks are uploaded.
input("Enter to continue")
# Get all talks
DEBUG = False
POSTS: List[Dict[str, Any]] = requests.get(
    "https://2022.ploneconf.org/++api++/@talks"
).json()
uploaded = 0  # talks actually stored (talks without a body are skipped)
for post in POSTS:
    title = post.get("title")
    # The listing only carries summary data; fetch the full talk through the
    # REST API by rewriting its public URL to the ++api++ endpoint.
    talk_payload = requests.get(
        post.get("@id").replace(
            "https://2022.ploneconf.org/", "https://2022.ploneconf.org/++api++/"
        )
    ).json()
    if talk_payload is None:
        continue
    body_text = talk_payload.get("text", {})
    if body_text is None:
        continue
    body = body_text.get("data")
    if body is None:
        continue

    payload = CreateResourcePayload()
    payload.title = title
    payload.icon = "ploneconf/talk"
    payload.metadata = InputMetadata()
    payload.metadata.language = "en"
    payload.slug = talk_payload.get("id")
    payload.usermetadata = UserMetadata()
    # `session_audience` may be absent or null; treat that as "no audiences"
    # instead of failing when iterating over None.
    for audience in talk_payload.get("session_audience") or []:
        payload.usermetadata.classifications.append(
            Classification(labelset="audiencies", label=audience.get("title"))
        )
    field = TextField(body=body)
    field.format = TextFormat.HTML
    payload.texts["body"] = field
    if DEBUG:
        debug(payload)
        input("Enter to continue")

    # Creating the resource stores the payload; the extracted text and the
    # vectors for the body are added explicitly below.
    resource = kb.create_resource(payload)

    # Now add index information: strip the HTML markup and index plain text.
    tree = BeautifulSoup(body, features="html.parser")
    good_text = tree.get_text().replace("\n", " \n ")
    resource.add_text("body", FieldType.TEXT, good_text)  # type: ignore

    # Embed the same plain text that was indexed (not the raw HTML), so the
    # vector describes the talk's content and its offsets refer to the
    # indexed text rather than to the longer HTML source.
    embeddings = model.encode([title, good_text])

    # Title: a single vector spanning the whole title.
    vector = Vector(
        start=0,
        end=len(title),
        start_paragraph=0,
        end_paragraph=len(title),
    )
    vector.vector.extend(embeddings[0])
    resource.add_vectors(
        "title",
        FieldType.GENERIC,  # type: ignore
        [vector],
    )

    # Body: a single vector spanning the whole indexed text.
    vector = Vector(
        start=0,
        end=len(good_text),
        start_paragraph=0,
        end_paragraph=len(good_text),
    )
    vector.vector.extend(embeddings[1])
    resource.add_vectors(
        "body",
        FieldType.TEXT,  # type: ignore
        [vector],
    )
    resource.sync_commit()
    uploaded += 1
    print(f"Uploaded {title}")
# Report what was really stored, not the number of talks in the listing.
print(f"Done created {uploaded} resources")
# Search at http://localhost:8080/widget for Victor
sentence_transformers
nucliadb-client
requests
nucliadb
bs4
devtools
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment