Skip to content

Instantly share code, notes, and snippets.

@hsm207
Created February 18, 2023 14:28
Show Gist options
  • Save hsm207/ffb2122acade111c081b120b1db367e3 to your computer and use it in GitHub Desktop.
Save hsm207/ffb2122acade111c081b120b1db367e3 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'hostname': 'http://[::]:8080',\n",
" 'modules': {'ref2vec-centroid': {},\n",
" 'text2vec-transformers': {'model': {'_name_or_path': 'sentence-transformers/msmarco-distilroberta-base-v2',\n",
" 'add_cross_attention': False,\n",
" 'architectures': ['RobertaModel'],\n",
" 'attention_probs_dropout_prob': 0.1,\n",
" 'bad_words_ids': None,\n",
" 'bos_token_id': 0,\n",
" 'chunk_size_feed_forward': 0,\n",
" 'decoder_start_token_id': None,\n",
" 'diversity_penalty': 0,\n",
" 'do_sample': False,\n",
" 'early_stopping': False,\n",
" 'encoder_no_repeat_ngram_size': 0,\n",
" 'eos_token_id': 2,\n",
" 'finetuning_task': None,\n",
" 'forced_bos_token_id': None,\n",
" 'forced_eos_token_id': None,\n",
" 'gradient_checkpointing': False,\n",
" 'hidden_act': 'gelu',\n",
" 'hidden_dropout_prob': 0.1,\n",
" 'hidden_size': 768,\n",
" 'id2label': {'0': 'LABEL_0', '1': 'LABEL_1'},\n",
" 'initializer_range': 0.02,\n",
" 'intermediate_size': 3072,\n",
" 'is_decoder': False,\n",
" 'is_encoder_decoder': False,\n",
" 'label2id': {'LABEL_0': 0, 'LABEL_1': 1},\n",
" 'layer_norm_eps': 1e-05,\n",
" 'length_penalty': 1,\n",
" 'max_length': 20,\n",
" 'max_position_embeddings': 514,\n",
" 'min_length': 0,\n",
" 'model_type': 'roberta',\n",
" 'no_repeat_ngram_size': 0,\n",
" 'num_attention_heads': 12,\n",
" 'num_beam_groups': 1,\n",
" 'num_beams': 1,\n",
" 'num_hidden_layers': 6,\n",
" 'num_return_sequences': 1,\n",
" 'output_attentions': False,\n",
" 'output_hidden_states': False,\n",
" 'output_scores': False,\n",
" 'pad_token_id': 1,\n",
" 'position_embedding_type': 'absolute',\n",
" 'prefix': None,\n",
" 'problem_type': None,\n",
" 'pruned_heads': {},\n",
" 'remove_invalid_values': False,\n",
" 'repetition_penalty': 1,\n",
" 'return_dict': True,\n",
" 'return_dict_in_generate': False,\n",
" 'sep_token_id': None,\n",
" 'task_specific_params': None,\n",
" 'temperature': 1,\n",
" 'tie_encoder_decoder': False,\n",
" 'tie_word_embeddings': True,\n",
" 'tokenizer_class': None,\n",
" 'top_k': 50,\n",
" 'top_p': 1,\n",
" 'torchscript': False,\n",
" 'transformers_version': '4.6.1',\n",
" 'type_vocab_size': 1,\n",
" 'use_bfloat16': False,\n",
" 'use_cache': True,\n",
" 'vocab_size': 50265}}},\n",
" 'version': '1.17.2'}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from weaviate import Client\n",
"\n",
"# Connect to the Weaviate instance at weaviate:8080\n",
"client = Client(\"http://weaviate:8080\")\n",
"\n",
"client.batch.configure(batch_size=1)\n",
"\n",
"# call the meta endpoint\n",
"client.get_meta()\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Drop the Article class first, then the Paragraph class\n",
"for class_name in (\"Article\", \"Paragraph\"):\n",
"    client.schema.delete_class(class_name)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Schema of the Paragraph class: one text property named content,\n",
"# with text2vec-transformers set as the vectorizer\n",
"paragraph_class = {\n",
"    \"class\": \"Paragraph\",\n",
"    \"description\": \"A paragraph of text\",\n",
"    \"properties\": [\n",
"        {\n",
"            \"name\": \"content\",\n",
"            \"dataType\": [\"text\"],\n",
"            \"description\": \"The text of the paragraph\",\n",
"        }\n",
"    ],\n",
"    \"vectorizer\": \"text2vec-transformers\",\n",
"}\n",
"\n",
"client.schema.create_class(paragraph_class)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Schema of the Article class: a string title plus a cross reference to Paragraph.\n",
"# The vectorizer is ref2vec-centroid, configured on the paragraphs reference\n",
"# with method \"mean\".\n",
"title_property = {\n",
"    \"name\": \"title\",\n",
"    \"dataType\": [\"string\"],\n",
"    \"description\": \"The title of the article\",\n",
"}\n",
"paragraphs_property = {\n",
"    \"name\": \"paragraphs\",\n",
"    \"dataType\": [\"Paragraph\"],\n",
"    \"description\": \"The paragraphs of the article\",\n",
"}\n",
"\n",
"article_class = {\n",
"    \"class\": \"Article\",\n",
"    \"description\": \"An article\",\n",
"    \"properties\": [title_property, paragraphs_property],\n",
"    \"moduleConfig\": {\n",
"        \"ref2vec-centroid\": {\n",
"            \"referenceProperties\": [\"paragraphs\"],\n",
"            \"method\": \"mean\",\n",
"        }\n",
"    },\n",
"    \"vectorizer\": \"ref2vec-centroid\",\n",
"}\n",
"\n",
"client.schema.create_class(article_class)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Test data: two articles keyed by title (\"A\" and \"B\"),\n",
"# each mapped to the list of its two paragraph texts\n",
"articles = dict(\n",
"    A=[\"lorem ipsum\", \"dolor sit amet\"],\n",
"    B=[\"the quick brown fox\", \"jumps over the lazy dog\"],\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Insert article A and its paragraphs through the client's batch helper\n",
"article_title = \"A\"\n",
"article_paragraphs = articles[article_title]\n",
"\n",
"with client.batch as batch:\n",
"    article_properties = {\"title\": article_title}\n",
"    article_uuid = batch.add_data_object(article_properties, class_name=\"Article\")\n",
"\n",
"    for paragraph in article_paragraphs:\n",
"        paragraph_properties = {\"content\": paragraph}\n",
"        paragraph_uuid = batch.add_data_object(\n",
"            paragraph_properties, class_name=\"Paragraph\"\n",
"        )\n",
"\n",
"        # Link the article's paragraphs property to the paragraph just added\n",
"        batch.add_reference(\n",
"            from_object_class_name=\"Article\",\n",
"            from_object_uuid=article_uuid,\n",
"            from_property_name=\"paragraphs\",\n",
"            to_object_class_name=\"Paragraph\",\n",
"            to_object_uuid=paragraph_uuid,\n",
"        )\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Insert article B and its paragraphs through client.data_object (not the batch helper)\n",
"article_title = \"B\"\n",
"article_paragraphs = articles[article_title]\n",
"\n",
"article_properties = {\"title\": article_title}\n",
"article_uuid = client.data_object.create(article_properties, class_name=\"Article\")\n",
"\n",
"for paragraph in article_paragraphs:\n",
"    paragraph_properties = {\"content\": paragraph}\n",
"    paragraph_uuid = client.data_object.create(\n",
"        paragraph_properties, class_name=\"Paragraph\"\n",
"    )\n",
"\n",
"    # Link the article's paragraphs property to the paragraph just created\n",
"    client.data_object.reference.add(\n",
"        from_class_name=\"Article\",\n",
"        from_uuid=article_uuid,\n",
"        from_property_name=\"paragraphs\",\n",
"        to_class_name=\"Paragraph\",\n",
"        to_uuid=paragraph_uuid,\n",
"    )\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def fetch_article(title):\n",
"    \"\"\"Return the Article objects whose title equals `title`, including their vectors.\n",
"\n",
"    Args:\n",
"        title: value matched against the Article `title` property with the Equal operator.\n",
"\n",
"    Returns:\n",
"        The value found under data -> Get -> Article in the query response.\n",
"\n",
"    Raises:\n",
"        RuntimeError: if the response contains an \"errors\" entry, so the failure\n",
"            shows the reported errors instead of a bare KeyError on \"data\".\n",
"    \"\"\"\n",
"    title_filter = {\n",
"        \"path\": [\"title\"],\n",
"        \"operator\": \"Equal\",\n",
"        \"valueString\": title,\n",
"    }\n",
"    response = (\n",
"        client.query.get(\"Article\", \"title\")\n",
"        .with_additional(\"vector\")\n",
"        .with_where(title_filter)\n",
"        .do()\n",
"    )\n",
"    if \"errors\" in response:\n",
"        raise RuntimeError(\n",
"            f\"query for article {title!r} failed: {response['errors']}\"\n",
"        )\n",
"    return response[\"data\"][\"Get\"][\"Article\"]\n",
"\n",
"\n",
"# get the title and vectors of the articles\n",
"article_A = fetch_article(\"A\")\n",
"article_B = fetch_article(\"B\")\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# assert that article A has no vector\n",
"# Check the result is non-empty first so a missing article fails with a clear\n",
"# message instead of an IndexError.\n",
"assert article_A, \"query returned no Article with title 'A'\"\n",
"vector_A = article_A[0][\"_additional\"][\"vector\"]\n",
"assert not vector_A, \"article A unexpectedly has a vector\"\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# assert that article B has a vector\n",
"# Check the result is non-empty first so a missing article fails with a clear\n",
"# message instead of an IndexError.\n",
"assert article_B, \"query returned no Article with title 'B'\"\n",
"vector_B = article_B[0][\"_additional\"][\"vector\"]\n",
"assert vector_B, \"article B has no vector\"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment