Skip to content

Instantly share code, notes, and snippets.

@radekosmulski
Created November 21, 2023 12:42
Show Gist options
  • Save radekosmulski/524600c280ce3d8ee430dfdfa55abed6 to your computer and use it in GitHub Desktop.
Save radekosmulski/524600c280ce3d8ee430dfdfa55abed6 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c2c97b02",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"TECHQA_DIR = '/home/rosmulski/datasets/TechQA/training_and_dev/'\n",
"\n",
"\n",
"def load_json(path):\n",
"    # Parse one JSON file. UTF-8 is stated explicitly so the result does not\n",
"    # depend on the machine's locale.\n",
"    with open(path, 'r', encoding='utf-8') as file:\n",
"        return json.load(file)\n",
"\n",
"\n",
"# Keep only the questions marked answerable, pooling the training and dev splits\n",
"# (training first, then dev, so the order of `queries` is unchanged).\n",
"queries = []\n",
"for split_file in ['training_Q_A.json', 'dev_Q_A.json']:\n",
"    for j in load_json(TECHQA_DIR + split_file):\n",
"        if j.get('ANSWERABLE') == 'Y':\n",
"            queries.append(j)\n",
"\n",
"# Technotes for the training/dev questions; later cells index this by document id.\n",
"technotes = load_json(TECHQA_DIR + 'training_dev_technotes.json')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d52f0e5c",
"metadata": {},
"outputs": [],
"source": [
"queries_processed = []\n",
"corpus = []\n",
"doc_id2id = {}  # dataset document id -> position of that document in `corpus`\n",
"query_ids = []\n",
"corpus_ids = []\n",
"\n",
"for query in queries:\n",
"    doc_id = query['DOCUMENT']\n",
"\n",
"    # Several queries can share one answer document; store each document once.\n",
"    if doc_id not in doc_id2id:\n",
"        doc_id2id[doc_id] = len(corpus)\n",
"        corpus.append(\n",
"            {\n",
"                '_id': str(len(corpus)),\n",
"                'title': technotes[doc_id]['title'],\n",
"                'text': technotes[doc_id]['text']\n",
"            }\n",
"        )\n",
"\n",
"    # Every query is kept, so its id is its position in `queries_processed`.\n",
"    query_ids.append(len(queries_processed))\n",
"    corpus_ids.append(doc_id2id[doc_id])\n",
"    queries_processed.append(\n",
"        {\n",
"            '_id': str(len(queries_processed)),\n",
"            'text': query['QUESTION_TITLE'] + '\\n' + query['QUESTION_TEXT']\n",
"        }\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7efad1a4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(610, 610)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(query_ids), len(corpus_ids)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e91965b0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"496"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(corpus)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f245def1",
"metadata": {},
"outputs": [],
"source": [
"# sanity check\n",
"\n",
"# zip() stops at the shortest input, so confirm the three sequences line up first.\n",
"assert len(corpus_ids) == len(queries_processed) == len(queries)\n",
"\n",
"for our_corpus_id, q_ours, q_dataset in zip(corpus_ids, queries_processed, queries):\n",
"    # The processed query must still contain the original question text ...\n",
"    assert q_dataset['QUESTION_TEXT'] in q_ours['text']\n",
"    # ... and must point at the corpus entry holding its answer document.\n",
"    assert technotes[q_dataset['DOCUMENT']]['text'] == corpus[our_corpus_id]['text']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7f0f7092",
"metadata": {},
"outputs": [],
"source": [
"additional_kb_docs_BM25_only = []\n",
"additional_doc_ids = set()\n",
"\n",
"# `corpus` is not modified here, so ids for the extra documents start right after it.\n",
"first_free_id = len(corpus)\n",
"\n",
"for query in queries:\n",
"    for doc_id in query['DOC_IDS']:\n",
"        # Skip documents already stored as an answer or already collected here.\n",
"        if doc_id in doc_id2id or doc_id in additional_doc_ids:\n",
"            continue\n",
"        additional_kb_docs_BM25_only.append(\n",
"            {\n",
"                '_id': str(first_free_id + len(additional_kb_docs_BM25_only)),\n",
"                'title': technotes[doc_id]['title'],\n",
"                'text': technotes[doc_id]['text']\n",
"            }\n",
"        )\n",
"        additional_doc_ids.add(doc_id)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "112bf257",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"20311"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(additional_doc_ids)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8b721926",
"metadata": {},
"outputs": [],
"source": [
"# Full technote collection, keyed by document id (a later cell iterates over .items()).\n",
"file_path = '/home/rosmulski/datasets/TechQA/technote_corpus/full_technote_collection.sections.json'\n",
"# UTF-8 is stated explicitly so parsing does not depend on the machine's locale.\n",
"with open(file_path, 'r', encoding='utf-8') as file:\n",
"    technotes_all = json.load(file)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c1eefce3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"801998"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(technotes_all)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "885c14c2",
"metadata": {},
"outputs": [],
"source": [
"all_other_kb_docs = []\n",
"\n",
"# Ids continue after the two groups of documents collected so far.\n",
"first_free_id = len(corpus) + len(additional_kb_docs_BM25_only)\n",
"doc_ids_in_corpus = set(doc_id2id) | additional_doc_ids\n",
"\n",
"for doc_id, kb_doc in technotes_all.items():\n",
"    if doc_id in doc_ids_in_corpus:\n",
"        continue\n",
"    all_other_kb_docs.append(\n",
"        {\n",
"            '_id': str(first_free_id + len(all_other_kb_docs)),\n",
"            'title': kb_doc['title'],\n",
"            'text': kb_doc['text']\n",
"        }\n",
"    )\n",
"    # Record the same key the membership test above uses (the dict key), rather\n",
"    # than kb_doc['id'], so the set stays consistent even if the two ever differ.\n",
"    doc_ids_in_corpus.add(doc_id)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "40d1c09f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"781191"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(all_other_kb_docs)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "48aecd6f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query-id</th>\n",
" <th>corpus-id</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" query-id corpus-id score\n",
"0 0 0 1\n",
"1 1 1 1\n",
"2 2 2 1\n",
"3 3 3 1\n",
"4 4 4 1"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# One row per query: its id, the id of its answer document, and a constant score of 1.\n",
"qrel_columns = {'query-id': query_ids, 'corpus-id': corpus_ids, 'score': 1}\n",
"qrel_test = pd.DataFrame(qrel_columns)\n",
"qrel_test.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "3724a030",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query-id</th>\n",
" <th>corpus-id</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>605</th>\n",
" <td>605</td>\n",
" <td>331</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>606</th>\n",
" <td>606</td>\n",
" <td>108</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>607</th>\n",
" <td>607</td>\n",
" <td>494</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>608</th>\n",
" <td>608</td>\n",
" <td>495</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>609</th>\n",
" <td>609</td>\n",
" <td>58</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" query-id corpus-id score\n",
"605 605 331 1\n",
"606 606 108 1\n",
"607 607 494 1\n",
"608 608 495 1\n",
"609 609 58 1"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"qrel_test.tail()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b52b42d6",
"metadata": {},
"outputs": [],
"source": [
"!rm -rf /home/rosmulski/datasets/TechQA_beir\n",
"!mkdir /home/rosmulski/datasets/TechQA_beir\n",
"!mkdir /home/rosmulski/datasets/TechQA_beir/kb_retrieved_by_BM25_20k\n",
"!mkdir /home/rosmulski/datasets/TechQA_beir/kb_retrieved_by_BM25_20k/qrels\n",
"!mkdir /home/rosmulski/datasets/TechQA_beir/kb_800k\n",
"!mkdir /home/rosmulski/datasets/TechQA_beir/kb_800k/qrels"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "37db149c",
"metadata": {},
"outputs": [],
"source": [
"output_dirs = [\n",
"    '/home/rosmulski/datasets/TechQA_beir/kb_retrieved_by_BM25_20k/',\n",
"    '/home/rosmulski/datasets/TechQA_beir/kb_800k/',\n",
"]\n",
"\n",
"# Both corpus variants get the same queries file and the same relevance judgements.\n",
"for dir_path in output_dirs:\n",
"    # JSON Lines: one query object per line.\n",
"    with open(dir_path + 'queries.jsonl', 'w') as file:\n",
"        for query in queries_processed:\n",
"            file.write(json.dumps(query) + '\\n')\n",
"    qrel_test.to_csv(dir_path + 'qrels/test.tsv', index=False, sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "b5ffc9dd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"_id\": \"0\", \"text\": \"User environment variables no longer getting picked up after upgrade to 4.1.1.1 or 4.1.1.2?\\n\\n\\nHave you found that after upgrade to Streams 4.1.1.1 or 4.1.1.2, that environment variables set in your .bashrc are no longer being set? For example ODBCINI is not set for the database toolkit and you get\\n\\n An SQL operation failed. The SQL state is 08003, the SQL code\\n is 0 and the SQL message is [unixODBC][Driver\\n Manager]Connnection does not exist.\\n\"}\r\n",
"{\"_id\": \"1\", \"text\": \"Netcool/Impact (all versions): How is the Exit() action function expected to work with User Defined Functions?\\nNetcool/Impact (all versions)\\n\\nUsing the Exit() action function within a User Defined Function in a Policy will not exit the Policy process.\"}\r\n"
]
}
],
"source": [
"!head -n 2 /home/rosmulski/datasets/TechQA_beir/kb_retrieved_by_BM25_20k/queries.jsonl"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "f38f43a2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query-id\tcorpus-id\tscore\r\n",
"0\t0\t1\r\n",
"1\t1\t1\r\n"
]
}
],
"source": [
"!head -n 3 /home/rosmulski/datasets/TechQA_beir/kb_retrieved_by_BM25_20k/qrels/test.tsv"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "bf8c0b7a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"_id\": \"0\", \"text\": \"User environment variables no longer getting picked up after upgrade to 4.1.1.1 or 4.1.1.2?\\n\\n\\nHave you found that after upgrade to Streams 4.1.1.1 or 4.1.1.2, that environment variables set in your .bashrc are no longer being set? For example ODBCINI is not set for the database toolkit and you get\\n\\n An SQL operation failed. The SQL state is 08003, the SQL code\\n is 0 and the SQL message is [unixODBC][Driver\\n Manager]Connnection does not exist.\\n\"}\r\n",
"{\"_id\": \"1\", \"text\": \"Netcool/Impact (all versions): How is the Exit() action function expected to work with User Defined Functions?\\nNetcool/Impact (all versions)\\n\\nUsing the Exit() action function within a User Defined Function in a Policy will not exit the Policy process.\"}\r\n"
]
}
],
"source": [
"!head -n 2 /home/rosmulski/datasets/TechQA_beir/kb_800k/queries.jsonl"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "310c9cba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query-id\tcorpus-id\tscore\r\n",
"0\t0\t1\r\n",
"1\t1\t1\r\n"
]
}
],
"source": [
"!head -n 3 /home/rosmulski/datasets/TechQA_beir/kb_800k/qrels/test.tsv"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "4cfe87a7",
"metadata": {},
"outputs": [],
"source": [
"dir_path = '/home/rosmulski/datasets/TechQA_beir/kb_retrieved_by_BM25_20k/'\n",
"\n",
"# JSON Lines: answer documents first, then the extra DOC_IDS documents, which is\n",
"# the same order in which their '_id' values were assigned.\n",
"with open(dir_path + 'corpus.jsonl', 'w') as file:\n",
"    for kb_doc in corpus + additional_kb_docs_BM25_only:\n",
"        file.write(json.dumps(kb_doc) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "a6f4ff3a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"_id\": \"0\", \"title\": \"IBM STREAMS 4.1.1.1 and 4.1.1.2 JOBS DO NOT INHERIT THE ENVIRONMENT VARIABLES SET IN .BASHRC, WHEN STREAMS IS RUN AS A SYSTEM SERVICE - United States\", \"text\": \" FLASH (ALERT)\\n\\nABSTRACT\\n In Streams 4.1.1.1 and 4.1.1.2 Streams jobs may not pick up the user enviroment from the streams user's .bashrc. This behavior is different from earlier releases. With these versions, when Streams is run as a system service, application environment variables must be set with streamtool. \\n\\nCONTENT\\nProblem Description\\nWhen running Streams as a system service with Streams releases 4.1.1.1\\nor 4.1.1.2, toolkits and user applications that depend on the\\nuser environment may have various errors.\\nFor example the database toolkit might show the following error\\nif it does not pick up the ODBCINI environment variable:\\n\\n\\n\\n * \\\"An SQL operation failed. The SQL state is 08003, the SQL code\\n is 0 and the SQL message is [unixODBC][Driver\\n Manager]Connnection does not exist.\\\"\\n * \\n\\nProblem Solution \\n\\nTo work around the issue, set environment variables that are needed by the application directly in the instance with: * \\n \\n * streamtool setproperty\\n * -d <domain> -i <instance>\\n --application-ev <VARIABLE NAME>=<VARIABLE VALUE>\\n * \\n \\n\\n\\nRELATED INFORMATION\\n APAR IT18432 [https://www-01.ibm.com/support/entdocview.wss?uid=swg1IT18432]\"}\r\n"
]
}
],
"source": [
"!head -n 1 /home/rosmulski/datasets/TechQA_beir/kb_retrieved_by_BM25_20k/corpus.jsonl"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "6999d880",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20807\r\n"
]
}
],
"source": [
"!cat /home/rosmulski/datasets/TechQA_beir/kb_retrieved_by_BM25_20k/corpus.jsonl | wc -l"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "fc0a9209",
"metadata": {},
"outputs": [],
"source": [
"dir_path = '/home/rosmulski/datasets/TechQA_beir/kb_800k/'\n",
"\n",
"# JSON Lines: answer documents, then the extra DOC_IDS documents, then every\n",
"# remaining technote, which is the same order in which their '_id' values were assigned.\n",
"with open(dir_path + 'corpus.jsonl', 'w') as file:\n",
"    for kb_doc in corpus + additional_kb_docs_BM25_only + all_other_kb_docs:\n",
"        file.write(json.dumps(kb_doc) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "fae293dc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"_id\": \"0\", \"title\": \"IBM STREAMS 4.1.1.1 and 4.1.1.2 JOBS DO NOT INHERIT THE ENVIRONMENT VARIABLES SET IN .BASHRC, WHEN STREAMS IS RUN AS A SYSTEM SERVICE - United States\", \"text\": \" FLASH (ALERT)\\n\\nABSTRACT\\n In Streams 4.1.1.1 and 4.1.1.2 Streams jobs may not pick up the user enviroment from the streams user's .bashrc. This behavior is different from earlier releases. With these versions, when Streams is run as a system service, application environment variables must be set with streamtool. \\n\\nCONTENT\\nProblem Description\\nWhen running Streams as a system service with Streams releases 4.1.1.1\\nor 4.1.1.2, toolkits and user applications that depend on the\\nuser environment may have various errors.\\nFor example the database toolkit might show the following error\\nif it does not pick up the ODBCINI environment variable:\\n\\n\\n\\n * \\\"An SQL operation failed. The SQL state is 08003, the SQL code\\n is 0 and the SQL message is [unixODBC][Driver\\n Manager]Connnection does not exist.\\\"\\n * \\n\\nProblem Solution \\n\\nTo work around the issue, set environment variables that are needed by the application directly in the instance with: * \\n \\n * streamtool setproperty\\n * -d <domain> -i <instance>\\n --application-ev <VARIABLE NAME>=<VARIABLE VALUE>\\n * \\n \\n\\n\\nRELATED INFORMATION\\n APAR IT18432 [https://www-01.ibm.com/support/entdocview.wss?uid=swg1IT18432]\"}\r\n"
]
}
],
"source": [
"!head -n 1 /home/rosmulski/datasets/TechQA_beir/kb_800k/corpus.jsonl"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "65107bcf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"801998\r\n"
]
}
],
"source": [
"!cat /home/rosmulski/datasets/TechQA_beir/kb_800k/corpus.jsonl | wc -l"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "344dcaf9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"_id\": \"0\", \"text\": \"User environment variables no longer getting picked up after upgrade to 4.1.1.1 or 4.1.1.2?\\n\\n\\nHave you found that after upgrade to Streams 4.1.1.1 or 4.1.1.2, that environment variables set in your .bashrc are no longer being set? For example ODBCINI is not set for the database toolkit and you get\\n\\n An SQL operation failed. The SQL state is 08003, the SQL code\\n is 0 and the SQL message is [unixODBC][Driver\\n Manager]Connnection does not exist.\\n\"}\r\n",
"{\"_id\": \"1\", \"text\": \"Netcool/Impact (all versions): How is the Exit() action function expected to work with User Defined Functions?\\nNetcool/Impact (all versions)\\n\\nUsing the Exit() action function within a User Defined Function in a Policy will not exit the Policy process.\"}\r\n",
"{\"_id\": \"2\", \"text\": \"How to configure SSL mutual authentication in IBM HTTP Server?\\n\\n\\nWe are running IHS v7 and our application team has a specific webservice that they require SSL mutual authentication to be enabled for at the http server. Does anyone know how to set this up on IHS? Or any documentation?\\n\\nThanks\\n\"}\r\n"
]
}
],
"source": [
"!head -n 3 /home/rosmulski/datasets/TechQA_beir/kb_800k/queries.jsonl"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment