Skip to content

Instantly share code, notes, and snippets.

@jamescalam
Created January 2, 2022 11:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jamescalam/062673282c2a8da13e8084bb7a5bbb35 to your computer and use it in GitHub Desktop.
Save jamescalam/062673282c2a8da13e8084bb7a5bbb35 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from augment import Augmentor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `Augmentor` class imported above is defined in the `augment.py` script, which can be found [here](https://gist.github.com/jamescalam/75badde0c8ad108252497025095623b3)."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"model: medqp\n",
"data: medqp\n",
"data: mrpc\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 3668/3668 [01:40<00:00, 36.67it/s]\n",
" 12%|█▏ | 427/3668 [00:13<01:41, 31.99it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: qqp\n",
"data: rte\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 2379/2379 [01:03<00:00, 37.18it/s]\n",
" 69%|██████▊ | 1634/2379 [00:53<00:24, 30.54it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: stsb\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 79%|███████▉ | 4319/5436 [01:54<00:29, 37.81it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"model: mrpc\n",
"data: medqp\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1506/1506 [00:37<00:00, 40.66it/s]\n",
"100%|██████████| 1506/1506 [00:41<00:00, 36.05it/s]\n",
" 67%|██████▋ | 1004/1506 [00:30<00:15, 33.08it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: mrpc\n",
"data: qqp\n",
"data: rte\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 2379/2379 [01:03<00:00, 37.18it/s]\n",
" 68%|██████▊ | 1624/2379 [00:53<00:24, 30.39it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: stsb\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 74%|███████▍ | 4021/5436 [01:48<00:38, 37.22it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"model: qqp\n",
"data: medqp\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1506/1506 [00:37<00:00, 40.03it/s]\n",
"100%|██████████| 1506/1506 [00:42<00:00, 35.58it/s]\n",
" 66%|██████▌ | 991/1506 [00:29<00:15, 33.64it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: mrpc\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 3668/3668 [01:37<00:00, 37.69it/s]\n",
" 9%|▉ | 332/3668 [00:10<01:42, 32.56it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: qqp\n",
"data: rte\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 2379/2379 [01:02<00:00, 38.35it/s]\n",
" 68%|██████▊ | 1625/2379 [00:51<00:23, 31.56it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: stsb\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 74%|███████▎ | 4001/5436 [01:43<00:37, 38.73it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"model: rte\n",
"data: medqp\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1506/1506 [00:35<00:00, 42.19it/s]\n",
"100%|██████████| 1506/1506 [00:40<00:00, 37.23it/s]\n",
" 67%|██████▋ | 1006/1506 [00:29<00:14, 34.16it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: mrpc\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 3668/3668 [01:36<00:00, 37.94it/s]\n",
" 9%|▉ | 333/3668 [00:10<01:42, 32.44it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: qqp\n",
"data: rte\n",
"data: stsb\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 75%|███████▍ | 4055/5436 [01:44<00:35, 38.74it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"model: stsb\n",
"data: medqp\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1506/1506 [00:35<00:00, 42.12it/s]\n",
"100%|██████████| 1506/1506 [00:40<00:00, 37.06it/s]\n",
" 66%|██████▌ | 992/1506 [00:29<00:15, 34.06it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: mrpc\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 3668/3668 [01:36<00:00, 37.86it/s]\n",
" 9%|▉ | 333/3668 [00:10<01:42, 32.39it/s]\n",
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: qqp\n",
"data: rte\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 2379/2379 [01:01<00:00, 38.44it/s]\n",
" 68%|██████▊ | 1626/2379 [00:51<00:23, 31.56it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: stsb\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\James\\Documents\\projects\\pinecone\\embeddings\\06_aug_sbert\\02_domain_adaption_clean\\augment.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" unlabeled[self.label] = scores\n"
]
}
],
"source": [
"# Value passed to `Augmentor.random_sample` for every dataset\n",
"# (presumably the number of pairs to sample -- confirm in augment.py).\n",
"N_SAMPLES = 20_000\n",
"\n",
"domains = ['medqp', 'mrpc', 'qqp', 'rte', 'stsb']\n",
"\n",
"for model in domains:\n",
"    print(f\"model: {model}\")\n",
"    # Built once per `model` and reused for every dataset in the inner loop.\n",
"    augmentor = Augmentor(f'models/bert-{model}-cross-encoder')\n",
"    for data in domains:\n",
"        # The label is printed before the skip check, so skipped datasets\n",
"        # still show up in the log (without any progress bars after them).\n",
"        print(f\"data: {data}\")\n",
"        # 'qqp' is never processed as `data`, and neither is the model's own domain.\n",
"        if data in ['qqp', model]:\n",
"            continue\n",
"        # File names follow the S{model}_T{data} pattern; augmented files\n",
"        # are written under data/aug/ with the same name.\n",
"        augmentor.load_data(f'data/S{model}_T{data}.tsv')\n",
"        augmentor.random_sample(N_SAMPLES)\n",
"        augmentor.save_data(f'data/aug/S{model}_T{data}.tsv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "2ada91ca7be38ac141a70d8e06f4253d3e90604f2701bfa98443d880c4baa087"
},
"kernelspec": {
"display_name": "Python 3.8.8 64-bit ('search': conda)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment