Skip to content

Instantly share code, notes, and snippets.

@sayakpaul
Last active September 30, 2021 01:29
Show Gist options
  • Save sayakpaul/5997eddc26f87cb3a3f53032b46d0536 to your computer and use it in GitHub Desktop.
Save sayakpaul/5997eddc26f87cb3a3f53032b46d0536 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "66abf160",
"metadata": {},
"outputs": [],
"source": [
"import apache_beam as beam\n",
"import arxiv # !pip install arxiv\n",
"\n",
"from apache_beam.dataframe.convert import to_dataframe"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5443193c",
"metadata": {},
"outputs": [],
"source": [
"# Each keyword carries literal double quotes so it reaches arXiv as a quoted\n",
"# phrase. Keep the quotes balanced and the entries unique: every entry\n",
"# triggers its own search.\n",
"query_keywords = [\n",
"    \"\\\"image segmentation\\\"\",\n",
"    \"\\\"self-supervised learning\\\"\",\n",
"    \"\\\"representation learning\\\"\",\n",
"    \"\\\"image generation\\\"\",\n",
"    \"\\\"object detection\\\"\",\n",
"    \"\\\"transfer learning\\\"\",\n",
"    \"\\\"transformers\\\"\",\n",
"    \"\\\"adversarial training\\\"\",\n",
"    \"\\\"generative adversarial networks\\\"\",\n",
"    \"\\\"model compressions\\\"\",\n",
"    \"\\\"few-shot learning\\\"\",\n",
"    \"\\\"natural language\\\"\",\n",
"    \"\\\"graph\\\"\",\n",
"    \"\\\"colorization\\\"\",\n",
"    \"\\\"depth estimation\\\"\",\n",
"    \"\\\"point cloud\\\"\",\n",
"    \"\\\"structured data\\\"\",\n",
"    \"\\\"optical flow\\\"\",\n",
"    \"\\\"reinforcement learning\\\"\",\n",
"    \"\\\"super resolution\\\"\",\n",
"    \"\\\"attention\\\"\",\n",
"    \"\\\"tabular\\\"\",\n",
"    \"\\\"unsupervised learning\\\"\",\n",
"    \"\\\"semi-supervised learning\\\"\",\n",
"    \"\\\"explainable\\\"\",\n",
"    \"\\\"radiance field\\\"\",\n",
"    \"\\\"decision tree\\\"\",\n",
"    \"\\\"time series\\\"\",\n",
"    \"\\\"molecule\\\"\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "83ddf9d6",
"metadata": {},
"outputs": [],
"source": [
"client = arxiv.Client(num_retries=20, page_size=500)\n",
"\n",
"# Only results whose primary category is one of these are kept.\n",
"PRIMARY_CATEGORIES = (\"cs.CV\", \"stat.ML\", \"cs.LG\")\n",
"\n",
"\n",
"def query_with_keywords(query, max_results=10):\n",
"    \"\"\"Search arXiv for `query` and yield one `beam.Row` per kept result.\n",
"\n",
"    Args:\n",
"        query: Query string passed unchanged to `arxiv.Search`.\n",
"        max_results: Upper bound passed to `arxiv.Search`. Defaults to 10,\n",
"            the sample size used here; the original inline note suggests\n",
"            20000 for a full run.\n",
"\n",
"    Yields:\n",
"        `beam.Row(terms=..., titles=..., abstracts=...)` built from each\n",
"        result's `categories`, `title` and `summary`, restricted to results\n",
"        whose `primary_category` is in `PRIMARY_CATEGORIES`.\n",
"    \"\"\"\n",
"    search = arxiv.Search(\n",
"        query=query,\n",
"        max_results=max_results,\n",
"        sort_by=arxiv.SortCriterion.LastUpdatedDate,\n",
"    )\n",
"\n",
"    for res in client.results(search):\n",
"        if res.primary_category in PRIMARY_CATEGORIES:\n",
"            yield beam.Row(\n",
"                terms=res.categories, titles=res.title, abstracts=res.summary\n",
"            )"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d58affea",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/sayakpaul/.local/bin/.virtualenvs/tf/lib/python3.8/site-packages/apache_beam/dataframe/io.py:566: FutureWarning: WriteToFiles is experimental.\n",
" return pcoll | fileio.WriteToFiles(\n",
"/Users/sayakpaul/.local/bin/.virtualenvs/tf/lib/python3.8/site-packages/apache_beam/io/fileio.py:535: BeamDeprecationWarning: options is deprecated since First stable release. References to <pipeline>.options will not be supported\n",
" p.options.view_as(GoogleCloudOptions).temp_location or\n",
"WARNING:root:Make sure that locally built Python SDK docker image has Python 3.8 interpreter.\n"
]
}
],
"source": [
"# Run the search for the first two keywords only and write the rows as CSV.\n",
"# The output lands in shard files named like sample.csv-00000-of-00001.\n",
"with beam.Pipeline() as pipeline:\n",
"    records = (\n",
"        pipeline\n",
"        | beam.Create(query_keywords[:2])\n",
"        | beam.FlatMap(query_with_keywords)\n",
"    )\n",
"    _ = to_dataframe(records).to_csv(\"sample.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "daedd023",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>terms</th>\n",
" <th>titles</th>\n",
" <th>abstracts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>['cs.CV']</td>\n",
" <td>Inter Extreme Points Geodesics for Weakly Supe...</td>\n",
" <td>We introduce $\\textit{InExtremIS}$, a weakly s...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>['cs.CV']</td>\n",
" <td>UTNet: A Hybrid Transformer Architecture for M...</td>\n",
" <td>Transformer architecture has emerged to be suc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>['cs.CV']</td>\n",
" <td>mDALU: Multi-Source Domain Adaptation and Labe...</td>\n",
" <td>One challenge of object recognition is to gene...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>['cs.CV', 'cs.AI', 'cs.LG']</td>\n",
" <td>Using Soft Labels to Model Uncertainty in Medi...</td>\n",
" <td>Medical image segmentation is inherently uncer...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>['cs.CV', 'cs.AI']</td>\n",
" <td>Towards to Robust and Generalized Medical Imag...</td>\n",
" <td>To mitigate the radiologist's workload, comput...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" terms \\\n",
"0 ['cs.CV'] \n",
"1 ['cs.CV'] \n",
"2 ['cs.CV'] \n",
"3 ['cs.CV', 'cs.AI', 'cs.LG'] \n",
"4 ['cs.CV', 'cs.AI'] \n",
"\n",
" titles \\\n",
"0 Inter Extreme Points Geodesics for Weakly Supe... \n",
"1 UTNet: A Hybrid Transformer Architecture for M... \n",
"2 mDALU: Multi-Source Domain Adaptation and Labe... \n",
"3 Using Soft Labels to Model Uncertainty in Medi... \n",
"4 Towards to Robust and Generalized Medical Imag... \n",
"\n",
" abstracts \n",
"0 We introduce $\\textit{InExtremIS}$, a weakly s... \n",
"1 Transformer architecture has emerged to be suc... \n",
"2 One challenge of object recognition is to gene... \n",
"3 Medical image segmentation is inherently uncer... \n",
"4 To mitigate the radiologist's workload, comput... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"\n",
"# Load the shard file (presumably written by the to_csv step for sample.csv).\n",
"sample_shard = \"sample.csv-00000-of-00001\"\n",
"df = pd.read_csv(sample_shard)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "afd418f4",
"metadata": {},
"source": [
"## Acknowledgements\n",
"\n",
"* [Lukas Schwab](https://github.com/lukasschwab)\n",
"* [Robert Bradshaw](https://www.linkedin.com/in/robert-bradshaw-1b48a07/)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment