Skip to content

Instantly share code, notes, and snippets.

@simon-mo
Created October 5, 2023 17:08
Show Gist options
  • Save simon-mo/7446ef286e3fc938d0e177bc4ea1cdaf to your computer and use it in GitHub Desktop.
Save simon-mo/7446ef286e3fc938d0e177bc4ea1cdaf to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/2335590 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 2335590/2335590 [00:18<00:00, 126098.37it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of lines with 'categories' key starting with 'cs.': 448448/2335590 = 19.20%\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"filename = \"arxiv-metadata-oai-snapshot.json\"\n",
"total_lines = 2335590\n",
"cs_lines = []\n",
"\n",
"with open(filename, \"r\") as f:\n",
" for line in tqdm(f, total=total_lines):\n",
" data = json.loads(line)\n",
" if \"categories\" in data and data[\"categories\"].startswith(\"cs.\"):\n",
" cs_lines.append(data)\n",
"\n",
"print(f\"Number of lines with 'categories' key starting with 'cs.': {len(cs_lines)}/{total_lines} = {len(cs_lines)/total_lines*100:.2f}%\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.DataFrame(cs_lines)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>submitter</th>\n",
" <th>authors</th>\n",
" <th>title</th>\n",
" <th>comments</th>\n",
" <th>journal-ref</th>\n",
" <th>doi</th>\n",
" <th>report-no</th>\n",
" <th>categories</th>\n",
" <th>license</th>\n",
" <th>abstract</th>\n",
" <th>versions</th>\n",
" <th>update_date</th>\n",
" <th>authors_parsed</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0704.0047</td>\n",
" <td>Igor Grabec</td>\n",
" <td>T. Kosel and I. Grabec</td>\n",
" <td>Intelligent location of simultaneously active ...</td>\n",
" <td>5 pages, 5 eps figures, uses IEEEtran.cls</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>cs.NE cs.AI</td>\n",
" <td>None</td>\n",
" <td>The intelligent acoustic emission locator is...</td>\n",
" <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n",
" <td>2009-09-29</td>\n",
" <td>[[Kosel, T., ], [Grabec, I., ]]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0704.0050</td>\n",
" <td>Igor Grabec</td>\n",
" <td>T. Kosel and I. Grabec</td>\n",
" <td>Intelligent location of simultaneously active ...</td>\n",
" <td>5 pages, 7 eps figures, uses IEEEtran.cls</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>cs.NE cs.AI</td>\n",
" <td>None</td>\n",
" <td>Part I describes an intelligent acoustic emi...</td>\n",
" <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n",
" <td>2007-05-23</td>\n",
" <td>[[Kosel, T., ], [Grabec, I., ]]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0704.0062</td>\n",
" <td>Tom\\'a\\v{s} Vina\\v{r}</td>\n",
" <td>Rastislav \\v{S}r\\'amek, Bro\\v{n}a Brejov\\'a, T...</td>\n",
" <td>On-line Viterbi Algorithm and Its Relationship...</td>\n",
" <td>None</td>\n",
" <td>Algorithms in Bioinformatics: 7th Internationa...</td>\n",
" <td>10.1007/978-3-540-74126-8_23</td>\n",
" <td>None</td>\n",
" <td>cs.DS</td>\n",
" <td>None</td>\n",
" <td>In this paper, we introduce the on-line Vite...</td>\n",
" <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
" <td>2010-01-25</td>\n",
" <td>[[Šrámek, Rastislav, ], [Brejová, Broňa, ], [V...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0704.0090</td>\n",
" <td>Lester Ingber</td>\n",
" <td>Lester Ingber</td>\n",
" <td>Real Options for Project Schedules (ROPS)</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>Report 2007:ROPS</td>\n",
" <td>cs.CE cond-mat.stat-mech cs.MS cs.NA physics.d...</td>\n",
" <td>None</td>\n",
" <td>Real Options for Project Schedules (ROPS) ha...</td>\n",
" <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n",
" <td>2007-05-23</td>\n",
" <td>[[Ingber, Lester, ]]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0704.0098</td>\n",
" <td>Jack Raymond</td>\n",
" <td>Jack Raymond, David Saad</td>\n",
" <td>Sparsely-spread CDMA - a statistical mechanics...</td>\n",
" <td>23 pages, 5 figures, figure 1 amended since pu...</td>\n",
" <td>J. Phys. A: Math. Theor. 40 No 41 (12 October ...</td>\n",
" <td>10.1088/1751-8113/40/41/004</td>\n",
" <td>None</td>\n",
" <td>cs.IT math.IT</td>\n",
" <td>None</td>\n",
" <td>Sparse Code Division Multiple Access (CDMA),...</td>\n",
" <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n",
" <td>2009-11-13</td>\n",
" <td>[[Raymond, Jack, ], [Saad, David, ]]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id submitter \\\n",
"0 0704.0047 Igor Grabec \n",
"1 0704.0050 Igor Grabec \n",
"2 0704.0062 Tom\\'a\\v{s} Vina\\v{r} \n",
"3 0704.0090 Lester Ingber \n",
"4 0704.0098 Jack Raymond \n",
"\n",
" authors \\\n",
"0 T. Kosel and I. Grabec \n",
"1 T. Kosel and I. Grabec \n",
"2 Rastislav \\v{S}r\\'amek, Bro\\v{n}a Brejov\\'a, T... \n",
"3 Lester Ingber \n",
"4 Jack Raymond, David Saad \n",
"\n",
" title \\\n",
"0 Intelligent location of simultaneously active ... \n",
"1 Intelligent location of simultaneously active ... \n",
"2 On-line Viterbi Algorithm and Its Relationship... \n",
"3 Real Options for Project Schedules (ROPS) \n",
"4 Sparsely-spread CDMA - a statistical mechanics... \n",
"\n",
" comments \\\n",
"0 5 pages, 5 eps figures, uses IEEEtran.cls \n",
"1 5 pages, 7 eps figures, uses IEEEtran.cls \n",
"2 None \n",
"3 None \n",
"4 23 pages, 5 figures, figure 1 amended since pu... \n",
"\n",
" journal-ref \\\n",
"0 None \n",
"1 None \n",
"2 Algorithms in Bioinformatics: 7th Internationa... \n",
"3 None \n",
"4 J. Phys. A: Math. Theor. 40 No 41 (12 October ... \n",
"\n",
" doi report-no \\\n",
"0 None None \n",
"1 None None \n",
"2 10.1007/978-3-540-74126-8_23 None \n",
"3 None Report 2007:ROPS \n",
"4 10.1088/1751-8113/40/41/004 None \n",
"\n",
" categories license \\\n",
"0 cs.NE cs.AI None \n",
"1 cs.NE cs.AI None \n",
"2 cs.DS None \n",
"3 cs.CE cond-mat.stat-mech cs.MS cs.NA physics.d... None \n",
"4 cs.IT math.IT None \n",
"\n",
" abstract \\\n",
"0 The intelligent acoustic emission locator is... \n",
"1 Part I describes an intelligent acoustic emi... \n",
"2 In this paper, we introduce the on-line Vite... \n",
"3 Real Options for Project Schedules (ROPS) ha... \n",
"4 Sparse Code Division Multiple Access (CDMA),... \n",
"\n",
" versions update_date \\\n",
"0 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2009-09-29 \n",
"1 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2007-05-23 \n",
"2 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2010-01-25 \n",
"3 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2007-05-23 \n",
"4 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2009-11-13 \n",
"\n",
" authors_parsed \n",
"0 [[Kosel, T., ], [Grabec, I., ]] \n",
"1 [[Kosel, T., ], [Grabec, I., ]] \n",
"2 [[Šrámek, Rastislav, ], [Brejová, Broňa, ], [V... \n",
"3 [[Ingber, Lester, ]] \n",
"4 [[Raymond, Jack, ], [Saad, David, ]] "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"df[\"latest_version\"] = df[\"versions\"].apply(lambda x: x[-1][\"version\"])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"df = df[~df[\"id\"].str.startswith(\"cs\")] # this filters out (7147 out of 448448)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"df[\"filename\"] = df[\"id\"] + df[\"latest_version\"] + \".pdf\"\n",
"df[\"url\"] = df[\"filename\"].apply(lambda x: f\"https://storage.googleapis.com/arxiv-dataset/arxiv/arxiv/pdf/{x.split('.')[0]}/{x}\")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"df[\"url\"].apply(lambda x: f'curl -sSo- {x} | pdftotext -q - {x.split(\"/\")[-1].replace(\"pdf\", \"txt\")}').to_csv(\n",
" \"download.sh\", index=False, header=False, sep=\" \"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"df.to_parquet(\"arxiv-cs-metadata.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"with open(\"cs_lines.json\", \"w\") as f:\n",
" for l in cs_lines:\n",
" f.write(json.dumps(l) + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 441301 download.sh\n"
]
}
],
"source": [
"!wc -l download.sh"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment