yk-tanigawa/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Example usage of the figshare API

Yosuke Tanigawa
2020/5/22
To upload large files to figshare, one can use the figshare API.
This notebook summarizes the basic usage of the figshare API.
We copied the example Python file from the official website, turned some of its constants into function arguments, and saved the functions in a separate file, figshare_API_misc.py.

  
## figshare_API_misc.py
import hashlib
import json
import os

import requests
from requests.exceptions import HTTPError


def read_token(token_file):
    '''
    Return the access token stored in ``token_file``.

    The whole file is read and trailing whitespace (including the final
    newline) is removed; leading whitespace is left untouched.

    You can generate a token from https://nih.figshare.com/account/applications
    and save it to a safe place (like /home/users/ytanigaw/.figshare.token.NIH.txt)
    '''
    with open(token_file) as handle:
        contents = handle.read()
    return contents.rstrip()


# We copied the example Python file from
# https://docs.figshare.com/#upload_files_example_upload_on_figshare
# and saved its functions in this separate file.


def raw_issue_request(method, url, token, data=None, binary=False):
    '''
    Send an authenticated HTTP request and return the decoded response body.

    Args:
        method: HTTP verb handed to ``requests.request`` (e.g. 'GET', 'POST').
        url: Full URL to request.
        token: Access token, sent as ``Authorization: token <token>``.
        data: Optional request body. It is serialized with ``json.dumps``
            unless ``binary`` is true.
        binary: When true, ``data`` is sent unchanged (used for file parts).

    Returns:
        The response body parsed as JSON, or the raw ``response.content``
        when the body is not valid JSON.

    Raises:
        requests.exceptions.HTTPError: when ``raise_for_status`` reports a
            failure. The error and the response body are printed first and
            the exception is then re-raised.
    '''
    headers = {'Authorization': 'token ' + token}
    if data is not None and not binary:
        data = json.dumps(data)
    response = requests.request(method, url, headers=headers, data=data)

    try:
        response.raise_for_status()
    except HTTPError as error:
        # Python 3 exceptions have no ``.message`` attribute; formatting the
        # exception itself avoids an AttributeError that would otherwise
        # replace the HTTPError we want the caller to see.
        print('Caught an HTTPError: {}'.format(error))
        print('Body:\n', response.content)
        raise

    # Not every successful response carries JSON, so fall back to the raw
    # content when decoding fails.
    try:
        return json.loads(response.content)
    except ValueError:
        return response.content


def issue_request(method, base_URL, endpoint, token, *args, **kwargs):
    '''
    Build the request URL from ``base_URL`` and ``endpoint`` and send it.

    ``base_URL`` is a format string containing an ``{endpoint}`` placeholder.
    Any extra positional and keyword arguments are forwarded unchanged to
    ``raw_issue_request``, whose return value is returned.
    '''
    url = base_URL.format(endpoint=endpoint)
    return raw_issue_request(method, url, token, *args, **kwargs)


def list_articles(base_URL, token):
    '''
    Print the URL and title of each article returned by 'account/articles'.

    Prints a "No articles." line when the response is empty. Returns None.
    '''
    articles = issue_request('GET', base_URL, 'account/articles', token)
    print('Listing current articles:')
    if not articles:
        print('  No articles.')
    else:
        for article in articles:
            print(u'  {url} - {title}'.format(**article))
    print('')

def create_article(title, base_URL, token):
    '''
    Create an article with the given title and return its id.

    The article is created with a POST to 'account/articles'. The URL in the
    response's 'location' field is then fetched to read the new article's
    'id'.
    '''
    # You may add any other information about the article here as you wish.
    payload = {'title': title}
    created = issue_request('POST', base_URL, 'account/articles', token, data=payload)
    print('Created article:', created['location'], '\n')

    article = raw_issue_request('GET', created['location'], token)
    return article['id']


def list_files_of_article(base_URL, article_id, token):
    '''
    Print the id and name of each file attached to the given article.

    Prints a "No files." line when the response is empty. Returns None.
    '''
    endpoint = 'account/articles/{}/files'.format(article_id)
    files = issue_request('GET', base_URL, endpoint, token)
    print('Listing files for article {}:'.format(article_id))
    if not files:
        print('  No files.')
    else:
        for entry in files:
            print('  {id} - {name}'.format(**entry))
    print('')


def get_file_check_data(file_name, chunk_size):
    '''
    Return ``(md5_hexdigest, size_in_bytes)`` for the file at ``file_name``.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so the
    whole file never has to be held in memory at once.
    '''
    with open(file_name, 'rb') as stream:
        digest = hashlib.md5()
        total_bytes = 0
        # iter() with a sentinel keeps reading until read() returns b''.
        for block in iter(lambda: stream.read(chunk_size), b''):
            total_bytes += len(block)
            digest.update(block)
    return digest.hexdigest(), total_bytes


def initiate_new_upload(base_URL, article_id, file_name, token, chunk_size=1048576):
    '''
    Register a new file with an article and return the file's description.

    The file's base name, MD5 digest and size are POSTed to
    'account/articles/<article_id>/files'. The URL in the response's
    'location' field is then fetched and that result is returned.

    ``chunk_size`` is the read size used while computing the MD5 digest.
    '''
    endpoint = 'account/articles/{}/files'.format(article_id)

    checksum, num_bytes = get_file_check_data(file_name, chunk_size)
    payload = {
        'name': os.path.basename(file_name),
        'md5': checksum,
        'size': num_bytes,
    }

    created = issue_request('POST', base_URL, endpoint, token, data=payload)
    print('Initiated file upload:', created['location'], '\n')

    return raw_issue_request('GET', created['location'], token)


def complete_upload(base_URL, article_id, file_id, token):
    '''
    POST to 'account/articles/<article_id>/files/<file_id>'.

    The usage notebook calls this after all parts have been uploaded in
    order to complete the file upload. Returns None.
    '''
    endpoint = 'account/articles/{}/files/{}'.format(article_id, file_id)
    issue_request('POST', base_URL, endpoint, token)


def upload_parts(file_info, file_name, token):
    '''
    Upload every part of ``file_name`` listed at the file's upload URL.

    ``file_info['upload_url']`` is fetched to get the 'parts' list, and each
    part is sent with ``upload_part`` using one shared binary file handle.
    '''
    upload_url = '{upload_url}'.format(**file_info)
    upload_info = raw_issue_request('GET', upload_url, token)

    print('Uploading parts:')
    with open(file_name, 'rb') as stream:
        for part in upload_info['parts']:
            upload_part(file_info, stream, part, token)
    print('')


def upload_part(file_info, stream, part, token):
    '''
    PUT one part of the file to '<upload_url>/<partNo>'.

    The bytes from ``part['startOffset']`` to ``part['endOffset']``
    (inclusive) are read from ``stream`` and sent as a binary body.
    '''
    # Merge the part's fields over the file's so one format call sees both.
    fields = file_info.copy()
    fields.update(part)
    part_url = '{upload_url}/{partNo}'.format(**fields)

    first_byte = part['startOffset']
    stream.seek(first_byte)
    # endOffset is inclusive, hence the +1 on the length.
    chunk = stream.read(part['endOffset'] - first_byte + 1)

    raw_issue_request('PUT', part_url, token, data=chunk, binary=True)
    print('  Uploaded part {partNo} from {startOffset} to {endOffset}'.format(**part))

## figshare_API_usage_public.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from figshare_API_misc import *\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# basic usage of figshare API\n",
    "\n",
    "## token for figshare API\n",
    "\n",
    "You can generate a token from https://nih.figshare.com/account/applications\n",
    "and save it to a safe place"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "token_file = '/@@@@@@@@@@@@@@@@@@@/.figshare.token.NIH.txt'\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## define constants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_URL = 'https://api.figshare.com/v2/{endpoint}'\n",
    "token = read_token(token_file)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## show the list of \"articles\" (~= datasets)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Listing current articles:\n",
      "  https://api.figshare.com/v2/account/articles/11369166 - Gene-based test results used for the analysis described in 'Rare protein-altering variants in ANGPTL7 lower intraocular pressure and protect against glaucoma'\n",
      "  https://api.figshare.com/v2/account/articles/11368022 - Genome-wide summary statistics used for the analysis described in 'Rare protein-altering variants in ANGPTL7 lower intraocular pressure and protect against glaucoma'\n",
      "  https://api.figshare.com/v2/account/articles/9202247 - Decomposed matrices used for the analysis described in 'Components of genetic associations across 2,138 phenotypes in the UK Biobank highlight adipocyte biology'\n",
      "\n"
     ]
    }
   ],
   "source": [
    "list_articles(base_URL, token)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## show the list of files in the specified \"article\"\n",
    "\n",
    "You should specify the article ID from the output above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Listing files for article 9202247:\n",
      "  16758284 - dev_allNonMHC_z_center_p0001_100PCs_20180129.npz\n",
      "  16758281 - dev_codingNonMHC_z_center_p0001_100PCs_20180129.npz\n",
      "  16758278 - dev_PTVsNonMHC_z_center_p0001_100PCs_20180129.npz\n",
      "\n"
     ]
    }
   ],
   "source": [
    "article_id = 9202247\n",
    "\n",
    "list_files_of_article(base_URL, article_id, token)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## upload a file using the figshare API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # specify an article ID\n",
    "# article_id = -1\n",
    "\n",
    "# # Then we upload the file.\n",
    "# file_info = initiate_new_upload(base_URL, article_id, file_name, token)\n",
    "\n",
    "# # Until here we used the figshare API; following lines use the figshare upload service API.\n",
    "# upload_parts(file_info, file_name, token)\n",
    "\n",
    "# # We return to the figshare API to complete the file upload process.\n",
    "# complete_upload(base_URL, article_id, file_info['id'], token)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	import hashlib
	import json
	import os

	import requests
	from requests.exceptions import HTTPError


	def read_token(token_file):
	    '''
	    Read the access token from a file and return it.

	    Trailing whitespace (including the final newline) is removed.

	    You can generate a token from https://nih.figshare.com/account/applications
	    and save it to a safe place (like /home/users/ytanigaw/.figshare.token.NIH.txt)
	    '''
	    with open(token_file) as f:
	        k = f.read().rstrip()
	    return k


	# We copied the example Python file from
	# https://docs.figshare.com/#upload_files_example_upload_on_figshare
	# and saved its functions in this separate file.


	def raw_issue_request(method, url, token, data=None, binary=False):
	    '''
	    Send an authenticated HTTP request and return the decoded response body.

	    Args:
	        method: HTTP verb handed to ``requests.request``.
	        url: Full URL to request.
	        token: Access token, sent as ``Authorization: token <token>``.
	        data: Optional request body. It is serialized with ``json.dumps``
	            unless ``binary`` is true.
	        binary: When true, ``data`` is sent unchanged.

	    Returns:
	        The response body parsed as JSON, or the raw ``response.content``
	        when the body is not valid JSON.

	    Raises:
	        requests.exceptions.HTTPError: when ``raise_for_status`` reports a
	            failure. The error and the response body are printed first
	            and the exception is then re-raised.
	    '''
	    headers = {'Authorization': 'token ' + token}
	    if data is not None and not binary:
	        data = json.dumps(data)
	    response = requests.request(method, url, headers=headers, data=data)
	    try:
	        response.raise_for_status()
	        try:
	            data = json.loads(response.content)
	        except ValueError:
	            # Not every successful response carries JSON.
	            data = response.content
	    except HTTPError as error:
	        # Python 3 exceptions have no ``.message`` attribute; format the
	        # exception itself so the handler cannot raise AttributeError.
	        print('Caught an HTTPError: {}'.format(error))
	        print('Body:\n', response.content)
	        raise

	    return data


	def issue_request(method, base_URL, endpoint, token, *args, **kwargs):
	    '''
	    Build the request URL from ``base_URL`` and ``endpoint`` and send it.

	    ``base_URL`` is a format string containing an ``{endpoint}``
	    placeholder. Extra positional and keyword arguments are forwarded
	    unchanged to ``raw_issue_request``, whose return value is returned.
	    '''
	    # The signature uses *args/**kwargs so callers can pass ``data=...``
	    # or nothing at all, as the other functions in this file do.
	    return raw_issue_request(method, base_URL.format(endpoint=endpoint), token, *args, **kwargs)


	def list_articles(base_URL, token):
	    '''
	    Print the URL and title of each article returned by 'account/articles'.

	    Prints a "No articles." line when the response is empty. Returns None.
	    '''
	    result = issue_request('GET', base_URL, 'account/articles', token)
	    print('Listing current articles:')
	    if result:
	        for item in result:
	            print(u' {url} - {title}'.format(**item))
	    else:
	        print(' No articles.')
	    print('')

	def create_article(title, base_URL, token):
	    '''
	    Create an article with the given title and return its id.

	    The article is created with a POST to 'account/articles'. The URL in
	    the response's 'location' field is then fetched to read the new
	    article's 'id'.
	    '''
	    data = {
	        'title': title  # You may add any other information about the article here as you wish.
	    }
	    result = issue_request('POST', base_URL, 'account/articles', token, data=data)
	    print('Created article:', result['location'], '\n')

	    result = raw_issue_request('GET', result['location'], token)

	    return result['id']


	def list_files_of_article(base_URL, article_id, token):
	    '''
	    Print the id and name of each file attached to the given article.

	    Prints a "No files." line when the response is empty. Returns None.
	    '''
	    result = issue_request('GET', base_URL, 'account/articles/{}/files'.format(article_id), token)
	    print('Listing files for article {}:'.format(article_id))
	    if result:
	        for item in result:
	            print(' {id} - {name}'.format(**item))
	    else:
	        print(' No files.')
	    print('')


	def get_file_check_data(file_name, chunk_size):
	    '''
	    Return ``(md5_hexdigest, size_in_bytes)`` for the file at ``file_name``.

	    The file is read in binary mode, ``chunk_size`` bytes at a time, so
	    the whole file never has to be held in memory at once.
	    '''
	    with open(file_name, 'rb') as fin:
	        md5 = hashlib.md5()
	        size = 0
	        data = fin.read(chunk_size)
	        # read() returns b'' at end of file, which ends the loop.
	        while data:
	            size += len(data)
	            md5.update(data)
	            data = fin.read(chunk_size)
	        return md5.hexdigest(), size


	def initiate_new_upload(base_URL, article_id, file_name, token, chunk_size=1048576):
	    '''
	    Register a new file with an article and return the file's description.

	    The file's base name, MD5 digest and size are POSTed to
	    'account/articles/<article_id>/files'. The URL in the response's
	    'location' field is then fetched and that result is returned.

	    ``chunk_size`` is the read size used while computing the MD5 digest.
	    '''
	    endpoint = 'account/articles/{}/files'
	    endpoint = endpoint.format(article_id)

	    md5, size = get_file_check_data(file_name, chunk_size)
	    data = {'name': os.path.basename(file_name),
	            'md5': md5,
	            'size': size}

	    result = issue_request('POST', base_URL, endpoint, token, data=data)
	    print('Initiated file upload:', result['location'], '\n')

	    result = raw_issue_request('GET', result['location'], token)

	    return result


	def complete_upload(base_URL, article_id, file_id, token):
	    '''
	    POST to 'account/articles/<article_id>/files/<file_id>'.

	    The usage notebook calls this after all parts have been uploaded in
	    order to complete the file upload. Returns None.
	    '''
	    issue_request('POST', base_URL, 'account/articles/{}/files/{}'.format(article_id, file_id), token)


	def upload_parts(file_info, file_name, token):
	    '''
	    Upload every part of ``file_name`` listed at the file's upload URL.

	    ``file_info['upload_url']`` is fetched to get the 'parts' list, and
	    each part is sent with ``upload_part`` using one shared binary file
	    handle.
	    '''
	    url = '{upload_url}'.format(**file_info)
	    result = raw_issue_request('GET', url, token)

	    print('Uploading parts:')
	    with open(file_name, 'rb') as fin:
	        for part in result['parts']:
	            upload_part(file_info, fin, part, token)
	    print('')


	def upload_part(file_info, stream, part, token):
	    '''
	    PUT one part of the file to '<upload_url>/<partNo>'.

	    The bytes from ``part['startOffset']`` to ``part['endOffset']``
	    (inclusive) are read from ``stream`` and sent as a binary body.
	    '''
	    # Merge the part's fields over the file's so one format call sees both.
	    udata = file_info.copy()
	    udata.update(part)
	    url = '{upload_url}/{partNo}'.format(**udata)

	    stream.seek(part['startOffset'])
	    # endOffset is inclusive, hence the +1 on the length.
	    data = stream.read(part['endOffset'] - part['startOffset'] + 1)

	    raw_issue_request('PUT', url, token, data=data, binary=True)
	    print(' Uploaded part {partNo} from {startOffset} to {endOffset}'.format(**part))
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"from figshare_API_misc import *\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# basic usage of figshare API\n",
	"\n",
	"## token for figshare API\n",
	"\n",
	"You can generate a token from https://nih.figshare.com/account/applications\n",
	"and save it to a safe place"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"token_file = '/@@@@@@@@@@@@@@@@@@@/.figshare.token.NIH.txt'\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## define constants"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"base_URL = 'https://api.figshare.com/v2/{endpoint}'\n",
	"token = read_token(token_file)\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## show the list of \"articles\" (~= datasets)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Listing current articles:\n",
	" https://api.figshare.com/v2/account/articles/11369166 - Gene-based test results used for the analysis described in 'Rare protein-altering variants in ANGPTL7 lower intraocular pressure and protect against glaucoma'\n",
	" https://api.figshare.com/v2/account/articles/11368022 - Genome-wide summary statistics used for the analysis described in 'Rare protein-altering variants in ANGPTL7 lower intraocular pressure and protect against glaucoma'\n",
	" https://api.figshare.com/v2/account/articles/9202247 - Decomposed matrices used for the analysis described in 'Components of genetic associations across 2,138 phenotypes in the UK Biobank highlight adipocyte biology'\n",
	"\n"
	]
	}
	],
	"source": [
	"list_articles(base_URL, token)\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## show the list of files in the specified \"article\"\n",
	"\n",
	"You should specify the article ID from the output above"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Listing files for article 9202247:\n",
	" 16758284 - dev_allNonMHC_z_center_p0001_100PCs_20180129.npz\n",
	" 16758281 - dev_codingNonMHC_z_center_p0001_100PCs_20180129.npz\n",
	" 16758278 - dev_PTVsNonMHC_z_center_p0001_100PCs_20180129.npz\n",
	"\n"
	]
	}
	],
	"source": [
	"article_id = 9202247\n",
	"\n",
	"list_files_of_article(base_URL, article_id, token)\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## upload a file using the figshare API"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"# # specify an article ID\n",
	"# article_id = -1\n",
	"\n",
	"# # Then we upload the file.\n",
	"# file_info = initiate_new_upload(base_URL, article_id, file_name, token)\n",
	"\n",
	"# # Until here we used the figshare API; following lines use the figshare upload service API.\n",
	"# upload_parts(file_info, file_name, token)\n",
	"\n",
	"# # We return to the figshare API to complete the file upload process.\n",
	"# complete_upload(base_URL, article_id, file_info['id'], token)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}