Skip to content

Instantly share code, notes, and snippets.

@luisdamed
Last active April 15, 2024 04:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save luisdamed/f3e262971211184f232cea8553729a85 to your computer and use it in GitHub Desktop.
Save luisdamed/f3e262971211184f232cea8553729a85 to your computer and use it in GitHub Desktop.
Collect data from the Thingiverse.com REST API. You need an authorization token to use it; getting one is free.
#%% Thingiverse_API_runner
# Import libraries and make a first request to get the data about the different categories
from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import pandas as pd
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Personal API token, sent as the `access_token` query parameter.
my_token = 'get yours at https://www.thingiverse.com/developers'
url_call = 'categories'
api_url = f"https://api.thingiverse.com/{url_call}"
page = 1
type_call = 'things'
parameters = {'access_token': my_token}

# Module-level session; the per-category loop further down reuses it.
session = Session()
try:
    # requests waits indefinitely unless a timeout is given, in which case
    # the Timeout handler below could never fire.
    response = session.get(api_url, params=parameters, timeout=30)
except (ConnectionError, Timeout, TooManyRedirects) as e:
    print(e)
    # `data` would be undefined past this point; re-raise so the script stops
    # on the real network error instead of on a later NameError.
    raise

# Stop on HTTP errors (e.g. 401 for a missing or invalid token) instead of
# treating the error payload as if it were the category list.
response.raise_for_status()
data = response.json()
#%% Normalize categories dict
# Flatten the JSON payload into a DataFrame and record when it was retrieved.
categories = pd.json_normalize(data).assign(timestamp=pd.to_datetime('now'))
# Bare expression: shows the frame when the cell is run interactively.
categories
#%% Define function to get the data from the site for a given category
def categories_runner(url, token, call_type, page_num, category):
    """Download one page of a category's things and append it to a CSV file.

    The page is requested with ``sort=popular`` and 30 items per page,
    flattened with ``pd.json_normalize``, stripped of a few URL/creator
    columns and written to ``<category>_bypopular.csv`` (tab-separated) in
    the current working directory. The file is created with a header row the
    first time and appended to, without a header, afterwards.

    Args:
        url: Endpoint that is queried for the page.
        token: API token, sent as the ``access_token`` query parameter.
        call_type: Value sent as the ``type`` query parameter.
        page_num: Page to request; also stored in the ``page_num`` column.
        category: Prefix of the CSV file name.

    Returns:
        None. If the request fails, the server answers with an error status,
        the body is not valid JSON or the page is empty, a message is printed
        and nothing is written.
    """
    parameters = {
        'access_token': token,
        'type': call_type,
        'page': page_num,
        'per_page': 30,
        'sort': 'popular',
    }

    try:
        # The context manager closes the session (and its connections) once
        # the page has been fetched; the original never closed it.
        with Session() as session:
            # An explicit timeout is required for Timeout to ever be raised.
            response = session.get(url, params=parameters, timeout=30)
            if not response.ok:
                print(f'Page {page_num} of {category} failed with HTTP status '
                      f'{response.status_code}')
                return
            data = response.json()
    except (ConnectionError, Timeout, TooManyRedirects, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(e)
        return

    print(data)
    if not data:
        # Nothing to store; avoid creating a CSV that only holds a header.
        print(f'Page {page_num} of {category} returned no data')
        return

    df = pd.json_normalize(data)
    # errors='ignore' keeps a payload that lacks some of these columns from
    # aborting the whole run with a KeyError.
    df = df.drop(
        columns=['url', 'public_url', 'creator.first_name', 'creator.last_name',
                 'creator.url', 'creator.public_url', 'creator.thumbnail',
                 'creator.cover', 'tags'],
        errors='ignore',
    )
    df['timestamp'] = pd.to_datetime('now')
    df['page_num'] = page_num

    # NOTE: appended pages carry no header row, so their columns are assumed
    # to line up with those of the first page written.
    csv_path = category + '_bypopular.csv'
    first_write = not os.path.isfile(csv_path)
    df.to_csv(csv_path, mode='w' if first_write else 'a', header=first_write,
              sep='\t')
#%% Run the function for each category
import os
from time import time
from time import sleep
from urllib.parse import parse_qs, urlparse

url_call = 'categories'
api_url = f"https://api.thingiverse.com/{url_call}"
type_call = 'things'
sleep_time = 2  # seconds to pause after each page request
percent = 1  # fraction of each category's pages to download (1 = all of them)

# Walk the slug and URL columns in lockstep instead of indexing by position.
for category, cat_url in zip(categories['slug'], categories['things_url']):
    api_url = f'{cat_url}?access_token={my_token}&per_page=30'
    response = session.get(api_url, timeout=30)

    # The number of pages is read from the "last" relation of the Link header.
    last_link = response.links.get('last', {}).get('url')
    if last_link is None:
        # No "last" relation (assumed to mean there is no further page):
        # request a single page instead of failing with a KeyError.
        last_page = 1
    else:
        page_values = parse_qs(urlparse(last_link).query).get('page')
        # Prefer the explicit `page` parameter; fall back to the text after
        # the final '=' when the URL carries no such parameter.
        last_page = int(page_values[-1] if page_values
                        else last_link.rsplit('=', 1)[-1])
    max_pages = int(last_page * percent)

    for i in range(1, max_pages + 1):
        categories_runner(cat_url, my_token, type_call, i, category)
        print(f'Run N°{i}/{max_pages} completed! Going to sleep {sleep_time} seconds')
        sleep(sleep_time)

exit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment