Last active
April 15, 2024 04:52
-
-
Save luisdamed/f3e262971211184f232cea8553729a85 to your computer and use it in GitHub Desktop.
Collect data from Thingiverse.com REST API. You need to get an authorization token in order to use it - it is free.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#%% Thingiverse_API_runner
# Import libraries and make a first request to get the data about the different categories
import os

import pandas as pd
from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
# Show every column when a DataFrame is displayed in an interactive cell.
pd.set_option('display.max_columns', None)

# Placeholder value: replace it with a personal access token.
my_token = 'get yours at https://www.thingiverse.com/developers'
url_call = 'categories'
api_url = f"https://api.thingiverse.com/{url_call}"
page = 1
type_call = 'things'
parameters = {'access_token': my_token}

# This session is reused later by the per-category loop.
session = Session()
try:
    response = session.get(api_url, params=parameters)
except (ConnectionError, Timeout, TooManyRedirects) as e:
    print(e)
    # Without the category list nothing below can run; re-raise so the real
    # network error is reported instead of a NameError on ``data``.
    raise
# Fail loudly on 4xx/5xx (e.g. an invalid token) rather than normalising an
# error payload as if it were the category list.
response.raise_for_status()
data = response.json()

#%% Normalize categories dict
categories = pd.json_normalize(data)
categories['timestamp'] = pd.to_datetime('now')  # time the snapshot was taken
categories  # bare expression: displays the frame when run as an interactive cell
#%% Define function to get the data from the site for a given category
def categories_runner(url, token, call_type, page_num, category):
    """Fetch one page of a category's things and append it to a CSV file.

    Sends a GET request to ``url`` with the query parameters
    ``access_token``, ``type``, ``page``, ``per_page`` (30) and ``sort``
    ('popular'), flattens the JSON response with ``pandas.json_normalize``
    and writes the rows, tab-separated, to ``<category>_bypopular.csv`` in
    the current working directory.  The file gets a header row when it is
    first created; later calls append rows without a header.

    Connection-level failures are printed and the page is skipped.  A page
    with no records is skipped as well, so it cannot add an empty chunk to
    the file.

    Args:
        url: Endpoint to query.
        token: API access token, sent as ``access_token``.
        call_type: Value sent as the ``type`` query parameter.
        page_num: Page to request; also stored in the ``page_num`` column.
        category: Prefix of the output file name.

    Returns:
        None.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status code.
    """
    parameters = {
        'access_token': token,
        'type': call_type,
        'page': page_num,
        'per_page': 30,
        'sort': 'popular',
    }
    try:
        # The context manager releases the session's connections even when
        # the request fails.
        with Session() as session:
            response = session.get(url, params=parameters)
    except (ConnectionError, Timeout, TooManyRedirects) as e:
        print(e)
        # There is no payload to process; skip this page.
        return
    # Do not let an error payload be written to the CSV as if it were data.
    response.raise_for_status()
    data = response.json()
    print(data)

    df = pd.json_normalize(data)
    if df.empty:
        return

    # URL, thumbnail and personal-name columns are not kept in the output.
    # errors='ignore' tolerates pages where some of them are absent.
    unwanted_columns = [
        'url', 'public_url', 'creator.first_name', 'creator.last_name',
        'creator.url', 'creator.public_url', 'creator.thumbnail',
        'creator.cover', 'tags',
    ]
    df = df.drop(columns=unwanted_columns, errors='ignore')
    df['timestamp'] = pd.to_datetime('now')
    df['page_num'] = page_num

    csv_path = category + '_bypopular.csv'
    # Append mode creates the file if needed; only a new file gets a header.
    write_header = not os.path.isfile(csv_path)
    df.to_csv(csv_path, mode='a', header=write_header, sep='\t')
#%% Run the function for each category
import os
from time import time
from time import sleep
from urllib.parse import parse_qs, urlsplit

url_call = 'categories'
api_url = f"https://api.thingiverse.com/{url_call}"
type_call = 'things'
sleep_time = 2  # seconds to pause after each page request
percent = 1  # fraction of each category's pages to download (1 = all of them)

for category, cat_url in zip(categories['slug'], categories['things_url']):
    # Probe request: only its pagination ``Link`` header is used, to learn
    # how many pages the category has at 30 items per page.
    response = session.get(cat_url, params={'access_token': my_token, 'per_page': 30})
    response.raise_for_status()

    last_url = response.links.get('last', {}).get('url')
    if last_url is None:
        # No rel="last" link: treat the category as a single page.
        total_pages = 1
    else:
        page_values = parse_qs(urlsplit(last_url).query).get('page')
        # Prefer the explicit ``page`` parameter; otherwise fall back to
        # whatever follows the final '=' in the URL.
        total_pages = int(page_values[0] if page_values else last_url.rsplit('=', 1)[-1])
    max_pages = int(total_pages * percent)

    for i in range(1, max_pages + 1):
        categories_runner(cat_url, my_token, type_call, i, category)
        print(f'Run N°{i}/{max_pages} completed! Going to sleep {sleep_time} seconds')
        sleep(sleep_time)

# NOTE(review): the original indentation was lost; exit() is assumed to run
# once, after every category has been processed - confirm.
exit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment