Created
October 19, 2019 13:46
-
-
Save iSWORD/67075b1b99f9e3120ccd6847dcb36a73 to your computer and use it in GitHub Desktop.
Tumblr audio posts crawler using Tumblr API v1 & Python 3 - API docs at https://www.tumblr.com/docs/en/api/v1#api_read
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import json
import sqlite3
from urllib.parse import parse_qs, urlparse
from xml.etree import ElementTree as ET

import requests
blog_domain = 'tracks.tumblr.com'  # change this
post_type = 'audio'  # leave empty for all post types
posts_per_request = 50  # max is 50
db_filename = '%s.db' % blog_domain

# The v1 "read/json" endpoint wraps its JSON in a JavaScript assignment.
_JSONP_PREFIX = 'var tumblr_api_read = '


def parse_api_response(text):
    """Return the JSON payload of a Tumblr API v1 ``read/json`` response.

    The leading ``var tumblr_api_read = `` assignment and the trailing
    ``;`` are stripped only at the edges of the text, so post content that
    happens to contain the marker is left intact.

    Raises:
        json.JSONDecodeError: if what remains is not valid JSON.
    """
    payload = text.strip()
    if payload.startswith(_JSONP_PREFIX):
        payload = payload[len(_JSONP_PREFIX):]
    payload = payload.rstrip().rstrip(';')
    return json.loads(payload)


def extract_file_url(embed_code):
    """Return the audio file URL found in an ``<iframe>`` embed snippet.

    The ``audio_file`` query parameter of the iframe ``src`` is preferred;
    when it is absent the ``src`` itself is returned. An empty string is
    returned when there is no embed code, when the snippet cannot be
    parsed as XML, or when the iframe has no ``src`` attribute.
    """
    if not embed_code:
        # Non-audio posts carry no embed; ET.fromstring('') would raise.
        return ''
    try:
        iframe = ET.fromstring(embed_code)
    except ET.ParseError:
        # Best-effort crawl: keep the post, just without a file URL.
        return ''
    iframe_src = iframe.attrib.get('src')
    if not iframe_src:
        return ''
    query = parse_qs(urlparse(iframe_src).query)
    audio_files = query.get('audio_file')
    return audio_files[0] if audio_files else iframe_src


def build_post_row(post):
    """Map one API post dict to the named parameters of the INSERT statement.

    Missing optional fields default to ``''``; ``post['id']`` is required
    and converted to ``int``.
    """
    tumblelog = post.get('tumblelog') or {}
    return {
        'id': int(post['id']),
        'title': post.get('id3-title', ''),
        'artist': post.get('id3-artist', ''),
        'album': post.get('id3-album', ''),
        'avatar': tumblelog.get('avatar_url_512', ''),
        'file_url': extract_file_url(post.get('audio-embed', '')),
    }


def main():
    """Crawl the configured blog page by page and store posts in SQLite."""
    session = requests.session()
    db_connection = sqlite3.connect(db_filename)
    try:
        db_cursor = db_connection.cursor()
        db_cursor.execute('''CREATE TABLE IF NOT EXISTS posts
            (id integer primary key, title text, artist text, album text, avatar text, file_url text)''')

        start = 0
        while True:
            print('Starting at %d, requesting %d posts' % (start, posts_per_request))
            request_url = 'https://%s/api/read/json?num=%d&type=%s&start=%d' % \
                (blog_domain, posts_per_request, post_type, start)
            response = session.get(request_url, timeout=30)
            # Fail with a clear HTTP error instead of a JSON decode error.
            response.raise_for_status()
            posts = parse_api_response(response.text)['posts']
            if not posts:
                break
            print('Found %d posts' % len(posts))
            for post in posts:
                post_data = build_post_row(post)
                print(post_data['id'])
                db_cursor.execute(
                    "INSERT OR REPLACE INTO posts VALUES (:id,:title,:artist,:album,:avatar,:file_url)",
                    post_data)
            # Commit per page so a later failure keeps what was already fetched.
            db_connection.commit()
            start += posts_per_request
    finally:
        db_connection.close()

    print('All done.')
    print('Saved to %s' % db_filename)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment