Skip to content

Instantly share code, notes, and snippets.

@iSWORD
Created October 19, 2019 13:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iSWORD/67075b1b99f9e3120ccd6847dcb36a73 to your computer and use it in GitHub Desktop.
Save iSWORD/67075b1b99f9e3120ccd6847dcb36a73 to your computer and use it in GitHub Desktop.
Tumblr audio posts crawler using Tumblr API v1 & Python 3 - API docs at https://www.tumblr.com/docs/en/api/v1#api_read
#!/usr/bin/env python3
import requests, json, sqlite3
from xml.etree import ElementTree as ET
from urllib.parse import urlparse, parse_qs
blog_domain = 'tracks.tumblr.com'  # change this
post_type = 'audio'  # leave empty for all post types
posts_per_request = 50  # max is 50
db_filename = '%s.db' % blog_domain

# The read endpoint answers with a JavaScript assignment, not bare JSON.
JSONP_PREFIX = 'var tumblr_api_read = '
# Seconds to wait on the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

CREATE_TABLE_SQL = '''CREATE TABLE IF NOT EXISTS posts
(id integer primary key, title text, artist text, album text, avatar text, file_url text)'''
INSERT_SQL = ("INSERT OR REPLACE INTO posts "
              "VALUES (:id,:title,:artist,:album,:avatar,:file_url)")


def parse_api_response(text):
    """Decode the body returned by the read endpoint into a Python object.

    The body looks like ``var tumblr_api_read = {...};``. The prefix and the
    trailing semicolon are removed explicitly (rather than by slicing off a
    fixed number of characters) so that a missing or extra trailing newline
    does not corrupt the JSON payload.

    Raises:
        ValueError: if the remaining text is not valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    payload = text.strip()
    if payload.startswith(JSONP_PREFIX):
        payload = payload[len(JSONP_PREFIX):]
    payload = payload.rstrip().rstrip(';')
    return json.loads(payload)


def extract_file_url(embed_code):
    """Return the audio file URL referenced by a post's embed markup.

    The embed is parsed as a single XML element and its ``src`` attribute is
    read. When ``src`` carries an ``audio_file`` query parameter, that value
    is returned; otherwise ``src`` itself is returned.

    Returns an empty string when the embed is empty, cannot be parsed as XML,
    or has no ``src`` attribute, so one odd post cannot abort the whole crawl.
    """
    if not embed_code:
        # Posts without an audio embed (e.g. other post types) have no URL.
        return ''
    try:
        iframe = ET.fromstring(embed_code)
    except ET.ParseError:
        # Embed markup is HTML and is not guaranteed to be well-formed XML.
        return ''
    src = iframe.attrib.get('src')
    if not src:
        return ''
    audio_files = parse_qs(urlparse(src).query).get('audio_file')
    return audio_files[0] if audio_files else src


def build_post_row(post):
    """Map one post dict from the API onto the columns of the ``posts`` table.

    Missing optional fields default to an empty string.

    Raises:
        KeyError: if the post has no ``id``.
    """
    tumblelog = post.get('tumblelog') or {}
    return {
        'id': int(post['id']),
        'title': post.get('id3-title', ''),
        'artist': post.get('id3-artist', ''),
        'album': post.get('id3-album', ''),
        'avatar': tumblelog.get('avatar_url_512', ''),
        'file_url': extract_file_url(post.get('audio-embed', '')),
    }


def fetch_posts(session, start):
    """Request one page of posts beginning at offset ``start``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    request_url = 'https://%s/api/read/json?num=%d&type=%s&start=%d' % \
        (blog_domain, posts_per_request, post_type, start)
    response = session.get(request_url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return parse_api_response(response.text)['posts']


def main():
    """Crawl every page of the blog and store the posts in the SQLite file."""
    session = requests.session()
    db_connection = sqlite3.connect(db_filename)
    try:
        db_cursor = db_connection.cursor()
        db_cursor.execute(CREATE_TABLE_SQL)
        start = 0
        while True:
            print('Starting at %d, requesting %d posts' % (start, posts_per_request))
            posts = fetch_posts(session, start)
            if not posts:
                break
            print('Found %d posts' % len(posts))
            for post in posts:
                post_data = build_post_row(post)
                print(post_data['id'])
                if not post_data['file_url']:
                    print('No audio URL found for post %d' % post_data['id'])
                db_cursor.execute(INSERT_SQL, post_data)
            # Commit page by page so a later failure keeps what was fetched.
            db_connection.commit()
            start += posts_per_request
    finally:
        # Release the database file and the HTTP connections on any exit path.
        db_connection.close()
        session.close()
    print('All done.')
    print('Saved to %s' % db_filename)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment