Skip to content

Instantly share code, notes, and snippets.

@p3t3r67x0
Last active October 29, 2017 15:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save p3t3r67x0/5d535dde9bb79cf9ffddeb644e21fd5c to your computer and use it in GitHub Desktop.
Save p3t3r67x0/5d535dde9bb79cf9ffddeb644e21fd5c to your computer and use it in GitHub Desktop.
Extract the MP4 URL and a proper title from ARTE.tv
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import json
import requests
from lxml import html
from urlparse import urlparse
from fake_useragent import UserAgent
# Shared fake_useragent handle; main() reads ua.chrome to obtain the
# User-Agent header value it passes to request_content().
ua = UserAgent()
def replace_punctuation_mark(text):
    """Return *text* without punctuation and with spaces as underscores.

    The degree sign, comma, hyphen, slash, colon, exclamation mark, question
    mark, full stop and apostrophe are removed; every space becomes ``_``.

    :param text: the string to sanitise.
    :returns: the sanitised string.
    """
    # Removing a character never creates another character from the list, and
    # the underscore is not in the list either, so the order of the passes
    # does not influence the result.
    for mark in (u'°', ',', '-', '/', ':', '!', '?', '.', '\''):
        text = text.replace(mark, '')
    return text.replace(' ', '_')
def find_video_url(json_content):
    """Print a file name for the video and return its stream URL.

    The file name is built from the lower-cased ``VTI`` value and, when
    present, the lower-cased ``subtitle`` value of the ``videoJsonPlayer``
    object, both passed through ``replace_punctuation_mark``.

    :param json_content: JSON document as text.
    :returns: the value stored under
        ``videoJsonPlayer -> VSR -> HTTPS_SQ_1 -> url``.
    :raises KeyError: if ``videoJsonPlayer``, ``VTI`` or any key on the path
        to the stream URL is missing.
    """
    player = json.loads(json_content)['videoJsonPlayer']
    content_title = replace_punctuation_mark(player['VTI'].lower())

    # The subtitle is optional: a missing key and a JSON null are both treated
    # as "no subtitle" (a null used to crash on .lower()).
    subtitle = player.get('subtitle')
    if subtitle is None:
        content_subtitle = ''
    else:
        content_subtitle = u'_{}'.format(
            replace_punctuation_mark(subtitle.lower()))

    file_name = u'{}{}.mp4'.format(content_title, content_subtitle)
    # Tidy up doubled underscores and an underscore right before the extension.
    # A single parenthesised argument prints the same on Python 2 and 3.
    print(file_name.replace('__', '_').replace('_.', '.'))
    return player['VSR']['HTTPS_SQ_1']['url']
def find_iframe_url(text):
    """Return the ``src`` attribute of the first ``<iframe>`` matched in *text*.

    :param text: HTML markup to parse.
    :returns: the first result of the ``//iframe/@src`` XPath query.
    :raises IndexError: if the document contains no ``<iframe>`` with a
        ``src`` attribute.
    """
    document = html.document_fromstring(text)
    sources = document.xpath('//iframe/@src')
    if not sources:
        # Same exception type as the bare [0] lookup raised before, but with a
        # message that says what is actually missing.
        raise IndexError('no <iframe> with a src attribute found in the document')
    return sources[0]
def request_content(url, ua_string, timeout=30):
    """Fetch *url* with a GET request and return the response body as text.

    :param url: address to request.
    :param ua_string: value sent in the ``User-Agent`` header.
    :param timeout: seconds handed to ``requests.get`` as its ``timeout``
        argument, so that an unresponsive server cannot block forever.
    :returns: the response text when the status code is 200, otherwise
        ``None``.
    """
    headers = {'User-Agent': ua_string, 'Origin': 'https://www.arte.tv'}
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # Callers must be prepared for None on any non-200 answer.
        return None
    return response.text
def extract_target_json_url(url):
    """Return the query part of *url* after a fixed set of text substitutions.

    The substitutions drop the ``json_url=`` prefix and decode a handful of
    percent-escapes, so the query string that remains starts with the embedded
    URL in readable form.

    :param url: iframe URL carrying a percent-encoded ``json_url`` parameter.
    :returns: the ``query`` component reported by ``urlparse`` for the
        rewritten URL (an empty string when there is no ``?``).
    """
    # Applied strictly in this order, one pass per entry.
    substitutions = (
        ('json_url=', ''),
        ('%3A', ':'), ('%3a', ':'),
        ('%2F', '/'), ('%2f', '/'),
        ('%3F', '?'), ('%3f', '?'),
        # NOTE(review): this decodes '=' only when it is followed by '1' and
        # drops that '1' ("autostart%3D1" becomes "autostart="). Kept as-is
        # because the intent cannot be confirmed from this file.
        ('%3D1', '='), ('%3d1', '='),
        ('%26', '&'),
        ('lifeCycle', 'lifecycle'),
    )
    for old, new in substitutions:
        url = url.replace(old, new)
    return urlparse(url).query
def main():
    """Command-line entry point: print a file name and the MP4 URL.

    Reads the ARTE.tv page address from the first command-line argument,
    follows the page's iframe to the JSON document it references and prints
    the stream URL returned by ``find_video_url`` (which prints the file name
    itself). Exits with status 1 when no argument is given or when one of the
    two downloads does not return content.
    """
    if len(sys.argv) < 2:
        print('Enter the ARTE.tv url you want to download the mp4!')
        sys.exit(1)
    arte_url = sys.argv[1]

    content = request_content(arte_url, ua.chrome)
    if content is None:
        # request_content returns None for any non-200 response; stop here
        # instead of handing None to the HTML parser.
        sys.exit(u'Could not fetch {}'.format(arte_url))

    iframe_url = find_iframe_url(content)
    target_json_url = extract_target_json_url(iframe_url)

    json_content = request_content(target_json_url, ua.chrome)
    if json_content is None:
        sys.exit(u'Could not fetch {}'.format(target_json_url))

    print(find_video_url(json_content))
# Run the extractor only when the file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment