Created
June 10, 2018 18:02
-
-
Save lemon24/49930d39d3d336e0efda8218d1e30386 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Scoping for https://github.com/lemon24/reader/issues/67 "Tumblr feeds not working".

Output looks like:

    $ python3 tumblr.py
    initial None
    have tumblr redirect

    {}
    parse_urllib_for_cookies None

    {}
    parse_urllib_only None

    {'.tumblr.com': {'/': {'pfg': ...}}}
    parse_requests_for_cookies None

    {'.tumblr.com': {'/': {'pfg': ...}}}
    parse_requests_only ORIANART

""" | |
import urllib.request | |
import re | |
import json | |
import feedparser | |
import requests | |
# Feed used for the experiment; it is fetched once directly and then once by
# each parse_* strategy.
url = 'http://fox-orian.tumblr.com/rss'

# Endpoint that receives the consent JSON POST.
CONSENT_FORM_SVC = 'https://www.tumblr.com/svc/privacy/consent'
def make_headers(consent_form_url, tumblr_form_key):
    """Build the extra HTTP headers sent along with the consent POST.

    consent_form_url is used as the Referer and tumblr_form_key is passed
    through as the X-tumblr-form-key header. Returns a new dict.
    """
    headers = {}
    # anything starting with 'https://www.tumblr.com/' should work
    headers['Referer'] = consent_form_url
    headers['X-tumblr-form-key'] = tumblr_form_key
    return headers
def make_json_data():
    """Return the consent payload: every consent flag set to True.

    A fresh dict is built on each call.
    """
    consent_flags = (
        'eu_resident',
        'gdpr_consent_core',
        'gdpr_consent_first_party_ads',
        'gdpr_consent_search_history',
        'gdpr_consent_third_party_ads',
        'gdpr_is_acceptable_age',
    )
    return dict.fromkeys(consent_flags, True)
def extract_tumblr_form_key(text):
    """Return the form key embedded in the consent page HTML.

    Looks for the first
    <meta name="tumblr-form-key" id="tumblr_form_key" content="..."> tag in
    text and returns the value of its content attribute.

    Raises:
        AssertionError: if text contains no such tag.
    """
    match = re.search(
        r'<meta name="tumblr-form-key" id="tumblr_form_key" content="([^"]+)">',
        text,
    )
    if match is None:
        # Raised explicitly rather than via a bare `assert`: assert statements
        # are stripped under `python -O`, which would turn a missing tag into
        # an opaque AttributeError on None below. The exception type is kept
        # so the failure mode is the same as before.
        raise AssertionError('tumblr-form-key meta tag not found in consent page')
    return match.group(1)
def fill_cookie_jar_requests(session, consent_form_url):
    """Submit the consent form using a requests-style session.

    Fetches consent_form_url, pulls the form key out of the page, then POSTs
    the consent payload as JSON to CONSENT_FORM_SVC with the headers from
    make_headers(). Both responses are asserted to have status code 200.
    Returns None; the session is used for both requests.
    """
    form_page = session.get(consent_form_url)
    assert form_page.status_code == 200

    form_key = extract_tumblr_form_key(form_page.text)
    consent_reply = session.post(
        CONSENT_FORM_SVC,
        headers=make_headers(consent_form_url, form_key),
        json=make_json_data(),
    )
    assert consent_reply.status_code == 200
def fill_cookie_jar_urllib(opener, consent_form_url):
    """Submit the consent form using a urllib opener.

    Fetches consent_form_url through opener, pulls the form key out of the
    page, then POSTs the consent payload as JSON to CONSENT_FORM_SVC with the
    headers from make_headers(). Both responses are asserted to have status
    200. Returns None.

    The extra headers are attached to the POST request only; opener itself is
    not modified.
    """
    with opener.open(consent_form_url) as response:
        text = response.read().decode('utf-8')
        assert response.status == 200

    tumblr_form_key = extract_tumblr_form_key(text)
    headers = make_headers(consent_form_url, tumblr_form_key)
    # urllib labels a request body as application/x-www-form-urlencoded unless
    # a Content-Type is given; declare the body as JSON, matching what the
    # requests-based variant sends via json=.
    # NOTE(review): this may be why the urllib cookie jar stayed empty in the
    # recorded output -- unverified.
    headers['Content-Type'] = 'application/json'
    data = json.dumps(make_json_data()).encode('utf-8')

    # Use per-request headers instead of extending opener.addheaders, which
    # would send the Referer and form key with every later request made
    # through this opener.
    request = urllib.request.Request(CONSENT_FORM_SVC, data=data, headers=headers)
    with opener.open(request) as response:
        response.read()
        assert response.status == 200
def parse_urllib_for_cookies(url, consent_form_url):
    """Fill a cookie jar via urllib, then let feedparser fetch the feed.

    The cookie handler used for the consent requests is handed to
    feedparser.parse() through handlers=. Prints the jar's contents before
    parsing and returns the feedparser result.
    """
    handler = urllib.request.HTTPCookieProcessor()
    fill_cookie_jar_urllib(urllib.request.build_opener(handler), consent_form_url)

    jar = handler.cookiejar
    print(jar._cookies)
    return feedparser.parse(url, handlers=[handler])
def parse_urllib_only(url, consent_form_url):
    """Fill a cookie jar via urllib and fetch the feed with the same opener.

    feedparser only parses the downloaded text here; it does no fetching.
    Prints the jar's contents, asserts the feed response has status 200, and
    returns the feedparser result.
    """
    handler = urllib.request.HTTPCookieProcessor()
    consenting_opener = urllib.request.build_opener(handler)
    fill_cookie_jar_urllib(consenting_opener, consent_form_url)
    print(handler.cookiejar._cookies)

    with consenting_opener.open(url) as feed_response:
        raw_feed = feed_response.read()
        assert feed_response.status == 200

    feed_text = raw_feed.decode('utf-8')
    return feedparser.parse(feed_text)
def parse_requests_for_cookies(url, consent_form_url):
    """Fill a cookie jar via requests, then let feedparser fetch the feed.

    The requests session is pointed at the urllib cookie handler's jar, and
    that handler is then handed to feedparser.parse() through handlers=.
    Prints the jar's contents before parsing and returns the feedparser
    result.
    """
    handler = urllib.request.HTTPCookieProcessor()
    shared_jar = handler.cookiejar

    client = requests.Session()
    client.cookies = shared_jar
    fill_cookie_jar_requests(client, consent_form_url)

    print(shared_jar._cookies)
    return feedparser.parse(url, handlers=[handler])
def parse_requests_only(url, consent_form_url):
    """Fill the session's cookies via requests and fetch the feed with it too.

    feedparser only parses the downloaded bytes here; it does no fetching.
    Prints the session's cookies, asserts the feed response has status code
    200, and returns the feedparser result.
    """
    client = requests.Session()
    fill_cookie_jar_requests(client, consent_form_url)
    print(client.cookies._cookies)

    feed_response = client.get(url)
    assert feed_response.status_code == 200

    raw_feed = feed_response.content
    return feedparser.parse(raw_feed)
# Fetch the feed directly first, to see what feedparser gets on its own.
f = feedparser.parse(url)
print('initial', f.feed.get('title'))

# feedparser only sets 'bozo_exception' when something went wrong, and
# attribute access on a missing key raises AttributeError. Using .get() lets a
# cleanly parsed feed skip the experiment instead of crashing here.
if f.get('bozo_exception'):
    # Only continue if the fetch was redirected to the consent page.
    if f.get('status') == 302 and f.get('href', '').startswith('https://www.tumblr.com/privacy/consent'):
        print('have tumblr redirect')
        print()

        consent_form_url = f.href

        # Each strategy fills a cookie jar and then fetches/parses the feed.
        things = [
            parse_urllib_for_cookies,
            parse_urllib_only,
            parse_requests_for_cookies,
            parse_requests_only,
        ]

        for thing in things:
            f = thing(url, consent_form_url)
            print(thing.__name__, f.feed.get('title'))
            print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment