Last active
October 14, 2015 19:08
-
-
Save justinlittman/a46ab82f456423a71e39 to your computer and use it in GitHub Desktop.
Recording Flickr API calls to WARC using httplib debugging
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WARC/1.0 | |
WARC-Type: request | |
Content-Length: 528 | |
WARC-Date: 2015-10-14T18:50:56Z | |
WARC-Payload-Digest: sha1:2af4ebafc68cda47bc56df9047c5d46457690d54 | |
WARC-Target-URI: https://api.flickr.com/services/rest/?photo_id=16610484049&secret=ee80d9ecdc&nojsoncallback=1&method=flickr.photos.getInfo&format=json | |
Content-Type: application/http; msgtype=request | |
WARC-Record-ID: <urn:uuid:80ae5e4c-72a4-11e5-a45f-2cf0ee020fec> | |
POST /services/rest/?photo_id=16610484049&secret=ee80d9ecdc&nojsoncallback=1&method=flickr.photos.getInfo&format=json HTTP/1.1 | |
Host: api.flickr.com | |
Content-Length: 0 | |
Accept-Encoding: gzip, deflate | |
Accept: */* | |
User-Agent: python-requests/2.8.1 | |
Connection: close | |
Authorization: OAuth oauth_nonce="53352062120420734791444848655", oauth_timestamp="1444848655", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="abddfe6fb8bba36e8ef0278ec65dbbc8", oauth_signature="k82HBMNR23WIiezIqopt4me8YYk%3D" | |
WARC/1.0 | |
WARC-Type: response | |
Content-Length: 1743 | |
WARC-Date: 2015-10-14T18:50:56Z | |
WARC-Payload-Digest: sha1:0e48c8a05ab98e0da27bf85699bacf80d951ad8e | |
WARC-Target-URI: https://api.flickr.com/services/rest/?photo_id=16610484049&secret=ee80d9ecdc&nojsoncallback=1&method=flickr.photos.getInfo&format=json | |
Content-Type: application/http; msgtype=response | |
WARC-Record-ID: <urn:uuid:80b4b563-72a4-11e5-9168-2cf0ee020fec> | |
HTTP/1.1 200 OK | |
Date: Wed, 14 Oct 2015 18:50:56 GMT | |
Content-Type: application/json | |
Content-Length: 542 | |
P3P: policyref="https://policies.yahoo.com/w3c/p3p.xml", CP="CAO DSP COR CUR ADM DEV TAI PSA PSD IVAi IVDi CONi TELo OTPi OUR DELi SAMi OTRi UNRi PUBi IND PHY ONL UNI PUR FIN COM NAV INT DEM CNT STA POL HEA PRE LOC GOV" | |
Cache-Control: private | |
X-Served-By: www304.flickr.bf1.yahoo.com | |
Vary: Accept-Encoding | |
Content-Encoding: gzip | |
Age: 0 | |
Via: http/1.1 fts128.flickr.bf1.yahoo.com (ApacheTrafficServer [cMsSf ]), http/1.1 r01.ycpi.nyc.yahoo.net (ApacheTrafficServer [cMsSf ]) | |
Server: ATS | |
Connection: close | |
Y-Trace: BAEAQAAAAAAuXNR7XErSagAAAAAAAAAAxGSLpp1Ayb0AAAAAAAAAAAAFIhUK60oqAAUiFQrtGCuFIsgnAAAAAA-- | |
{"photo":{"id":"16610484049","secret":"ee80d9ecdc","server":"8751","farm":9,"dateuploaded":"1426191780","isfavorite":0,"license":"0","safety_level":"0","rotation":0,"originalsecret":"d205cb161b","originalformat":"jpg","owner":{"nsid":"131866249@N02","username":"justin.littman","realname":"Justin Littman","location":"","iconserver":"0","iconfarm":0,"path_alias":null},"title":{"_content":"IMG_7599"},"description":{"_content":""},"visibility":{"ispublic":1,"isfriend":0,"isfamily":0},"dates":{"posted":"1426191780","taken":"2013-03-29 15:20:10","takengranularity":"0","takenunknown":"0","lastupdate":"1426251618"},"views":"0","editability":{"cancomment":0,"canaddmeta":0},"publiceditability":{"cancomment":1,"canaddmeta":0},"usage":{"candownload":1,"canblog":0,"canprint":0,"canshare":1},"comments":{"_content":"0"},"notes":{"note":[]},"people":{"haspeople":0},"tags":{"tag":[]},"urls":{"url":[{"type":"photopage","_content":"https:\/\/www.flickr.com\/photos\/131866249@N02\/16610484049\/"}]},"media":"photo"},"stat":"ok"} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import flickrapi | |
import StringIO | |
import sys | |
import warc as ia_warc | |
import httplib | |
def wrap_api_call(exec_func, debuggable, base_url):
    """
    Runs an API call and turns its captured HTTP traffic into WARC records.

    exec_func is a callable taking no arguments that performs the API call.
    debuggable is the object whose debuglevel is switched on while it runs.
    base_url is prepended to the path parsed from the request header.

    Returns a (request record, response record, raw response) tuple.
    """
    #Run the call with debugging on, keeping whatever was written to stdout
    raw_resp, captured = wrap_execute(exec_func, debuggable)
    #One request header followed by one response header is expected
    headers = parse_capture(captured)
    assert len(headers) == 2
    request_header = headers[0]
    response_header = headers[1]
    #The request header only carries the path, so the base is added back
    url = base_url + parse_url(request_header)
    #The request record has headers only; the response also gets the body
    request_record = to_warc_record("request", url, http_header=request_header)
    response_record = to_warc_record("response", url, http_body=raw_resp, http_header=response_header)
    return request_record, response_record, raw_resp
def wrap_execute(exec_func, debuggable):
    """
    Enables debugging on debuggable, calls an API function, and captures output.

    exec_func is called with no arguments. While it runs, sys.stdout is
    replaced with an in-memory buffer and debuggable.debuglevel is set to 1.
    Both are put back to the values they had on entry, even if exec_func
    raises.

    Returns a (return value of exec_func, capture buffer) tuple.
    """
    #When debuglevel is set httplib outputs details to stdout.
    #This captures stdout.
    capture_out = StringIO.StringIO()
    #Remember the state on entry so that a stdout that was already redirected
    #or a debuglevel that was already set is not clobbered on the way out.
    previous_stdout = sys.stdout
    previous_debuglevel = getattr(debuggable, "debuglevel", 0)
    sys.stdout = capture_out
    debuggable.debuglevel = 1
    try:
        return_values = exec_func()
    finally:
        #Stop capturing stdout
        sys.stdout = previous_stdout
        debuggable.debuglevel = previous_debuglevel
    return return_values, capture_out
def parse_capture(capture_out):
    """
    Transform the captured stdout into a series of request and response headers

    capture_out is a seekable file-like object. It is rewound and read line by
    line:
      a "send:" line yields a request header,
      a "reply:" line starts a response header, and
      a "header:" line adds one more line to the current response header.

    Returns a list of headers in the order they were seen. The last entry is
    whatever response header was being built when the input ended, which is
    None if there was none.

    Raises AssertionError if a "send:" line contains neither GET nor POST.
    """
    http_headers = []
    #Reset to the beginning of capture_out
    capture_out.seek(0)
    response_header = None
    for line in capture_out:
        if line.startswith("send:"):
            #Push last req and resp
            if response_header:
                #Response record
                http_headers.append(response_header)
                response_header = None
            #The request starts at whichever method name appears first.
            #Looking for GET before POST would cut a POST request at the
            #wrong place when its url or headers happen to contain "GET".
            positions = [pos for pos in (line.find("GET"), line.find("POST")) if pos != -1]
            if not positions:
                #Raised explicitly so the check is not stripped by python -O
                raise AssertionError("send line does not contain GET or POST")
            start = min(positions)
            #Drop the last 2 characters and turn escaped line breaks into real ones
            request_header = line[start:-2].replace("\\r\\n", "\r\n")
            #Request record
            http_headers.append(request_header)
        elif line.startswith("reply:"):
            #Start of the response header
            response_header = line[8:-6] + "\r\n"
        elif line.startswith("header:"):
            #Append additional headers to response header
            response_header += line[8:-2] + "\r\n"
    #Push the last response
    http_headers.append(response_header)
    return http_headers
def parse_url(http_header):
    """
    Parse the url from the http request header.
    Note that this excludes the protocol, host, and port.

    http_header must start with a GET or POST request line, for example
    "GET /path?query HTTP/1.1". The text between the method and " HTTP/"
    is returned.

    Raises AssertionError if the header does not start with GET or POST, or
    if it does not contain " HTTP/".
    """
    #The checks below raise explicitly instead of using assert statements so
    #that they still run under python -O.
    if http_header.startswith("GET"):
        start_pos = 4
    elif http_header.startswith("POST"):
        start_pos = 5
    else:
        raise AssertionError("http header does not start with GET or POST")
    end_pos = http_header.find(" HTTP/")
    if end_pos == -1:
        raise AssertionError("http header does not contain an HTTP version")
    return http_header[start_pos:end_pos]
def to_warc_record(warc_type, url, http_header=None, http_body=None, concurrent_to_warc_record=None,
                   headers=None):
    """
    Builds a WARC record of the given type for url.

    The payload is http_header followed by "\\r\\n" and http_body when both
    are given, whichever one is given otherwise, or None when neither is.
    Entries in headers are added to the WARC headers. When
    concurrent_to_warc_record is given, its record id is stored as
    WARC-Concurrent-To.
    """
    #Payload: header and body joined, or whichever one is present
    if http_header and http_body:
        payload = http_header + "\r\n" + http_body
    elif http_body:
        payload = http_body
    elif http_header:
        payload = http_header
    else:
        payload = None
    #WARC headers: the two fixed entries first, then any caller extras
    warc_headers = {}
    warc_headers["WARC-Target-URI"] = url
    warc_headers["WARC-Type"] = warc_type
    if headers:
        warc_headers.update(headers)
    #Set last so that it wins over a same-named entry in headers
    if concurrent_to_warc_record:
        warc_headers["WARC-Concurrent-To"] = concurrent_to_warc_record.header.record_id
    return ia_warc.WARCRecord(payload=payload, headers=warc_headers)
#Construct an API
api = flickrapi.FlickrAPI("<YOUR KEY HERE>", "<YOUR SECRET HERE>", store_token=False)


def _get_photo_info():
    """Requests info on the sample photo in json format."""
    return api.photos.getInfo(photo_id="16610484049", secret="ee80d9ecdc", format='json')


#Record the call; debuglevel is switched on the HTTPConnection class itself
request_record, response_record, raw_json_resp = wrap_api_call(
    _get_photo_info, httplib.HTTPConnection, "https://api.flickr.com")

#Write both records to the warc, closing it even if a write fails
warc = ia_warc.open("technique1.warc", "w")
try:
    for record in (request_record, response_record):
        warc.write_record(record)
finally:
    warc.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment