Skip to content

Instantly share code, notes, and snippets.

@justinlittman
Last active October 14, 2015 19:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save justinlittman/a46ab82f456423a71e39 to your computer and use it in GitHub Desktop.
Save justinlittman/a46ab82f456423a71e39 to your computer and use it in GitHub Desktop.
Recording Flickr API calls to WARC using httplib debugging
WARC/1.0
WARC-Type: request
Content-Length: 528
WARC-Date: 2015-10-14T18:50:56Z
WARC-Payload-Digest: sha1:2af4ebafc68cda47bc56df9047c5d46457690d54
WARC-Target-URI: https://api.flickr.com/services/rest/?photo_id=16610484049&secret=ee80d9ecdc&nojsoncallback=1&method=flickr.photos.getInfo&format=json
Content-Type: application/http; msgtype=request
WARC-Record-ID: <urn:uuid:80ae5e4c-72a4-11e5-a45f-2cf0ee020fec>
POST /services/rest/?photo_id=16610484049&secret=ee80d9ecdc&nojsoncallback=1&method=flickr.photos.getInfo&format=json HTTP/1.1
Host: api.flickr.com
Content-Length: 0
Accept-Encoding: gzip, deflate
Accept: */*
User-Agent: python-requests/2.8.1
Connection: close
Authorization: OAuth oauth_nonce="53352062120420734791444848655", oauth_timestamp="1444848655", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="abddfe6fb8bba36e8ef0278ec65dbbc8", oauth_signature="k82HBMNR23WIiezIqopt4me8YYk%3D"
WARC/1.0
WARC-Type: response
Content-Length: 1743
WARC-Date: 2015-10-14T18:50:56Z
WARC-Payload-Digest: sha1:0e48c8a05ab98e0da27bf85699bacf80d951ad8e
WARC-Target-URI: https://api.flickr.com/services/rest/?photo_id=16610484049&secret=ee80d9ecdc&nojsoncallback=1&method=flickr.photos.getInfo&format=json
Content-Type: application/http; msgtype=response
WARC-Record-ID: <urn:uuid:80b4b563-72a4-11e5-9168-2cf0ee020fec>
HTTP/1.1 200 OK
Date: Wed, 14 Oct 2015 18:50:56 GMT
Content-Type: application/json
Content-Length: 542
P3P: policyref="https://policies.yahoo.com/w3c/p3p.xml", CP="CAO DSP COR CUR ADM DEV TAI PSA PSD IVAi IVDi CONi TELo OTPi OUR DELi SAMi OTRi UNRi PUBi IND PHY ONL UNI PUR FIN COM NAV INT DEM CNT STA POL HEA PRE LOC GOV"
Cache-Control: private
X-Served-By: www304.flickr.bf1.yahoo.com
Vary: Accept-Encoding
Content-Encoding: gzip
Age: 0
Via: http/1.1 fts128.flickr.bf1.yahoo.com (ApacheTrafficServer [cMsSf ]), http/1.1 r01.ycpi.nyc.yahoo.net (ApacheTrafficServer [cMsSf ])
Server: ATS
Connection: close
Y-Trace: BAEAQAAAAAAuXNR7XErSagAAAAAAAAAAxGSLpp1Ayb0AAAAAAAAAAAAFIhUK60oqAAUiFQrtGCuFIsgnAAAAAA--
{"photo":{"id":"16610484049","secret":"ee80d9ecdc","server":"8751","farm":9,"dateuploaded":"1426191780","isfavorite":0,"license":"0","safety_level":"0","rotation":0,"originalsecret":"d205cb161b","originalformat":"jpg","owner":{"nsid":"131866249@N02","username":"justin.littman","realname":"Justin Littman","location":"","iconserver":"0","iconfarm":0,"path_alias":null},"title":{"_content":"IMG_7599"},"description":{"_content":""},"visibility":{"ispublic":1,"isfriend":0,"isfamily":0},"dates":{"posted":"1426191780","taken":"2013-03-29 15:20:10","takengranularity":"0","takenunknown":"0","lastupdate":"1426251618"},"views":"0","editability":{"cancomment":0,"canaddmeta":0},"publiceditability":{"cancomment":1,"canaddmeta":0},"usage":{"candownload":1,"canblog":0,"canprint":0,"canshare":1},"comments":{"_content":"0"},"notes":{"note":[]},"people":{"haspeople":0},"tags":{"tag":[]},"urls":{"url":[{"type":"photopage","_content":"https:\/\/www.flickr.com\/photos\/131866249@N02\/16610484049\/"}]},"media":"photo"},"stat":"ok"}
import flickrapi
import StringIO
import sys
import warc as ia_warc
import httplib
def wrap_api_call(exec_func, debuggable, base_url):
    """
    Runs an API call and turns the captured HTTP exchange into WARC records.

    exec_func is invoked through wrap_execute() with debugging switched on for
    debuggable; the captured output is parsed into exactly one request header
    and one response header.  base_url is prepended to the path parsed from
    the request header to form the target URI of both records.

    Returns a tuple of (request record, response record, raw return value of
    exec_func).
    """
    raw_resp, capture_out = wrap_execute(exec_func, debuggable)

    # Exactly one request/response pair is expected per call.
    http_headers = parse_capture(capture_out)
    assert len(http_headers) == 2
    request_header, response_header = http_headers

    # The request line only carries the path, so the scheme and host come
    # from base_url.
    url = base_url + parse_url(request_header)

    return (
        to_warc_record("request", url, http_header=request_header),
        to_warc_record("response", url, http_body=raw_resp, http_header=response_header),
        raw_resp,
    )
def wrap_execute(exec_func, debuggable):
    """
    Enables debugging on debuggable, calls an API function, and captures output.

    exec_func is a zero-argument callable.  While it runs, debuggable.debuglevel
    is set to 1 and sys.stdout is replaced by an in-memory buffer, because the
    debug details are written to stdout.

    Both sys.stdout and debuggable.debuglevel are put back to the values they
    had before the call (also when exec_func raises), so an outer redirection
    of stdout or a previously configured debug level is not lost.

    Returns a tuple of (return value of exec_func, buffer holding the captured
    output).
    """
    capture_out = StringIO.StringIO()
    # Remember the current state rather than assuming sys.__stdout__ / 0:
    # the caller may already have redirected stdout or enabled debugging.
    previous_stdout = sys.stdout
    previous_debuglevel = getattr(debuggable, "debuglevel", 0)
    try:
        sys.stdout = capture_out
        debuggable.debuglevel = 1
        return_values = exec_func()
    finally:
        #Stop capturing stdout
        sys.stdout = previous_stdout
        debuggable.debuglevel = previous_debuglevel
    return return_values, capture_out
def _unwrap_repr(text):
    """
    Strip surrounding whitespace and the quotes (plus an optional b/u prefix)
    that repr() put around a logged string.  Escape sequences inside the
    string are left untouched.
    """
    text = text.strip()
    if text[:1] in ("b", "u") and text[1:2] in ("'", '"'):
        text = text[1:]
    if len(text) >= 2 and text[0] in ("'", '"') and text[-1] == text[0]:
        text = text[1:-1]
    return text


def parse_capture(capture_out):
    """
    Transform the captured stdout into a series of request and response headers

    capture_out is a seekable, line-iterable buffer holding debug output made
    of "send:", "reply:" and "header:" lines.  Each "send:" line yields one
    request header; each "reply:" line plus the "header:" lines that follow it
    yield one response header.  Lines use "\r\n" as separator in the result.

    Returns a list of header strings in the order they were captured.  Nothing
    is appended for a response that was never started, so an empty capture
    gives an empty list.

    Raises AssertionError when a "send:" line does not start with GET or POST.
    """
    http_headers = []
    #Reset to the beginning of capture_out
    capture_out.seek(0)
    response_header = None
    for line in capture_out:
        if line.startswith("send:"):
            #Push the response belonging to the previous request
            if response_header:
                http_headers.append(response_header)
                response_header = None
            request_header = _unwrap_repr(line[len("send:"):])
            #The method must be at the very start of the sent data; searching
            #the whole line would also match "GET" inside a URL or header value.
            assert request_header.startswith(("GET ", "POST ")), \
                "send line does not start with GET or POST"
            #The line holds the repr() of the data, so line breaks are escaped.
            http_headers.append(request_header.replace("\\r\\n", "\r\n"))
        elif line.startswith("reply:"):
            #Start of the response header
            status_line = _unwrap_repr(line[len("reply:"):])
            for escaped_eol in ("\\r\\n", "\\n"):
                if status_line.endswith(escaped_eol):
                    status_line = status_line[:-len(escaped_eol)]
                    break
            response_header = status_line + "\r\n"
        elif line.startswith("header:"):
            #Append additional headers to response header
            response_header += line[len("header: "):].rstrip("\r\n") + "\r\n"
    #Push the last response, if there was one
    if response_header is not None:
        http_headers.append(response_header)
    return http_headers
def parse_url(http_header):
    """
    Extract the request target from an http request header.

    The result is the text between the method and the " HTTP/" version marker
    of the request line, so it does not include the protocol, host, or port.
    Only headers starting with GET or POST are accepted; anything else, or a
    header without " HTTP/", fails an assertion.
    """
    is_get = http_header.startswith("GET")
    assert is_get or http_header.startswith("POST"), "http header does not start with GET or POST"
    #Skip the method name and the single space that follows it
    path_start = 4 if is_get else 5
    path_end = http_header.find(" HTTP/")
    assert path_end != -1
    return http_header[path_start:path_end]
def to_warc_record(warc_type, url, http_header=None, http_body=None, concurrent_to_warc_record=None,
                   headers=None):
    """
    Builds a WARC record of the given type for url.

    warc_type and url become the WARC-Type and WARC-Target-URI headers; any
    entries in headers are merged on top of those.  When
    concurrent_to_warc_record is given, its record id is stored as
    WARC-Concurrent-To.

    The payload is http_header and http_body joined by "\r\n" when both are
    present, whichever one is present otherwise, or None when neither is.
    """
    warc_headers = {
        "WARC-Target-URI": url,
        "WARC-Type": warc_type
    }
    if headers:
        warc_headers.update(headers)
    if concurrent_to_warc_record:
        warc_headers["WARC-Concurrent-To"] = concurrent_to_warc_record.header.record_id

    if http_header and http_body:
        payload = http_header + "\r\n" + http_body
    else:
        #At most one of the two is usable here; fall back to None if neither is
        payload = http_header or http_body or None
    return ia_warc.WARCRecord(payload=payload, headers=warc_headers)
#Flickr client; replace the placeholders with real credentials before running
api = flickrapi.FlickrAPI("<YOUR KEY HERE>", "<YOUR SECRET HERE>", store_token=False)

#Fetch the photo info while recording the HTTP exchange.
#Debugging is switched on for httplib.HTTPConnection for the duration of the call.
request_record, response_record, raw_json_resp = wrap_api_call(
    lambda: api.photos.getInfo(photo_id="16610484049", secret="ee80d9ecdc", format='json'),
    httplib.HTTPConnection,
    "https://api.flickr.com",
)

#Write the request record followed by the response record to the warc file
warc = ia_warc.open("technique1.warc", "w")
try:
    for record in (request_record, response_record):
        warc.write_record(record)
finally:
    warc.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment