- resource bundle
Issue (1): the PHP and Ruby code set the default location of `robots.json` and `extensions.json` to the relative path `../../resources/`. This may cause trouble for Python code packaging (i.e., in order to support `pip` / `easy_install` management). When installed, the package resides at `/usr/lib/python<ver>/site-packages/SnapSearch/`.

Options:

1. `/usr/lib/python<ver>/site-packages/SnapSearch/resources/`
2. `/etc/SnapSearch/resources/`
3. `/usr/share/doc/SnapSearch-Client-Python/resources/`

Note: the current implementation installs the resources to both locations 1) and 3).
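For option 1), the bundled files can be resolved relative to the installed package itself, so no system path needs to be hard-coded. A minimal sketch of this resolution (the helper name is hypothetical):

```python
import os.path

def resource_path(filename):
    # locate a bundled resource relative to the installed package, e.g.
    # /usr/lib/python<ver>/site-packages/SnapSearch/resources/robots.json
    pkg_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(pkg_dir, "resources", filename)

DEFAULT_ROBOTS_JSON = resource_path("robots.json")
DEFAULT_EXTENSIONS_JSON = resource_path("extensions.json")
```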
- test data for Detector from PHP code
Issue (2): some of the HTTP request data from the PHP test cases are invalid per the CGI 1.1 / WSGI 1.0.1 specs. The variables `SCRIPT_NAME` and `PATH_INFO` are both missing. Although the raw `REQUEST_URI` is typically the join of `SCRIPT_NAME` and `PATH_INFO`, this is undefined in both the CGI 1.1 and WSGI 1.0.1 specs.

Note: I imported the test data from the PHP test cases and fixed them to conform to the CGI 1.1 standard. The original data in the PHP test cases should be fixed as well.
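To illustrate the relationship between these variables (the URL is a made-up example):

```python
# a request for http://example.com/app.cgi/path/to/page?q=1 decomposes as
environ = {
    "SCRIPT_NAME": "/app.cgi",      # path to the script itself
    "PATH_INFO": "/path/to/page",   # extra path following the script
    "QUERY_STRING": "q=1",
    # REQUEST_URI is non-standard, but is effectively
    # SCRIPT_NAME + PATH_INFO + "?" + QUERY_STRING
    "REQUEST_URI": "/app.cgi/path/to/page?q=1",
}
```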
- interface of Detector
Issue (3a): `robots.json` and `extensions.json` must be specified as absolute paths to files. Even if the developer does not specify these resources, the default versions of these files still need to be loaded from disk. There are two issues with this design:

- feasibility: developers may want to specify file contents as strings
- efficiency: every construction incurs 2 disk read operations

Options (see the sketch after this note):

1. take the file content string as an input argument
2. take the absolute path as an input argument

Note: the current implementation conforms to 2), but the Pythonic way is 1).
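In fact, both forms can be accepted with little effort. A minimal sketch of the idea (the helper and parameter names are assumptions, not the actual interface):

```python
import json
import os.path

def _load_json(source, default_path):
    # accept either a JSON string or an absolute path to a JSON file;
    # fall back to the bundled default when nothing is specified
    if source is None:
        source = default_path
    if os.path.isfile(source):
        with open(source) as f:
            return json.load(f)
    return json.loads(source)
```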
Issue (3b): the constructor of `Detector` takes the HTTP request and the `check_file_extensions` flag as input. It seems more natural to feed these two arguments directly to `Detector.detect()`.

Options:

1.

```python
Detector.__init__(ignored_route, matched_route, robots_json, extensions_json)
# detect() returns ``encoded_url`` for interception or ``None`` for not
Detector.detect(request, check_file_extension)
```

2.

```python
Detector.__init__(ignored_route, matched_route, request, check_file_extensions,
                  robots_json, extensions_json)
Detector.detect()
Detector.get_encoded_url()
```

Note: the current implementation conforms to 2), but I personally prefer 1).
documentation markup

The preferred markup language for writing documents and comments in Python is reStructuredText (reST). In order for the project `README` to be correctly displayed on PyPI (the official Python package index), I decided to compose documents with reST instead of Markdown.
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    SnapSearch.cgi
    ~~~~~~~~~~~~~~

    :copyright: 2014 by `SnapSearch <https://snapsearch.io/>`_
    :license: MIT, see LICENSE for more details.

    :author: `LIU Yu <liuyu@opencps.net>`_
    :date: 2014/03/08
"""

__all__ = ['main', ]

import os
import sys

from SnapSearch.wsgi import _build_message


def main(interceptor):
    # start interception
    response = None
    try:
        response = interceptor(os.environ)
    except Exception:
        pass
    # non-intercepted response: relay the CGI program's output verbatim
    if not isinstance(response, dict):
        for line in sys.stdin:
            sys.stdout.write(line)
        return 0
    # intercepted response: emit status line, headers, and payload
    status, headers, payload = _build_message(response)
    sys.stdout.write(b"Status: %s\r\n" % status)
    for key, val in headers:
        sys.stdout.write(b"%s: %s\r\n" % (key, val))
    sys.stdout.write(b"\r\n")
    sys.stdout.write(payload)
    return 0


if __name__ == '__main__':
    # SnapSearch API credentials
    api_email = "<email>"
    api_key = "<key>"
    # initialize Client and Detector
    from SnapSearch import Client, Detector
    client = Client(api_email, api_key)
    detector = Detector()
    # chain them into an Interceptor
    from SnapSearch import Interceptor
    interceptor = Interceptor(client, detector)
    sys.exit(main(interceptor))
```
```python
def client_dispatch(self, **kwds):
    # NB: `wsgi_to_bytes` and `SnapSearchError` come from elsewhere in
    # the package and are assumed to be in scope.
    # HTTPS connection
    import pycurl
    c = pycurl.Curl()
    c.setopt(pycurl.URL, kwds['url'])
    # HTTPS request
    payload = wsgi_to_bytes(kwds['payload'])
    headers = ["Content-Type: application/json",
               "Content-Length: %d" % len(payload)]
    c.setopt(pycurl.POST, True)
    c.setopt(pycurl.HTTPHEADER, headers)
    c.setopt(pycurl.POSTFIELDS, payload)
    # HTTPS auth
    c.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_BASIC)
    c.setopt(pycurl.USERPWD, "%s:%s" % (kwds['email'], kwds['key']))
    c.setopt(pycurl.CAINFO, kwds['cacert'])
    c.setopt(pycurl.SSL_VERIFYPEER, True)
    c.setopt(pycurl.SSL_VERIFYHOST, 2)
    c.setopt(pycurl.ENCODING, "")
    c.setopt(pycurl.FOLLOWLOCATION, True)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.CONNECTTIMEOUT, 5)
    c.setopt(pycurl.TIMEOUT, 30)
    # collect the raw response (headers included) into a byte buffer
    buffer = bytearray()
    c.setopt(pycurl.HEADER, True)
    c.setopt(pycurl.WRITEFUNCTION, buffer.extend)
    try:
        c.perform()
        # response status line, e.g. b"HTTP/1.1 200 OK"
        eos = buffer.find(b"\r\n")  # end of status line
        strip = lambda b: bytes(b).strip()
        preamble = list(map(strip, buffer[:eos].split(b" ", 2)))
        assert len(preamble) > 2, "invalid SnapSearch response"
        status = int(preamble[1])
        # response headers
        eoh = buffer.find(b"\r\n\r\n")  # end of header lines
        strip_kv = lambda k, v: (k.strip().lower(), v.strip())
        split_colon = lambda b: strip_kv(*bytes(b).split(b":", 1))
        headers = dict(map(split_colon, buffer[eos + 2: eoh].splitlines()))
        # response body
        body = bytes(buffer[eoh + 4:]).strip()
    except Exception as e:
        raise SnapSearchError(e)
    else:
        return {'status': status, 'headers': headers, 'body': body}
    finally:
        c.close()
```
- sample data for triggering `validation_error` and other system errors

Issue (5): to ensure 100% test coverage, I need to trigger error responses from the SnapSearch service. The current implementation of `Client()` will raise an exception locally if the input `request_parameters` cannot pass `json` conversion, so the sample data needs to be `json`-serializable.

Note: I was able to trigger `validation_error` by sending illegal JSON data as the payload in `Client.request()`. But this bypasses the `Client()` code, and is pointless for the coverage test.
- custom `api_url`

Issue (6): the interface of the `Client` object allows the developer to customize the API URL. In case the scheme of this URL is "http" instead of "https", the behaviour is undefined (the PHP and Ruby code do not check the URL scheme).

Options (see the sketch after this note):

1. treat a non-https `api_url` as illegal, and raise an exception
2. accept a non-https `api_url` and skip SSL verification (this is insecure, because HTTP basic auth without SSL will leak the API email / key)

Note: the current implementation follows 1).
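A minimal sketch of option 1), using a plain `ValueError` in place of whatever exception type the client actually raises:

```python
try:
    from urllib.parse import urlsplit  # Python 3
except ImportError:
    from urlparse import urlsplit      # Python 2

def validate_api_url(api_url):
    # reject any scheme other than https, so that basic-auth credentials
    # are never sent over an unencrypted channel
    if urlsplit(api_url).scheme != "https":
        raise ValueError("api_url must use the https scheme")
    return api_url
```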
- api credentials for travis-ci integration

Issue (7): it is both insecure and infeasible to hard-code `api_email` and `api_key` within the test cases. To support unsupervised testing on Travis-CI, I employed the encryption keys feature. In particular, an encrypted message containing the environment variable

    SNAPSEARCH_API_CREDENTIALS=<email>:<key>

has been added to `.travis.yml`, i.e.,

```yaml
env:
  global:
    secure: ".... encrypted data ...."
```

When the demo API credentials are changed, the developer can update the encrypted message through:

```
$ travis encrypt SNAPSEARCH_API_CREDENTIALS=<new_email>:<new_key>
```
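The test suite can then recover the credentials from the environment at run time. A minimal sketch, assuming the `<email>:<key>` format above:

```python
import os

def api_credentials():
    # split the Travis-CI-provided variable back into (email, key);
    # returns None when the variable is absent (e.g. on forked repos,
    # where encrypted variables are not exposed)
    raw = os.environ.get("SNAPSEARCH_API_CREDENTIALS", "")
    if ":" not in raw:
        return None
    email, key = raw.split(":", 1)
    return email, key
```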
encoding of backend service response

Issue (8): the backend service response does not declare an explicit character set encoding. When using `requests` as the HTTP library, it can perform a so-called educated guess of the encoding. But with `pycurl`, the encoding cannot be deduced automatically. The current implementation assumes "utf-8", but since the backend service includes a snapshot within the response, it is quite possible that the content actually uses an alternative character set encoding. We need a clear specification on this.
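For reference, the current assumption amounts to something like the following sketch (not the actual client code), which would also honour an explicit charset if the backend ever starts sending one:

```python
def decode_body(body, headers):
    # prefer an explicit charset from the Content-Type header;
    # otherwise fall back to the assumed utf-8
    content_type = headers.get("content-type", "")
    charset = "utf-8"
    if "charset=" in content_type:
        charset = content_type.split("charset=", 1)[1].split(";", 1)[0].strip()
    return body.decode(charset)
```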
refactor of the resource bundle

Issue (9): based on your response to Issue (1), I have reorganized the package layout to make it straightforward to package and distribute. The resources have been wrapped into a sub-package for backend API abstraction, namely `SnapSearch.api`. This sub-package contains the functionality -- such as request encoding and response parsing -- that is heavily coupled with the backend service. The rest of the Pythonic client should be (forwardly) compatible with future versions of the backend service.

naming of Client is a little bit confusing
Technically, a portal used for communicating with the backend service is part of the backend (or an API to the backend). Calling it `Client` may confuse developers, considering it is used within a web server app.

wrapping Interceptor as a WSGI middleware
The existing interface specification of `Detector` and `Interceptor` does not yield a clean use case for WSGI middleware. The constructor of `Detector` (i.e. `Detector.__init__()`) takes the `environ` (HTTP request) as an input parameter. For WSGI applications, however, the HTTP request is not available at the global scope, but only as an input to the application's `__call__()` method.

This means either (i) the `Detector` object needs to be constructed within the WSGI middleware's `__call__()` method, or (ii) the specification of `Detector.__init__()` needs to be changed.

For option (i), the WSGI middleware needs to carry all 6 parameters needed to construct a `Detector`, and this greatly violates the decoupling and encapsulation principles of OOP.

For option (ii), the `environ` (HTTP request) can be supplied to `Detector.detect()` instead of `Detector.__init__()`. This makes a `Detector` object stateless with respect to the HTTP request (see the sketch below).

I created a develop branch at the git repo (with version number 0.0.6) as a proof-of-concept for option (ii). It yields a cleaner design from the perspective of web app developers. This reopens Issue (3) discussed in Memo I, so I will need your consent to proceed with this approach.
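From a WSGI application's point of view, option (ii) looks roughly like this (a sketch of the 0.0.6 proof-of-concept interface, with constructor arguments omitted):

```python
from SnapSearch import Detector

# constructed once, at module scope -- no HTTP request is needed here
detector = Detector()

def application(environ, start_response):
    # the per-request environ goes to detect(), not to __init__()
    encoded_url = detector.detect(environ)
    if encoded_url is not None:
        # a search engine robot: hand encoded_url over to the Client
        pass
    start_response("200 OK", [("Content-Type", "text/plain")])
    return [b"Hello, world!"]
```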
revisions since 0.0.6 release

1). made `SnapSearch.api.SNAPSEARCH_API_FOLLOW_REDIRECT` False by default.

But I think we should keep the variable there for future use. For example, if the backend service releases v1.1 of the robot API, serving at, say, "https://snapsearch.io/api/v1.1/robot", you may need to redirect some or all clients to the new portal.
2). added `SnapSearch.api.SNAPSEARCH_API_ACCEPT_ENCODING` to represent the list of encodings supported by the HTTP library in effect.

Note that `SNAPSEARCH_API_ACCEPT_ENCODING` is an informational constant (as opposed to a configurable parameter) maintained by the HTTP library selection process in `SnapSearch.api.backend`, and it should remain unchanged for as long as the same HTTP library is in use. More specifically, this value should never be altered from higher-level packages (or by users of this client library), because they have no knowledge of the underlying HTTP library in use, let alone the transfer encodings that the HTTP library actually accepts.

In fact, I don't even like the idea of exposing this value outside the `api` sub-package. For example, if the user sets it to some `foo` encoding -- not knowing that the underlying HTTP library does not support this `foo` encoding -- it will definitely break the communication with the SnapSearch backend service, even if the backend service itself understands `foo`.

3). added the `SnapSearch.wsgi` module with `InterceptorMiddleware`.

The `InterceptorMiddleware` mimics the `StackInterceptor` in the PHP code in terms of constructor signature and `response_callback`. Because it is already placed in a sub-module called `wsgi`, I didn't use the term WSGI in its name.

Unit test cases have also been created for `InterceptorMiddleware`, and they cover all but 5 lines of code in `SnapSearch.wsgi`, which are simple flow control for extremely rare anomalies.

4). completed the Flask example using the `InterceptorMiddleware`.

The code is already functional. Documentation is on the way.
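For reference, wrapping a Flask app would look roughly as follows (a sketch; the middleware's actual constructor signature mimics the PHP `StackInterceptor`, as noted above, so the details may differ):

```python
from flask import Flask

from SnapSearch import Client, Detector, Interceptor
from SnapSearch.wsgi import InterceptorMiddleware

app = Flask(__name__)

@app.route("/")
def index():
    return "Hello, world!"

# Flask exposes its underlying WSGI callable as app.wsgi_app, which is
# the standard place to install a WSGI middleware
interceptor = Interceptor(Client("<email>", "<key>"), Detector())
app.wsgi_app = InterceptorMiddleware(app.wsgi_app, interceptor)
```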
5). development documents are now kept in a separate folder, `docs`.

Unit testing and the Travis-CI-based integration test (including the encrypted credentials) are already covered.
6). working on a pipeline-based CGI interceptor example.

The individual code snippets are already functional; what remains is a pipeline script joining the interceptor and the CGI program. A pipeline might be the preferred form of interception for CGI programs because it keeps the CGI program unchanged: the interceptor censors the standard input / output streams between the CGI program and the web server.
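A minimal sketch of such a pipeline script, assuming the interceptor script shown earlier is saved as `snapsearch.cgi` and the original program as `app.cgi` (both file names are hypothetical):

```python
#!/usr/bin/env python

import subprocess
import sys

# run the original CGI program and pipe its stdout into the interceptor;
# the interceptor's own stdout (inherited) goes back to the web server
cgi = subprocess.Popen(["./app.cgi"], stdout=subprocess.PIPE)
interceptor = subprocess.Popen(["./snapsearch.cgi"], stdin=cgi.stdout)
cgi.stdout.close()  # let app.cgi receive SIGPIPE if the interceptor exits
sys.exit(interceptor.wait())
```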
enhancement proposals and recommendations
1). add a `User-Agent` header to the HTTPS request sent to the backend service.

With the diversification of SnapSearch client packages, the backend may need this information to collect usage statistics. The recommended formats for the HTTP `User-Agent` header are as follows:

```
a). User-Agent: snapsearch-client/<lang>
b). User-Agent: snapsearch-client-<lang>/<ver>
c). User-Agent: snapsearch-client (<lang>)
d). User-Agent: snapsearch-client (<lang>/<ver>)
```
2). unification of test data.

Each client package is using its own set of data for unit tests; this may lead to inconsistency among the different client packages. One approach to resolving such inconsistency is to unify the data (i.e. sample HTTP requests / responses) used in unit tests, and to isolate these data from the test suite of each individual client package.

The Python client places the majority of its test data in separate JSON files; this is a viable solution to test data isolation.
3). regulate code base management.

For the PHP and Ruby clients, all changes are pushed directly to the `master` branch of the main git repo. I would recommend at least using a `develop` branch to hold new changes for, say, a couple of days of thorough verification before committing them to the `master` branch. Or, more safely, use forked repos and pull requests to accept changes to the `master` code base. Pull requests must pass Travis-CI integration tests before they are green to merge, whereas direct pushes are committed-then-tested.

4). create an eye-catching example.
Most of the examples in use are overly simplified, considering that the advantage of SnapSearch lies in scraping JS-intensive web pages. There should be at least one JS-intensive page among the examples. This example needs to be simple, yet solid enough to showcase the capabilities of the backend service.
5). documentation.
The Python client uses Sphinx for automated documentation generation. This is a good practice that I would recommend to the other client packages.
```python
#!/usr/bin/env python
# convert PHP-style test-case arrays ('KEY' => 'value', pairs) read from
# stdin into a CGI 1.1-conformant JSON dictionary written to stdout

import json
import re
import sys

regex = re.compile(r"""
    \s*         # the leading space
    '(\S+)'     # key
    \s*         # the space between key and '=>'
    =>          # '=>'
    \s*         # the space between '=>' and value
    '([^']*)'   # the value
    \s*         # the space between value and ','
    ,           # ','
    \s*         # the trailing space
    \n          # all key-value pairs are on separate lines
""", re.VERBOSE)

raw = sys.stdin.read()
data = {}

for k, v in regex.findall(raw):
    # derive SCRIPT_NAME, PATH_INFO, and QUERY_STRING from REQUEST_URI
    if k == "REQUEST_URI":
        uri = v.split('?', 1)
        path, qs = uri[0], uri[1] if len(uri) > 1 else None
        # query string
        if qs:
            data["QUERY_STRING"] = qs
        # route path
        tup = path.split('/')
        # if len(tup) > 1 and not tup[0]:
        #     data["SCRIPT_NAME"] = ""
        #     data["PATH_INFO"] = path
        if len(tup) > 2 and tup[-1]:
            data["SCRIPT_NAME"] = '/'.join(tup[:-1])
            data["PATH_INFO"] = '/' + tup[-1]
        else:
            data["SCRIPT_NAME"] = path
            data["PATH_INFO"] = ""
    # translate REQUEST_SCHEME into the conventional HTTPS flag
    if k == "REQUEST_SCHEME":
        if v == "https":
            data["HTTPS"] = "on"
        continue
    data.setdefault(k, v)

print(json.dumps(data, indent=4))
```