gboudreau/easymalt-headers-fix.patch

## easymalt-headers-fix.patch
# HG changeset patch
# Parent  60085c8f01fe4eb19a1c38a2d27fd77698b5a5ec
Issue #24363: Add policy flag to avoid parsing HTTP header as email body

diff -r 60085c8f01fe Lib/email/errors.py
--- a/Lib/email/errors.py	Thu Sep 08 22:37:34 2016 -0400
+++ b/Lib/email/errors.py	Mon Jan 23 23:39:53 2017 +0000
@@ -55,8 +55,9 @@

 class MissingHeaderBodySeparatorDefect(MessageDefect):
     """Found line with no leading whitespace and no colon before blank line."""
-# XXX: backward compatibility, just in case (it was never emitted).
-MalformedHeaderDefect = MissingHeaderBodySeparatorDefect
+
+class MalformedHeaderDefect(MessageDefect):
+    """An ordinary header line did not match the expected format."""

 class MultipartInvariantViolationDefect(MessageDefect):
     """A message claimed to be a multipart but no subparts were found."""
diff -r 60085c8f01fe Lib/email/feedparser.py
--- a/Lib/email/feedparser.py	Thu Sep 08 22:37:34 2016 -0400
+++ b/Lib/email/feedparser.py	Mon Jan 23 23:39:53 2017 +0000
@@ -169,6 +169,9 @@
         self._last = None
         self._headersonly = False

+        # True to parse a HTTP header section that is detached from any body
+        self.__body_detached = getattr(policy, "_py_body_detached", False)
+
     # Non-public interface for supporting Parser's headersonly flag
     def _set_headersonly(self):
         self._headersonly = True
@@ -233,9 +236,18 @@
                 # (i.e. newline), just throw it away. Otherwise the line is
                 # part of the body so push it back.
                 if not NLCRE.match(line):
-                    defect = errors.MissingHeaderBodySeparatorDefect()
+                    if self.__body_detached:
+                        defect = "Invalid header line: " + repr(line)
+                        defect = errors.MalformedHeaderDefect(defect)
+                    else:
+                        defect = errors.MissingHeaderBodySeparatorDefect()
+                        self._input.unreadline(line)
                     self.policy.handle_defect(self._cur, defect)
-                    self._input.unreadline(line)
+                if self.__body_detached:
+                    # Even in the case of a blank line, this could be "\r",
+                    # which the HTTP parser does not consider to be the last
+                    # line, so ignore it in case other header fields follow
+                    continue
                 break
             headers.append(line)
         # Done with the headers, so parse them and figure out what we're
@@ -322,7 +334,7 @@
                     lines.append(line)
                 self._cur.set_payload(EMPTYSTRING.join(lines))
                 return
-            # Make sure a valid content type was specified per RFC 2045:6.4.
+            # Make sure a valid encoding was specified per RFC 2045:6.4.
             if (self._cur.get('content-transfer-encoding', '8bit').lower()
                     not in ('7bit', '8bit', 'binary')):
                 defect = errors.InvalidMultipartContentTransferEncodingDefect()
@@ -499,7 +511,7 @@
                         line = line[:-len(mo.group(0))]
                     self._cur.set_unixfrom(line)
                     continue
-                elif lineno == len(lines) - 1:
+                elif not self.__body_detached and lineno == len(lines) - 1:
                     # Something looking like a unix-from at the end - it's
                     # probably the first line of the body, so push back the
                     # line and stop.
@@ -511,10 +523,13 @@
                     defect = errors.MisplacedEnvelopeHeaderDefect(line)
                     self._cur.defects.append(defect)
                     continue
+
             # Split the line on the colon separating field name from value.
-            # There will always be a colon, because if there wasn't the part of
-            # the parser that calls us would have started parsing the body.
-            i = line.find(':')
+            # There will always be a colon, because if there wasn't,
+            # it would have been picked up by the part of
+            # the parser that calls us, or the continuation or envelope
+            # checks above.
+            i = line.index(':')

             # If the colon is on the start of the line the header is clearly
             # malformed, but we might be able to salvage the rest of the
@@ -524,7 +539,6 @@
                 self._cur.defects.append(defect)
                 continue

-            assert i>0, "_parse_headers fed line with no : and no leading WS"
             lastheader = line[:i]
             lastvalue = [line]
         # Done with all the lines, so handle the last header.
diff -r 60085c8f01fe Lib/http/client.py
--- a/Lib/http/client.py	Thu Sep 08 22:37:34 2016 -0400
+++ b/Lib/http/client.py	Mon Jan 23 23:39:53 2017 +0000
@@ -69,6 +69,7 @@
 """

 import email.parser
+import email.policy
 import email.message
 import http
 import io
@@ -191,14 +192,16 @@
                 lst.append(line)
         return lst

+class _Policy(email.policy.Compat32):
+    _py_body_detached = True
+
 def parse_headers(fp, _class=HTTPMessage):
     """Parses only RFC2822 headers from a file pointer.

-    email Parser wants to see strings rather than bytes.
-    But a TextIOWrapper around self.rfile would buffer too many bytes
-    from the stream, bytes which we later need to read as bytes.
-    So we read the correct bytes here, as bytes, for email Parser
-    to parse.
+    The parser works with text strings rather than bytes.
+    But a TextIOWrapper may internally buffer too many bytes from the stream,
+    bytes which we later need to read.  So we read the correct number of
+    bytes from the stream before decoding them to text to be parsed.

     """
     headers = []
@@ -212,7 +215,8 @@
         if line in (b'\r\n', b'\n', b''):
             break
     hstring = b''.join(headers).decode('iso-8859-1')
-    return email.parser.Parser(_class=_class).parsestr(hstring)
+    parser = email.parser.Parser(_class=_class, policy=_Policy())
+    return parser.parsestr(hstring)


 class HTTPResponse(io.BufferedIOBase):
	# HG changeset patch
	# Parent 60085c8f01fe4eb19a1c38a2d27fd77698b5a5ec
	Issue #24363: Add policy flag to avoid parsing HTTP header as email body

	diff -r 60085c8f01fe Lib/email/errors.py
	--- a/Lib/email/errors.py Thu Sep 08 22:37:34 2016 -0400
	+++ b/Lib/email/errors.py Mon Jan 23 23:39:53 2017 +0000
	@@ -55,8 +55,9 @@

	class MissingHeaderBodySeparatorDefect(MessageDefect):
	"""Found line with no leading whitespace and no colon before blank line."""
	-# XXX: backward compatibility, just in case (it was never emitted).
	-MalformedHeaderDefect = MissingHeaderBodySeparatorDefect
	+
	+class MalformedHeaderDefect(MessageDefect):
	+ """An ordinary header line did not match the expected format."""

	class MultipartInvariantViolationDefect(MessageDefect):
	"""A message claimed to be a multipart but no subparts were found."""
	diff -r 60085c8f01fe Lib/email/feedparser.py
	--- a/Lib/email/feedparser.py Thu Sep 08 22:37:34 2016 -0400
	+++ b/Lib/email/feedparser.py Mon Jan 23 23:39:53 2017 +0000
	@@ -169,6 +169,9 @@
	self._last = None
	self._headersonly = False

	+ # True to parse a HTTP header section that is detached from any body
	+ self.__body_detached = getattr(policy, "_py_body_detached", False)
	+
	# Non-public interface for supporting Parser's headersonly flag
	def _set_headersonly(self):
	self._headersonly = True
	@@ -233,9 +236,18 @@
	# (i.e. newline), just throw it away. Otherwise the line is
	# part of the body so push it back.
	if not NLCRE.match(line):
	- defect = errors.MissingHeaderBodySeparatorDefect()
	+ if self.__body_detached:
	+ defect = "Invalid header line: " + repr(line)
	+ defect = errors.MalformedHeaderDefect(defect)
	+ else:
	+ defect = errors.MissingHeaderBodySeparatorDefect()
	+ self._input.unreadline(line)
	self.policy.handle_defect(self._cur, defect)
	- self._input.unreadline(line)
	+ if self.__body_detached:
	+ # Even in the case of a blank line, this could be "\r",
	+ # which the HTTP parser does not consider to be the last
	+ # line, so ignore it in case other header fields follow
	+ continue
	break
	headers.append(line)
	# Done with the headers, so parse them and figure out what we're
	@@ -322,7 +334,7 @@
	lines.append(line)
	self._cur.set_payload(EMPTYSTRING.join(lines))
	return
	- # Make sure a valid content type was specified per RFC 2045:6.4.
	+ # Make sure a valid encoding was specified per RFC 2045:6.4.
	if (self._cur.get('content-transfer-encoding', '8bit').lower()
	not in ('7bit', '8bit', 'binary')):
	defect = errors.InvalidMultipartContentTransferEncodingDefect()
	@@ -499,7 +511,7 @@
	line = line[:-len(mo.group(0))]
	self._cur.set_unixfrom(line)
	continue
	- elif lineno == len(lines) - 1:
	+ elif not self.__body_detached and lineno == len(lines) - 1:
	# Something looking like a unix-from at the end - it's
	# probably the first line of the body, so push back the
	# line and stop.
	@@ -511,10 +523,13 @@
	defect = errors.MisplacedEnvelopeHeaderDefect(line)
	self._cur.defects.append(defect)
	continue
	+
	# Split the line on the colon separating field name from value.
	- # There will always be a colon, because if there wasn't the part of
	- # the parser that calls us would have started parsing the body.
	- i = line.find(':')
	+ # There will always be a colon, because if there wasn't,
	+ # it would have been picked up by the part of
	+ # the parser that calls us, or the continuation or envelope
	+ # checks above.
	+ i = line.index(':')

	# If the colon is on the start of the line the header is clearly
	# malformed, but we might be able to salvage the rest of the
	@@ -524,7 +539,6 @@
	self._cur.defects.append(defect)
	continue

	- assert i>0, "_parse_headers fed line with no : and no leading WS"
	lastheader = line[:i]
	lastvalue = [line]
	# Done with all the lines, so handle the last header.
	diff -r 60085c8f01fe Lib/http/client.py
	--- a/Lib/http/client.py Thu Sep 08 22:37:34 2016 -0400
	+++ b/Lib/http/client.py Mon Jan 23 23:39:53 2017 +0000
	@@ -69,6 +69,7 @@
	"""

	import email.parser
	+import email.policy
	import email.message
	import http
	import io
	@@ -191,14 +192,16 @@
	lst.append(line)
	return lst

	+class _Policy(email.policy.Compat32):
	+ _py_body_detached = True
	+
	def parse_headers(fp, _class=HTTPMessage):
	"""Parses only RFC2822 headers from a file pointer.

	- email Parser wants to see strings rather than bytes.
	- But a TextIOWrapper around self.rfile would buffer too many bytes
	- from the stream, bytes which we later need to read as bytes.
	- So we read the correct bytes here, as bytes, for email Parser
	- to parse.
	+ The parser works with text strings rather than bytes.
	+ But a TextIOWrapper may internally buffer too many bytes from the stream,
	+ bytes which we later need to read. So we read the correct number of
	+ bytes from the stream before decoding them to text to be parsed.

	"""
	headers = []
	@@ -212,7 +215,8 @@
	if line in (b'\r\n', b'\n', b''):
	break
	hstring = b''.join(headers).decode('iso-8859-1')
	- return email.parser.Parser(_class=_class).parsestr(hstring)
	+ parser = email.parser.Parser(_class=_class, policy=_Policy())
	+ return parser.parsestr(hstring)


	class HTTPResponse(io.BufferedIOBase):