lvidarte/bom.py

## bom.py
import codecs

# By Doug Hellmann

# Multibyte encodings, such as UTF-16 and UTF-32, pose a problem
# when transferring data between different computer systems, either
# by copying a file directly or using network communication. Different
# systems use different ordering of the high- and low-order bytes.
# This characteristic of the data, known as its endianness, depends
# on factors such as the hardware architecture and choices made by
# the operating system and application developer. There is not always
# a way to know in advance what byte order to use for a given set of
# data, so the multibyte encodings include a byte-order marker (BOM)
# as the first few bytes of encoded output. For example, UTF-16 is
# defined in such a way that 0xFFFE and 0xFEFF are not valid
# characters and can be used to indicate the byte-order.
# The codecs module defines constants for the byte-order markers used
# by UTF-16 and UTF-32.

# Write the sample text as big-endian UTF-16, preceded by the matching
# byte-order marker so that a reader can work out the byte order from the
# data itself.
with open('/tmp/pi.txt', mode='wb') as f:
    f.write(codecs.BOM_UTF16_BE)
    f.write(u'pi: \u03c0'.encode('utf_16_be'))

# Read the file back through the generic 'utf-16' codec, which takes the
# byte order from the leading BOM instead of being told it explicitly.
# The mode is plain 'r': codecs.open() appends 'b' on its own, and
# Python 3 rejects the combined text+binary mode that 'rt' would produce.
with codecs.open('/tmp/pi.txt', mode='r', encoding='utf-16') as f:
    # Parenthesised so the line is valid on both Python 2 and Python 3.
    print(f.read())