Python coding task: "Named Entity Recognition" @http://codepad.org/LLRQAd9q
"""
Programming task
================
The following is an implementation of a simple Named Entity Recognition (NER).
NER is concerned with identifying place names, people names or other special
identifiers in text.
Here we make a very simple definition of a named entity: A sequence of
at least two consecutive capitalized words. E.g. "Los Angeles" is a named
entity, "our hotel" is not.
While the implementation passes the Unit test, it suffers from bad structure and
readability. It is your task to rework *both* the implementation and the Unit
test. You are expected to come up with a better interface than the one presented
here.
Your code will be evaluated on:
- Readability: Is naming intuitive? Are there comments where necessary?
- Structure: Is functionality grouped into functions or classes in a way that
enables reusability?
- Testability: Is it easy to test individual components of your algorithm? This
is a good indicator of good interface design.
- Bonus: Functional programming. Demonstrate how you have applied principles of
functional programming to improve this code.
If you want, explain reasons for changes you've made in comments.
Note that you don't have to improve the actual Named Entity Recognition
algorithm itself - the focus is on code quality.
"""
import re
import unittest
# The module-level buffer and the sentence-start token regex from the original
# version are no longer needed (see remarks below); they are kept here only for
# reference:
# word_buffer = []                                   # buffer that stored the current named entity
# token_re = re.compile(r"([a-z]+)\s*(.*)$", re.I)   # matched a token at the beginning of a sentence

# Regular expression that matches a capitalized token: an uppercase letter
# followed only by lowercase letters
uppercase_re = re.compile(r"[A-Z][a-z]*$")

# Regular expression used to split a string at runs of whitespace
# (spaces, tabs, newlines, carriage returns, form feeds, vertical tabs)
space_re = re.compile(r"\s+")
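
# Illustrative behaviour of the two patterns (the example tokens are assumed,
# not part of the original gist): uppercase_re matches "Angeles" but not
# "hotel", and it also rejects all-caps tokens such as "USA"; space_re splits
# "Los  Angeles" into ["Los", "Angeles"] regardless of how many whitespace
# characters separate the words.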
""" Remarks :
1) Start with function get_named_entities() and rest is easy to follow (at least I think so :))
2) The new version of the code allows much more re usability by introducing Pure Functions for separate tasks
which means minimal dependency on other methods, variables. So, functions can work independently - even in multithreaded environment
3) In the previous version, function pop_token(text) always receives full text as an argument and returns the remaining string.
This incurs a lot of overhead since string is immutable type
4) The new function get_ne_from_buffer(word_buffer, entity_set = None) has an optional parameter entity_set to ensure re usability
5) Since we have pure functions, testing each function is easy.
6) global variable word_buffer is removed since it's not needed here and it incurs memory overhead
"""
def tokenize(text):
    """
    Tokenizes the text by splitting it at runs of whitespace
    (spaces, tabs, newlines, carriage returns, form feeds, vertical tabs).
    Returns a list of tokens.
    """
    return space_re.split(text)

def is_token_valid(token):
    """
    Returns True if the token is capitalized (an uppercase letter followed only
    by lowercase letters) - we may have a named entity on our hands!
    """
    return bool(uppercase_re.match(token))

def get_ne_from_buffer(word_buffer, entity_set=None):
    """
    Returns the named entity assembled from the buffer and, if an entity_set is
    provided, also adds it to that set.
    Returns None if the buffer does not hold a named entity (fewer than two words).
    """
    if len(word_buffer) >= 2:
        named_entity = " ".join(word_buffer)
        if entity_set is not None:  # if an entity_set was provided, add the named entity to it
            entity_set.add(named_entity)
        return named_entity
    return None
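
# Illustrative usage of get_ne_from_buffer (not part of the original gist; the
# example values are assumed for demonstration purposes):
#
#   >>> get_ne_from_buffer(["Los", "Angeles"])
#   'Los Angeles'
#   >>> seen = set()
#   >>> get_ne_from_buffer(["Los", "Angeles"], seen)  # also recorded in `seen`
#   'Los Angeles'
#   >>> seen
#   {'Los Angeles'}
#   >>> get_ne_from_buffer(["Hollywood"]) is None     # a single word is not an entity
#   True
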
def get_named_entities(text):
    """
    Returns the set of all named entities found in the text.
    """
    entity_set = set()
    for line in text.splitlines():
        word_buffer = []
        for token in tokenize(line):
            if is_token_valid(token):
                word_buffer.append(token)
            else:
                get_ne_from_buffer(word_buffer, entity_set)
                word_buffer = []  # empty the buffer
        get_ne_from_buffer(word_buffer, entity_set)  # if a named entity ends the line, we don't want to miss it!
    return entity_set
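
# Illustrative usage of get_named_entities (not part of the original gist; the
# sample sentence is the one used in the unit test below):
#
#   >>> text = 'When we went to Los Angeles last year we visited the Hollywood Sign'
#   >>> sorted(get_named_entities(text))
#   ['Hollywood Sign', 'Los Angeles']
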
class NamedEntityTestCase(unittest.TestCase):

    def test_ner_extraction(self):
        """
        Unit test for get_named_entities().
        """
        text = 'When we went to Los Angeles last year we visited the Hollywood Sign'
        entities = get_named_entities(text)
        self.assertEqual(set(["Los Angeles", "Hollywood Sign"]), entities)

    def test_named_entity(self):
        """
        Unit test for get_ne_from_buffer().
        """
        buffer = ['Southern', 'California']
        named_entity = get_ne_from_buffer(buffer)
        self.assertEqual(named_entity, 'Southern California')

    def test_token_validity(self):
        """
        Unit test for is_token_valid().
        """
        token = 'Southern'
        self.assertTrue(is_token_valid(token))

    def test_tokenizer(self):
        """
        Unit test for tokenize().
        """
        text = 'Southern California is much like Arizona'
        tokens = tokenize(text)
        self.assertEqual(['Southern', 'California', 'is', 'much', 'like', 'Arizona'], tokens)


if __name__ == "__main__":
    unittest.main()
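
# To run the unit tests, execute this module directly, e.g. (the filename is
# assumed here, the gist does not specify one):
#
#   python ner.py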