# RodrigoCMoraes/tokenize.py

## tokenize.py
from nltk.tokenize import word_tokenize
import nltk
# Download NLTK's 'punkt' resource. This is a module-level statement, so it
# runs as a side effect the first time this module is imported.
# NOTE(review): presumably this is the data word_tokenize relies on -- confirm
# against the installed NLTK version.
nltk.download('punkt')

def tokenize(string, to_lower=True, is_alpha=False):
    """Split a string into a list of tokens using NLTK's ``word_tokenize``.

    A token is the smallest chain of characters the tokenizer yields, such
    as a word or a punctuation mark.

    The two flags are independent of each other: ``is_alpha`` decides which
    tokens are kept and ``to_lower`` decides how the kept tokens are cased.

    Args:
        string(str): text to split into tokens.
        to_lower(bool, optional): if True, convert every returned token to
            lower case. Defaults to True.
        is_alpha(bool, optional): if True, keep only the tokens for which
            ``str.isalpha()`` is true, i.e. purely alphabetic tokens;
            punctuation, numbers and mixed tokens are dropped.
            Defaults to False.

    Returns:
        list(str): list of tokens, in the order the tokenizer produced them.

    Examples of Usage:
        >>> tokenize("This is Just an Example")
        ['this', 'is', 'just', 'an', 'example']

        >>> tokenize("this, is ,just an example.")
        ['this', ',', 'is', ',', 'just', 'an', 'example', '.']

        >>> tokenize("This is Just an Example", is_alpha=True, to_lower=False)
        ['This', 'is', 'Just', 'an', 'Example']

        >>> tokenize("Hello, World!", to_lower=False)
        ['Hello', ',', 'World', '!']
    """
    tokens = word_tokenize(string)

    # Filter and lower-case in separate steps so that every combination of
    # the two flags is honoured (nesting the checks previously skipped the
    # lower-casing or applied the alphabetic filter when it was not asked for).
    if is_alpha:
        tokens = [token for token in tokens if token.isalpha()]
    if to_lower:
        tokens = [token.lower() for token in tokens]
    return tokens
	from nltk.tokenize import word_tokenize
	import nltk
	# Download NLTK's 'punkt' resource.
	# NOTE(review): presumably this is the data word_tokenize relies on -- confirm
	# against the installed NLTK version.
	nltk.download('punkt')

	def tokenize(string, to_lower=True, is_alpha=False):
	    """Split a string into a list of tokens using NLTK's ``word_tokenize``.

	    A token is the smallest chain of characters the tokenizer yields, such
	    as a word or a punctuation mark.

	    The two flags are independent of each other: ``is_alpha`` decides which
	    tokens are kept and ``to_lower`` decides how the kept tokens are cased.

	    Args:
	        string(str): text to split into tokens.
	        to_lower(bool, optional): if True, convert every returned token to
	            lower case. Defaults to True.
	        is_alpha(bool, optional): if True, keep only the tokens for which
	            ``str.isalpha()`` is true, i.e. purely alphabetic tokens;
	            punctuation, numbers and mixed tokens are dropped.
	            Defaults to False.

	    Returns:
	        list(str): list of tokens, in the order the tokenizer produced them.

	    Examples of Usage:
	        >>> tokenize("This is Just an Example")
	        ['this', 'is', 'just', 'an', 'example']

	        >>> tokenize("this, is ,just an example.")
	        ['this', ',', 'is', ',', 'just', 'an', 'example', '.']

	        >>> tokenize("This is Just an Example", is_alpha=True, to_lower=False)
	        ['This', 'is', 'Just', 'an', 'Example']

	        >>> tokenize("Hello, World!", to_lower=False)
	        ['Hello', ',', 'World', '!']
	    """
	    tokens = word_tokenize(string)

	    # Filter and lower-case in separate steps so that every combination of
	    # the two flags is honoured (nesting the checks previously skipped the
	    # lower-casing or applied the alphabetic filter when it was not asked for).
	    if is_alpha:
	        tokens = [token for token in tokens if token.isalpha()]
	    if to_lower:
	        tokens = [token.lower() for token in tokens]
	    return tokens