Created
September 29, 2015 11:36
-
-
Save reuf/da6b5eaa7822eaeaee4a to your computer and use it in GitHub Desktop.
String manipulation python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'muhamed.halilovic'

import os
import re
import unicodedata
from calendar import month_abbr
from fnmatch import fnmatch, fnmatchcase
from urllib.request import urlopen
# ------------------------------
# 2.1. Splitting Strings on Any of Multiple Delimiters
# ------------------------------
# Problem
# You need to split a string into fields, but the delimiters (and spacing
# around them) aren't consistent throughout the string.
# str.split() only handles one delimiter and no surrounding whitespace;
# re.split() handles multiple delimiters in a single pass.
line = 'asdf fjdk; afed, fjek,asdf, foo'

# Split on ';', ',' or whitespace, swallowing any trailing whitespace.
split1 = re.split(r'[;,\s]\s*', line)
# With a CAPTURING group, re.split() also returns the matched delimiters.
split2 = re.split(r'(;|,|\s)\s*', line)
print(split1)
# ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
print(split2)
# ['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

# Even indices are the fields, odd indices the delimiters; pad with '' so
# zip() pairs the last field with an empty delimiter.
values = split2[::2]
delimiters = split2[1::2] + ['']
print(values)
# ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
print(delimiters)
# [' ', ';', ',', ',', ',', '']

# Reform the line using the same delimiters.
reformed = ''.join(v + d for v, d in zip(values, delimiters))
print(reformed)
# 'asdf fjdk;afed,fjek,asdf,foo'

# If you don't want the separators in the result but still need parentheses
# to group parts of the pattern, use a NONCAPTURE group, (?:...):
result = re.split(r'(?:,|;|\s)\s*', line)
print(result)
# ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
# ------------------------------
# 2.2. Matching Text at the Start or End of a String
# ------------------------------
# Problem
# You need to check the start or end of a string for specific text patterns,
# such as filename extensions, URL schemes, and so on.
# str.startswith() / str.endswith() are the simple way:
filename = 'spam.txt'
print(filename.endswith('.txt'))      # True
print(filename.startswith('file:'))   # False
url = 'http://www.staima.com'
print(url.startswith('http://www.'))  # True

# Both methods accept a TUPLE of alternatives for multiple choices:
filenames = os.listdir('.')
print(filenames)
print([name for name in filenames if name.endswith(('.git', '.py'))])
# e.g. ['.git', '01_data_structures_algos.py', '02_strings_text.py']
print(any(name.endswith('.py') for name in filenames))
def read_data(name):
    """Return the contents of *name*.

    If *name* starts with an http:, https:, or ftp: scheme it is fetched
    with urlopen() and the raw bytes are returned; otherwise it is opened
    as a local text file and its contents returned as str.
    """
    # startswith() accepts a tuple of prefixes -- one call covers all schemes.
    if name.startswith(('http:', 'https:', 'ftp:')):
        return urlopen(name).read()
    else:
        with open(name) as f:
            return f.read()
# Oddly, this is one part of Python where a tuple is actually REQUIRED as
# input: convert lists or sets with tuple() first.
choices = ['http:', 'ftp:']
url = 'http://www.python.org'
print(url.startswith(tuple(choices)))   # True

# Slices work too, but are far less elegant:
filename = 'spam.txt'
print(filename[-4:] == '.txt')          # True
url = 'http://www.python.org'
print(url[:5] == 'http:' or url[:6] == 'https:' or url[:4] == 'ftp:')   # True

# Regular expressions also work, but are overkill for simple prefix tests
# (this recipe is simpler and runs faster):
url = 'http://www.python.org'
print(re.match('http:|https:|ftp:', url))
# <re.Match object; span=(0, 5), match='http:'>

# startswith()/endswith() combine nicely with data reductions, e.g.
# checking a directory for certain kinds of files:
# if any(name.endswith(('.git', '.py')) for name in listdir('')):
#     ...
# ------------------------------
# 2.3. Matching Strings Using Shell Wildcard Patterns
# ------------------------------
# Problem
# You want to match text using the same wildcard patterns as are commonly
# used when working in Unix shells (e.g., *.py, Dat[0-9]*.csv, etc.).
print(fnmatch('foo.txt', '*.txt'))        # True
print(fnmatch('foo.txt', '?oo.txt'))      # True
print(fnmatch('Dat45.csv', 'Dat[0-9]*'))  # True

names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
print([name for name in names if fnmatch(name, 'Dat*.csv')])
# ['Dat1.csv', 'Dat2.csv']

# fnmatch() follows the case-sensitivity rules of the underlying platform.
# FIX: the pattern must be '*.TXT' (the original '.TXT' matches nothing on
# any platform, contradicting the comments below).
print(fnmatch('foo.txt', '*.TXT'))
# On OS X or Linux: False
# On Windows: True
# fnmatchcase() matches on case exactly, regardless of platform.
addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK ST',
    '4802 N BROADWAY',
]
print([addr for addr in addresses if fnmatch(addr, '* ST')])
# ['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']
print([addr for addr in addresses if fnmatch(addr, '54[0-9][0-9] *CLARK*')])
# ['5412 N CLARK ST']

# fnmatch sits between simple string methods and full regular expressions:
# a reasonable choice for simple wildcard support in data processing.
# For actually matching filenames on disk, use the glob module instead
# (see Recipe 5.13).
# ------------------------------
# 2.4. Matching and Searching for Text Patterns
# ------------------------------
# Problem
# You want to match or search text for a specific pattern.
# For simple literals, the basic string methods (str.find(),
# str.startswith(), str.endswith()) are usually enough:
text = 'yeah, but no, but yeah, but no, but yeah'

print(text == 'yeah')            # False (exact match)
print(text.startswith('yeah'))   # True
print(text.endswith('no'))       # False
print(text.endswith('yeah'))     # True
print(text.find('no'))           # 10 (index of first occurrence)

# For more complicated matching, use regular expressions and the re
# module.  Example: matching dates written as digits, like "11/27/2012".
text1 = '11/27/2015'
text2 = 'Nov 27, 2015'

# Simple matching: \d+ means match one or more digits.
if re.match(r'\d+/\d+/\d+', text1):
    print('yes')
else:
    print('no')
# yes
if re.match(r'\d+/\d+/\d+', text2):
    print('yes')
else:
    print('no')
# no

# When performing many matches with the same pattern, precompile it:
datepat = re.compile(r'\d+/\d+/\d+')
if datepat.match(text1):
    print('yes')
else:
    print('no')
# yes
if datepat.match(text2):
    print('yes')
else:
    print('no')
# no

text3 = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
print(datepat.findall(text3))
# ['11/27/2012', '3/13/2013']

# Parentheses introduce CAPTURE groups, extracted via group()/groups():
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
print(m)            # <re.Match object; span=(0, 10), match='11/27/2012'>
print(m.group(1))   # 11
print(m.group(2))   # 27
print(m.group(3))   # 2012
print(m.groups())   # ('11', '27', '2012')
month, day, year = m.groups()
print(month)        # 11
print(day)          # 27
print(year)         # 2012

# findall() returns all matches as a list; finditer() yields them lazily:
for m in datepat.finditer(text3):
    print(m.groups())
# ('11', '27', '2012')
# ('3', '13', '2013')

# Discussion
# The essential workflow is: compile a pattern with re.compile(), then use
# match(), findall(), or finditer().  Raw strings like r'(\d+)/(\d+)/(\d+)'
# leave backslashes uninterpreted; otherwise you would need doubled
# backslashes such as '(\\d+)/(\\d+)/(\\d+)'.
# Beware: match() only anchors at the BEGINNING of a string, so it may
# match more than you expect:
m = datepat.match('11/27/2012abcdef')
print(m)
# <re.Match object; span=(0, 10), match='11/27/2012'>  (trailing junk ignored)

# Anchor with $ to require the match to run to the end of the string:
datepat = re.compile(r'(\d+)/(\d+)/(\d+)$')
print(datepat.match('11/27/2012abcdef'))
# None
print(datepat.match('11/27/2012'))
# <re.Match object; span=(0, 10), match='11/27/2012'>
# ------------------------------
# 2.5. Searching and Replacing Text
# ------------------------------
# str.replace() for simple literal substitution; it returns a NEW string
# (strings are immutable, so the original is unchanged):
text = 'yeah, but no, but yeah, but no, but yeah'
print(text.replace('yeah', 'yep'))
print(text)
# yep, but no, but yep, but no, but yep
# yeah, but no, but yeah, but no, but yeah

# re.sub() for pattern-based substitution.  The first argument is the
# pattern, the second the replacement; backslashed digits such as \3 refer
# to capture-group numbers in the pattern:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
print(re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text))
# Today is 2012-11-27. PyCon starts 2013-3-13.

# Precompile for repeated substitutions:
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
print(datepat.sub(r'\2-\1-\3', text))
# Today is 27-11-2012. PyCon starts 13-3-2013.
def change_date(m):
    r"""Rewrite a month/day/year regex match as 'day MonAbbr year'.

    *m* is a match object with three groups -- month, day, year -- e.g.
    produced by the pattern r'(\d+)/(\d+)/(\d+)'.
    """
    # calendar.month_abbr is 1-based: month_abbr[11] == 'Nov'.
    mon_name = month_abbr[int(m.group(1))]
    return '{} {} {}'.format(m.group(2), mon_name, m.group(3))
# A CALLABLE replacement computes each substitution from the match object:
print(datepat.sub(change_date, text))
# Today is 27 Nov 2012. PyCon starts 13 Mar 2013.

# subn() also reports how many substitutions were made:
newtext, n = datepat.subn(r'\3-\1-\2', text)
print(newtext)
print(n)
# Today is 2012-11-27. PyCon starts 2013-3-13.
# 2

# Discussion
# There isn't much more to regex search-and-replace than sub() -- the
# trickiest part is specifying the pattern itself.
# ------------------------------
# 2.6. Searching and Replacing Case-Insensitive Text
# ------------------------------
# Problem
# You need to search for and possibly replace text in a case-insensitive
# manner: supply the re.IGNORECASE flag to the re operations.
text = 'UPPER PYTHON, lower python, Mixed Python'
print(re.findall('python', text, flags=re.IGNORECASE))
# ['PYTHON', 'python', 'Python']
print(re.sub('python', 'snake', text, flags=re.IGNORECASE))
# UPPER snake, lower snake, Mixed snake

# Limitation revealed above: the replacement text does NOT match the case
# of the text it replaces.  A support function is needed for that:
def matchcase(word):
    """Return a re.sub() replacement callback that substitutes *word*,
    adjusted to the case of the matched text (UPPER, lower, or Capitalized;
    any other mix falls back to *word* unchanged).
    """
    def replace(m):
        text = m.group()
        if text.isupper():
            return word.upper()
        elif text.islower():
            return word.lower()
        # BUG FIX: the original tested `text[0].isupper` -- the bound method
        # object, which is always truthy -- instead of CALLING it, so mixed-
        # case matches like 'pYthon' were wrongly capitalized.
        elif text[0].isupper():
            return word.capitalize()
        else:
            return word
    return replace
# The callback preserves the case of each matched occurrence:
print(re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE))
# UPPER SNAKE, lower snake, Mixed Snake

# For simple cases re.IGNORECASE is enough, but it may not suffice for
# Unicode matching involving full case folding (see Recipe 2.10).
# ------------------------------
# 2.7. Specifying a Regular Expression for the Shortest Match
# ------------------------------
# Problem
# A pattern is identifying the LONGEST possible match, but you want the
# shortest -- common with text enclosed in a pair of delimiters, such as
# a quoted string:
str_pat = re.compile(r'\"(.*)\"')
text1 = 'Computer says "no."'
print(str_pat.findall(text1))
# ['no.']
text2 = 'Computer says "no." Phone says "yes."'
print(str_pat.findall(text2))
# ['no." Phone says "yes.']   <- greedy * spans both quoted strings

# Add ? after * to make the repetition nongreedy (shortest match):
str_pat = re.compile(r'\"(.*?)\"')
print(str_pat.findall(text2))
# ['no.', 'yes.']
# (FIX: the original file had the line above as a bare list expression
# instead of a comment.)

# Discussion
# The dot matches any character except a newline; bracketed by starting
# and ending text it greedily produces the longest match, skipping over
# intermediate delimiters.  A ? right after * or + forces the matcher to
# look for the shortest possible match instead.
# ------------------------------
# 2.8. Writing a Regular Expression for Multiline Patterns
# ------------------------------
# Problem
# You need a match to span multiple lines.  This typically arises because
# the dot (.) does NOT match newlines.  Example: C-style comments.
comment = re.compile(r'/\*(.*?)\*/')
text1 = '/* this is a comment */'
text2 = '''
/* this is a
multiline comment */
'''
print(comment.findall(text1))
# [' this is a comment ']
print(comment.findall(text2))
# []   <- the . failed to cross the newline

# Fix 1: explicitly allow newlines via a noncapture group (?:.|\n), which
# groups for matching purposes without being captured or numbered:
comment = re.compile(r'/\*((?:.|\n)*?)\*/')
print(comment.findall(text2))
# [' this is a\nmultiline comment ']

# Fix 2: the re.DOTALL flag makes . match ALL characters, newlines included:
comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)
print(comment.findall(text2))
# [' this is a\nmultiline comment ']

# re.DOTALL works for simple cases, but can be problematic with very
# complicated patterns or combined tokenizer patterns (Recipe 2.18).
# Given a choice, write the pattern so it works without extra flags.
# ------------------------------
# 2.9. Normalizing Unicode Text to a Standard Representation
# ------------------------------
# Problem
# You're working with Unicode strings, but need to make sure all of them
# have the same underlying representation.
s1 = 'Spicy Jalape\u00f1o'    # fully composed n-tilde (U+00F1)
s2 = 'Spicy Jalapen\u0303o'   # 'n' + combining tilde (U+0303)
print(s1)        # Spicy Jalapeño
print(s2)        # Spicy Jalapeño  (renders identically...)
print(s1 == s2)  # False           (...but compares unequal)
print(len(s1))   # 14
print(len(s2))   # 15

# Multiple representations break string comparison.  Normalize first with
# unicodedata: NFC = fully composed (single code point where possible),
# NFD = fully decomposed (combining characters).
t1 = unicodedata.normalize('NFC', s1)
t2 = unicodedata.normalize('NFC', s2)
print(t1 == t2)  # True
t3 = unicodedata.normalize('NFD', s1)
t4 = unicodedata.normalize('NFD', s2)
print(t3 == t4)  # True
print(ascii(t3))
# 'Spicy Jalapen\u0303o'  (FIX: t3 is the DECOMPOSED form; the original
# comment wrongly showed the composed 'Spicy Jalape\xf1o')

# NFKC and NFKD add compatibility decompositions for certain characters,
# e.g. the single-codepoint 'fi' ligature (U+FB01):
s = '\ufb01'
print(s)                                  # fi (one character)
print(unicodedata.normalize('NFD', s))    # fi (NFD leaves it combined)
# NFKD/NFKC break the ligature apart into the two letters 'f' + 'i':
print(unicodedata.normalize('NFKD', s))   # 'fi'
print(unicodedata.normalize('NFKC', s))   # 'fi'

# Normalization also matters for sanitizing and filtering text, e.g.
# stripping all diacritical marks (useful for searching or matching):
s1 = 'Spicy Jalape\u00f1o'
t1 = unicodedata.normalize('NFD', s1)
print(''.join(c for c in t1 if not unicodedata.combining(c)))
# Spicy Jalapeno

# unicodedata.combining() tests whether a character is a combining
# character; the module also offers category/digit tests and more.
# Further reading:
# http://www.unicode.org/faq/normalization.html
# http://nedbatchelder.com/text/unipain.html
# ------------------------------
# 2.10. Working with Unicode Characters in Regular Expressions
# ------------------------------
# Problem
# You are using regular expressions to process text, but are concerned
# about the handling of Unicode characters.
# In Python 3, \d already matches any Unicode decimal digit
# (FIX: pattern made a raw string; '\d+' is an invalid escape sequence
# and raises a warning on modern Pythons):
num = re.compile(r'\d+')
# ASCII digits
print(num.match('123'))
# Arabic-Indic digits match too
print(num.match('\u0661\u0662\u0663'))

# Matching specific Unicode ranges: list the code points in a character class.
arabic = re.compile('[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff]+')
arabictext = '\u200f\u0630\u064e\u0644\u0650\u0643\u064e \u0671\u0644\u0652\u0643\u0650\u062a\u064e\u0628\u064f \u0644\u064e\u0627 \u0631\u064e\u064a\u0652\u0628\u064e \u06db \u0641\u0650\u064a\u0647\u0650 \u06db \u0647\u064f\u062f\u064b\u06ed\u0649 \u0644\u0651\u0650\u0644\u0652\u0645\u064f\u062a\u0651\u064e\u0642\u0650\u064a\u0646\u064e'
print(arabic.match(arabictext))

# IGNORECASE is not full Unicode case folding: 'ß'.upper() == 'SS', which
# the pattern below cannot match.
pat = re.compile('stra\u00dfe', re.IGNORECASE)
s = 'straße'
print(pat.match(s))            # matches
print(pat.match(s.upper()))    # None -- no full case folding
print(s.upper())               # STRASSE

# Mixing Unicode and regular expressions is often a good way to make your
# head explode.  For serious work, consider the third-party 'regex'
# library: full Unicode case folding, approximate matching, and more.
# ------------------------------
# 2.11. Stripping Unwanted Characters from Strings
# ------------------------------
# Problem
# You want to strip unwanted characters, such as whitespace, from the beginning, end, or
# middle of a text string.
# ------------------------------
# 2.12. Sanitizing and Cleaning Up Text
# ------------------------------
# Problem
# Some bored script kiddie has entered the text "pýtĥöñ" into a form on your web page
# and you'd like to clean it up somehow.
# ------------------------------
# 2.13. Aligning Text Strings
# ------------------------------
# Problem
# You need to format text with some sort of alignment applied.
# ------------------------------
# 2.14. Combining and Concatenating Strings
# ------------------------------
# Problem
# You want to combine many small strings together into a larger string.
# ------------------------------
# 2.15. Interpolating Variables in Strings
# ------------------------------
# Problem
# You want to create a string in which embedded variable names are substituted with a
# string representation of a variable's value.
# ------------------------------
# 2.16. Reformatting Text to a Fixed Number of Columns
# ------------------------------
# Problem
# You have long strings that you want to reformat so that they fill a user-specified number
# of columns.
# ------------------------------
# 2.17. Handling HTML and XML Entities in Text
# ------------------------------
# Problem
# You want to replace HTML or XML entities such as &entity; or &#code; with their
# corresponding text. Alternatively, you need to produce text, but escape certain characters
# (e.g., <, >, or &).
# ------------------------------
# 2.18. Tokenizing Text
# ------------------------------
# Problem
# You have a string that you want to parse left to right into a stream of tokens.
# ------------------------------
# 2.19. Writing a Simple Recursive Descent Parser
# ------------------------------
# Problem
# You need to parse text according to a set of grammar rules and perform actions or build
# an abstract syntax tree representing the input. The grammar is small, so you'd prefer to
# just write the parser yourself as opposed to using some kind of framework.
# ------------------------------
# 2.20. Performing Text Operations on Byte Strings
# ------------------------------
# Problem
# You want to perform common text operations (e.g., stripping, searching, and replacement)
# on byte strings.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment