Created
September 29, 2015 11:36
-
-
Save reuf/da6b5eaa7822eaeaee4a to your computer and use it in GitHub Desktop.
String manipulation python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'muhamed.halilovic'

import os
import re
import unicodedata
from calendar import month_abbr
from fnmatch import fnmatch, fnmatchcase
from urllib.request import urlopen
# ------------------------------
# 2.1. Splitting Strings on Any of Multiple Delimiters
# ------------------------------
# Problem
# You need to split a string into fields, but the delimiters (and spacing
# around them) aren't consistent throughout the string.
# str.split() only handles one delimiter and no surrounding whitespace;
# re.split() handles multiple delimiters in a single pass.
line = 'asdf fjdk; afed, fjek,asdf, foo'

# Split on ';', ',' or whitespace, swallowing any trailing whitespace.
split1 = re.split(r'[;,\s]\s*', line)
# With a CAPTURING group, re.split() also returns the matched delimiters.
split2 = re.split(r'(;|,|\s)\s*', line)
print(split1)
# ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
print(split2)
# ['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

# Even indices are the fields, odd indices the delimiters; pad with '' so
# zip() pairs the last field with an empty delimiter.
values = split2[::2]
delimiters = split2[1::2] + ['']
print(values)
# ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
print(delimiters)
# [' ', ';', ',', ',', ',', '']

# Reform the line using the same delimiters.
reformed = ''.join(v + d for v, d in zip(values, delimiters))
print(reformed)
# 'asdf fjdk;afed,fjek,asdf,foo'

# If you don't want the separators in the result but still need parentheses
# to group parts of the pattern, use a NONCAPTURE group, (?:...):
result = re.split(r'(?:,|;|\s)\s*', line)
print(result)
# ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
# ------------------------------
# 2.2. Matching Text at the Start or End of a String
# ------------------------------
# Problem
# You need to check the start or end of a string for specific text patterns,
# such as filename extensions, URL schemes, and so on.
# str.startswith() / str.endswith() are the simple way:
filename = 'spam.txt'
print(filename.endswith('.txt'))      # True
print(filename.startswith('file:'))   # False
url = 'http://www.staima.com'
print(url.startswith('http://www.'))  # True

# Both methods accept a TUPLE of alternatives for multiple choices:
filenames = os.listdir('.')
print(filenames)
print([name for name in filenames if name.endswith(('.git', '.py'))])
# e.g. ['.git', '01_data_structures_algos.py', '02_strings_text.py']
print(any(name.endswith('.py') for name in filenames))
def read_data(name):
    """Return the contents of *name*.

    If *name* starts with an http:, https:, or ftp: scheme it is fetched
    with urlopen() and the raw bytes are returned; otherwise it is opened
    as a local text file and its contents returned as str.
    """
    # startswith() accepts a tuple of prefixes -- one call covers all schemes.
    if name.startswith(('http:', 'https:', 'ftp:')):
        return urlopen(name).read()
    else:
        with open(name) as f:
            return f.read()
# Oddly, this is one part of Python where a tuple is actually REQUIRED as
# input: convert lists or sets with tuple() first.
choices = ['http:', 'ftp:']
url = 'http://www.python.org'
print(url.startswith(tuple(choices)))   # True

# Slices work too, but are far less elegant:
filename = 'spam.txt'
print(filename[-4:] == '.txt')          # True
url = 'http://www.python.org'
print(url[:5] == 'http:' or url[:6] == 'https:' or url[:4] == 'ftp:')   # True

# Regular expressions also work, but are overkill for simple prefix tests
# (this recipe is simpler and runs faster):
url = 'http://www.python.org'
print(re.match('http:|https:|ftp:', url))
# <re.Match object; span=(0, 5), match='http:'>

# startswith()/endswith() combine nicely with data reductions, e.g.
# checking a directory for certain kinds of files:
# if any(name.endswith(('.git', '.py')) for name in listdir('')):
#     ...
# ------------------------------
# 2.3. Matching Strings Using Shell Wildcard Patterns
# ------------------------------
# Problem
# You want to match text using the same wildcard patterns as are commonly
# used when working in Unix shells (e.g., *.py, Dat[0-9]*.csv, etc.).
print(fnmatch('foo.txt', '*.txt'))        # True
print(fnmatch('foo.txt', '?oo.txt'))      # True
print(fnmatch('Dat45.csv', 'Dat[0-9]*'))  # True

names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
print([name for name in names if fnmatch(name, 'Dat*.csv')])
# ['Dat1.csv', 'Dat2.csv']

# fnmatch() follows the case-sensitivity rules of the underlying platform.
# FIX: the pattern must be '*.TXT' (the original '.TXT' matches nothing on
# any platform, contradicting the comments below).
print(fnmatch('foo.txt', '*.TXT'))
# On OS X or Linux: False
# On Windows: True
# fnmatchcase() matches on case exactly, regardless of platform.
addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK ST',
    '4802 N BROADWAY',
]
print([addr for addr in addresses if fnmatch(addr, '* ST')])
# ['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']
print([addr for addr in addresses if fnmatch(addr, '54[0-9][0-9] *CLARK*')])
# ['5412 N CLARK ST']

# fnmatch sits between simple string methods and full regular expressions:
# a reasonable choice for simple wildcard support in data processing.
# For actually matching filenames on disk, use the glob module instead
# (see Recipe 5.13).
# ------------------------------
# 2.4. Matching and Searching for Text Patterns
# ------------------------------
# Problem
# You want to match or search text for a specific pattern.
# For simple literals, the basic string methods (str.find(),
# str.startswith(), str.endswith()) are usually enough:
text = 'yeah, but no, but yeah, but no, but yeah'

print(text == 'yeah')            # False (exact match)
print(text.startswith('yeah'))   # True
print(text.endswith('no'))       # False
print(text.endswith('yeah'))     # True
print(text.find('no'))           # 10 (index of first occurrence)

# For more complicated matching, use regular expressions and the re
# module.  Example: matching dates written as digits, like "11/27/2012".
text1 = '11/27/2015'
text2 = 'Nov 27, 2015'

# Simple matching: \d+ means match one or more digits.
if re.match(r'\d+/\d+/\d+', text1):
    print('yes')
else:
    print('no')
# yes
if re.match(r'\d+/\d+/\d+', text2):
    print('yes')
else:
    print('no')
# no

# When performing many matches with the same pattern, precompile it:
datepat = re.compile(r'\d+/\d+/\d+')
if datepat.match(text1):
    print('yes')
else:
    print('no')
# yes
if datepat.match(text2):
    print('yes')
else:
    print('no')
# no

text3 = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
print(datepat.findall(text3))
# ['11/27/2012', '3/13/2013']

# Parentheses introduce CAPTURE groups, extracted via group()/groups():
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
print(m)            # <re.Match object; span=(0, 10), match='11/27/2012'>
print(m.group(1))   # 11
print(m.group(2))   # 27
print(m.group(3))   # 2012
print(m.groups())   # ('11', '27', '2012')
month, day, year = m.groups()
print(month)        # 11
print(day)          # 27
print(year)         # 2012

# findall() returns all matches as a list; finditer() yields them lazily:
for m in datepat.finditer(text3):
    print(m.groups())
# ('11', '27', '2012')
# ('3', '13', '2013')

# Discussion
# The essential workflow is: compile a pattern with re.compile(), then use
# match(), findall(), or finditer().  Raw strings like r'(\d+)/(\d+)/(\d+)'
# leave backslashes uninterpreted; otherwise you would need doubled
# backslashes such as '(\\d+)/(\\d+)/(\\d+)'.
# Beware: match() only anchors at the BEGINNING of a string, so it may
# match more than you expect:
m = datepat.match('11/27/2012abcdef')
print(m)
# <re.Match object; span=(0, 10), match='11/27/2012'>  (trailing junk ignored)

# Anchor with $ to require the match to run to the end of the string:
datepat = re.compile(r'(\d+)/(\d+)/(\d+)$')
print(datepat.match('11/27/2012abcdef'))
# None
print(datepat.match('11/27/2012'))
# <re.Match object; span=(0, 10), match='11/27/2012'>
# ------------------------------
# 2.5. Searching and Replacing Text
# ------------------------------
# str.replace() for simple literal substitution; it returns a NEW string
# (strings are immutable, so the original is unchanged):
text = 'yeah, but no, but yeah, but no, but yeah'
print(text.replace('yeah', 'yep'))
print(text)
# yep, but no, but yep, but no, but yep
# yeah, but no, but yeah, but no, but yeah

# re.sub() for pattern-based substitution.  The first argument is the
# pattern, the second the replacement; backslashed digits such as \3 refer
# to capture-group numbers in the pattern:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
print(re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text))
# Today is 2012-11-27. PyCon starts 2013-3-13.

# Precompile for repeated substitutions:
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
print(datepat.sub(r'\2-\1-\3', text))
# Today is 27-11-2012. PyCon starts 13-3-2013.
def change_date(m):
    r"""Rewrite a month/day/year regex match as 'day MonAbbr year'.

    *m* is a match object with three groups -- month, day, year -- e.g.
    produced by the pattern r'(\d+)/(\d+)/(\d+)'.
    """
    # calendar.month_abbr is 1-based: month_abbr[11] == 'Nov'.
    mon_name = month_abbr[int(m.group(1))]
    return '{} {} {}'.format(m.group(2), mon_name, m.group(3))
# A CALLABLE replacement computes each substitution from the match object:
print(datepat.sub(change_date, text))
# Today is 27 Nov 2012. PyCon starts 13 Mar 2013.

# subn() also reports how many substitutions were made:
newtext, n = datepat.subn(r'\3-\1-\2', text)
print(newtext)
print(n)
# Today is 2012-11-27. PyCon starts 2013-3-13.
# 2

# Discussion
# There isn't much more to regex search-and-replace than sub() -- the
# trickiest part is specifying the pattern itself.
# ------------------------------
# 2.6. Searching and Replacing Case-Insensitive Text
# ------------------------------
# Problem
# You need to search for and possibly replace text in a case-insensitive
# manner: supply the re.IGNORECASE flag to the re operations.
text = 'UPPER PYTHON, lower python, Mixed Python'
print(re.findall('python', text, flags=re.IGNORECASE))
# ['PYTHON', 'python', 'Python']
print(re.sub('python', 'snake', text, flags=re.IGNORECASE))
# UPPER snake, lower snake, Mixed snake

# Limitation revealed above: the replacement text does NOT match the case
# of the text it replaces.  A support function is needed for that:
def matchcase(word):
    """Return a re.sub() replacement callback that substitutes *word*,
    adjusted to the case of the matched text (UPPER, lower, or Capitalized;
    any other mix falls back to *word* unchanged).
    """
    def replace(m):
        text = m.group()
        if text.isupper():
            return word.upper()
        elif text.islower():
            return word.lower()
        # BUG FIX: the original tested `text[0].isupper` -- the bound method
        # object, which is always truthy -- instead of CALLING it, so mixed-
        # case matches like 'pYthon' were wrongly capitalized.
        elif text[0].isupper():
            return word.capitalize()
        else:
            return word
    return replace
# The callback preserves the case of each matched occurrence:
print(re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE))
# UPPER SNAKE, lower snake, Mixed Snake

# For simple cases re.IGNORECASE is enough, but it may not suffice for
# Unicode matching involving full case folding (see Recipe 2.10).
# ------------------------------
# 2.7. Specifying a Regular Expression for the Shortest Match
# ------------------------------
# Problem
# A pattern is identifying the LONGEST possible match, but you want the
# shortest -- common with text enclosed in a pair of delimiters, such as
# a quoted string:
str_pat = re.compile(r'\"(.*)\"')
text1 = 'Computer says "no."'
print(str_pat.findall(text1))
# ['no.']
text2 = 'Computer says "no." Phone says "yes."'
print(str_pat.findall(text2))
# ['no." Phone says "yes.']   <- greedy * spans both quoted strings

# Add ? after * to make the repetition nongreedy (shortest match):
str_pat = re.compile(r'\"(.*?)\"')
print(str_pat.findall(text2))
# ['no.', 'yes.']
# (FIX: the original file had the line above as a bare list expression
# instead of a comment.)

# Discussion
# The dot matches any character except a newline; bracketed by starting
# and ending text it greedily produces the longest match, skipping over
# intermediate delimiters.  A ? right after * or + forces the matcher to
# look for the shortest possible match instead.
# ------------------------------
# 2.8. Writing a Regular Expression for Multiline Patterns
# ------------------------------
# Problem
# You need a match to span multiple lines.  This typically arises because
# the dot (.) does NOT match newlines.  Example: C-style comments.
comment = re.compile(r'/\*(.*?)\*/')
text1 = '/* this is a comment */'
text2 = '''
/* this is a
multiline comment */
'''
print(comment.findall(text1))
# [' this is a comment ']
print(comment.findall(text2))
# []   <- the . failed to cross the newline

# Fix 1: explicitly allow newlines via a noncapture group (?:.|\n), which
# groups for matching purposes without being captured or numbered:
comment = re.compile(r'/\*((?:.|\n)*?)\*/')
print(comment.findall(text2))
# [' this is a\nmultiline comment ']

# Fix 2: the re.DOTALL flag makes . match ALL characters, newlines included:
comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)
print(comment.findall(text2))
# [' this is a\nmultiline comment ']

# re.DOTALL works for simple cases, but can be problematic with very
# complicated patterns or combined tokenizer patterns (Recipe 2.18).
# Given a choice, write the pattern so it works without extra flags.
# ------------------------------
# 2.9. Normalizing Unicode Text to a Standard Representation
# ------------------------------
# Problem
# You're working with Unicode strings, but need to make sure all of them
# have the same underlying representation.
s1 = 'Spicy Jalape\u00f1o'    # fully composed n-tilde (U+00F1)
s2 = 'Spicy Jalapen\u0303o'   # 'n' + combining tilde (U+0303)
print(s1)        # Spicy Jalapeño
print(s2)        # Spicy Jalapeño  (renders identically...)
print(s1 == s2)  # False           (...but compares unequal)
print(len(s1))   # 14
print(len(s2))   # 15

# Multiple representations break string comparison.  Normalize first with
# unicodedata: NFC = fully composed (single code point where possible),
# NFD = fully decomposed (combining characters).
t1 = unicodedata.normalize('NFC', s1)
t2 = unicodedata.normalize('NFC', s2)
print(t1 == t2)  # True
t3 = unicodedata.normalize('NFD', s1)
t4 = unicodedata.normalize('NFD', s2)
print(t3 == t4)  # True
print(ascii(t3))
# 'Spicy Jalapen\u0303o'  (FIX: t3 is the DECOMPOSED form; the original
# comment wrongly showed the composed 'Spicy Jalape\xf1o')

# NFKC and NFKD add compatibility decompositions for certain characters,
# e.g. the single-codepoint 'fi' ligature (U+FB01):
s = '\ufb01'
print(s)                                  # fi (one character)
print(unicodedata.normalize('NFD', s))    # fi (NFD leaves it combined)
# NFKD/NFKC break the ligature apart into the two letters 'f' + 'i':
print(unicodedata.normalize('NFKD', s))   # 'fi'
print(unicodedata.normalize('NFKC', s))   # 'fi'

# Normalization also matters for sanitizing and filtering text, e.g.
# stripping all diacritical marks (useful for searching or matching):
s1 = 'Spicy Jalape\u00f1o'
t1 = unicodedata.normalize('NFD', s1)
print(''.join(c for c in t1 if not unicodedata.combining(c)))
# Spicy Jalapeno

# unicodedata.combining() tests whether a character is a combining
# character; the module also offers category/digit tests and more.
# Further reading:
# http://www.unicode.org/faq/normalization.html
# http://nedbatchelder.com/text/unipain.html
# ------------------------------
# 2.10. Working with Unicode Characters in Regular Expressions
# ------------------------------
# Problem
# You are using regular expressions to process text, but are concerned
# about the handling of Unicode characters.
# In Python 3, \d already matches any Unicode decimal digit
# (FIX: pattern made a raw string; '\d+' is an invalid escape sequence
# and raises a warning on modern Pythons):
num = re.compile(r'\d+')
# ASCII digits
print(num.match('123'))
# Arabic-Indic digits match too
print(num.match('\u0661\u0662\u0663'))

# Matching specific Unicode ranges: list the code points in a character class.
arabic = re.compile('[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff]+')
arabictext = '\u200f\u0630\u064e\u0644\u0650\u0643\u064e \u0671\u0644\u0652\u0643\u0650\u062a\u064e\u0628\u064f \u0644\u064e\u0627 \u0631\u064e\u064a\u0652\u0628\u064e \u06db \u0641\u0650\u064a\u0647\u0650 \u06db \u0647\u064f\u062f\u064b\u06ed\u0649 \u0644\u0651\u0650\u0644\u0652\u0645\u064f\u062a\u0651\u064e\u0642\u0650\u064a\u0646\u064e'
print(arabic.match(arabictext))

# IGNORECASE is not full Unicode case folding: 'ß'.upper() == 'SS', which
# the pattern below cannot match.
pat = re.compile('stra\u00dfe', re.IGNORECASE)
s = 'straße'
print(pat.match(s))            # matches
print(pat.match(s.upper()))    # None -- no full case folding
print(s.upper())               # STRASSE

# Mixing Unicode and regular expressions is often a good way to make your
# head explode.  For serious work, consider the third-party 'regex'
# library: full Unicode case folding, approximate matching, and more.
# ------------------------------
# 2.11. Stripping Unwanted Characters from Strings
# ------------------------------
# Problem
# You want to strip unwanted characters, such as whitespace, from the beginning, end, or
# middle of a text string.
# ------------------------------
# 2.12. Sanitizing and Cleaning Up Text
# ------------------------------
# Problem
# Some bored script kiddie has entered the text "pýtĥöñ" into a form on your web page
# and you'd like to clean it up somehow.
# ------------------------------
# 2.13. Aligning Text Strings
# ------------------------------
# Problem
# You need to format text with some sort of alignment applied.
# ------------------------------
# 2.14. Combining and Concatenating Strings
# ------------------------------
# Problem
# You want to combine many small strings together into a larger string.
# ------------------------------
# 2.15. Interpolating Variables in Strings
# ------------------------------
# Problem
# You want to create a string in which embedded variable names are substituted with a
# string representation of a variable's value.
# ------------------------------
# 2.16. Reformatting Text to a Fixed Number of Columns
# ------------------------------
# Problem
# You have long strings that you want to reformat so that they fill a user-specified number
# of columns.
# ------------------------------
# 2.17. Handling HTML and XML Entities in Text
# ------------------------------
# Problem
# You want to replace HTML or XML entities such as &entity; or &#code; with their
# corresponding text. Alternatively, you need to produce text, but escape certain characters
# (e.g., <, >, or &).
# ------------------------------
# 2.18. Tokenizing Text
# ------------------------------
# Problem
# You have a string that you want to parse left to right into a stream of tokens.
# ------------------------------
# 2.19. Writing a Simple Recursive Descent Parser
# ------------------------------
# Problem
# You need to parse text according to a set of grammar rules and perform actions or build
# an abstract syntax tree representing the input. The grammar is small, so you'd prefer to
# just write the parser yourself as opposed to using some kind of framework.
# ------------------------------
# 2.20. Performing Text Operations on Byte Strings
# ------------------------------
# Problem
# You want to perform common text operations (e.g., stripping, searching, and replacement)
# on byte strings.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment