Last active
August 29, 2015 14:17
-
-
Save e3krisztian/458e486d5d937246906e to your computer and use it in GitHub Desktop.
amc6
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:3b36f2d6af5111b8c1ec8af58cc6a5a3e4fb34050d2c2ad4cf4f1d1a4236673f" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import os\n", | |
"print os.getcwd()\n", | |
"os.listdir('.')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"/home/kr/ceu/adatmesterseg\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 1, | |
"text": [ | |
"['.ipynb_checkpoints',\n", | |
" 'data',\n", | |
" 'README.md',\n", | |
" 'code',\n", | |
" 'LICENSE',\n", | |
" 'class6.ipynb',\n", | |
" 'assignment',\n", | |
" '.git']" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"with open('data/private-cities.txt') as f:\n", | |
" lines = f.readlines()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# headlines 1\n", | |
"[line for line in lines if len(line) < 80]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
"['Private Cities 101\\n',\n", | |
" 'The 21st century will be the century of cities.\\n',\n", | |
" 'Proprietary communities\\n',\n", | |
" 'The problem\\n',\n", | |
" 'Police and justice provision\\n',\n", | |
" 'Institutional change\\n']" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# headlines 2\n", | |
"[line for line in lines if '.' not in line]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"['Private Cities 101\\n',\n", | |
" 'Proprietary communities\\n',\n", | |
" 'The problem\\n',\n", | |
" 'Police and justice provision\\n',\n", | |
" 'Institutional change\\n']" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# beolvasas soronkent mashogy\n", | |
"with open('data/private-cities.txt') as f:\n", | |
" sections = [line for line in f if len(line) < 80]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sections" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"text": [ | |
"['Private Cities 101\\n',\n", | |
" 'The 21st century will be the century of cities.\\n',\n", | |
" 'Proprietary communities\\n',\n", | |
" 'The problem\\n',\n", | |
" 'Police and justice provision\\n',\n", | |
" 'Institutional change\\n']" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# karakterek szama\n", | |
"with open('data/private-cities.txt') as f:\n", | |
" print len(f.read())" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"15897\n" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# sorok szama\n", | |
"with open('data/private-cities.txt') as f:\n", | |
" print len(f.readlines())" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"46\n" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# szavak szama\n", | |
"with open('data/private-cities.txt') as f:\n", | |
" words = f.read().split()\n", | |
"\n", | |
"print len(words)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"2497\n" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# leghosszabb szo hossza\n", | |
"max_length = max(len(w) for w in words)\n", | |
"print max_length" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"17\n" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# leghosszabb szo/szavak\n", | |
"[word for word in words if len(word) == max_length]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
"['self-replication.']" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# jo az elozo megoldas?\n", | |
"long_words4 = [word for word in words if len(word) == max_length - 4]\n", | |
"long_words4" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 12, | |
"text": [ | |
"['Understanding',\n", | |
" 'institutions,',\n", | |
" 'Disentangling',\n", | |
" 'institutional',\n", | |
" 'intervention.',\n", | |
" 'traditionally',\n", | |
" 'developer\\xe2\\x80\\x99s',\n", | |
" 'demonstrating',\n", | |
" 'alternatives.',\n", | |
" 'Schools\\xe2\\x80\\x94not',\n", | |
" 'corresponding',\n", | |
" 'international',\n", | |
" 'international',\n", | |
" 'Institutional',\n", | |
" 'institutional',\n", | |
" 'expropriation',\n", | |
" 'institutional',\n", | |
" 'privatization',\n", | |
" 'independently',\n", | |
" 'institutional',\n", | |
" 'Institutional',\n", | |
" 'concentrated.',\n", | |
" 'concentrating',\n", | |
" 'institutional',\n", | |
" '\\xe2\\x80\\x9cAbrogating',\n", | |
" 'institutional']" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# ismetlodesek!\n", | |
"set(long_words4)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": [ | |
"{'Disentangling',\n", | |
" 'Institutional',\n", | |
" 'Schools\\xe2\\x80\\x94not',\n", | |
" 'Understanding',\n", | |
" 'alternatives.',\n", | |
" 'concentrated.',\n", | |
" 'concentrating',\n", | |
" 'corresponding',\n", | |
" 'demonstrating',\n", | |
" 'developer\\xe2\\x80\\x99s',\n", | |
" 'expropriation',\n", | |
" 'independently',\n", | |
" 'institutional',\n", | |
" 'institutions,',\n", | |
" 'international',\n", | |
" 'intervention.',\n", | |
" 'privatization',\n", | |
" 'traditionally',\n", | |
" '\\xe2\\x80\\x9cAbrogating'}" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# leggyakoribb szavak\n", | |
"word_count = {}\n", | |
"for word in words:\n", | |
" if word in word_count:\n", | |
" previous_count = word_count[word]\n", | |
" else:\n", | |
" previous_count = 0\n", | |
" word_count[word] = previous_count + 1\n", | |
"\n", | |
"count_to_words = {}\n", | |
"\n", | |
"for word, count in word_count.items():\n", | |
" if count in count_to_words:\n", | |
" word_list = count_to_words[count]\n", | |
" else:\n", | |
" word_list = []\n", | |
" count_to_words[count] = word_list\n", | |
" word_list.append(word)\n", | |
"\n", | |
"highest_count = max(count_to_words)\n", | |
"print highest_count, count_to_words[highest_count]\n", | |
"high_counts = sorted(count_to_words, reverse=True)[:20]\n", | |
"print high_counts\n", | |
"for count in high_counts:\n", | |
" print count, count_to_words[count]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"129 ['the']\n", | |
"[129, 95, 74, 56, 51, 41, 35, 30, 28, 27, 26, 22, 20, 19, 17, 15, 14, 13, 12, 11]\n", | |
"129 ['the']\n", | |
"95 ['of']\n", | |
"74 ['to']\n", | |
"56 ['a']\n", | |
"51 ['is']\n", | |
"41 ['and']\n", | |
"35 ['are']\n", | |
"30 ['in']\n", | |
"28 ['that']\n", | |
"27 ['cities']\n", | |
"26 ['public']\n", | |
"22 ['private']\n", | |
"20 ['they']\n", | |
"19 ['for']\n", | |
"17 ['by', 'on']\n", | |
"15 ['would']\n", | |
"14 ['not', 'The', 'people']\n", | |
"13 ['economic', 'as']\n", | |
"12 ['more']\n", | |
"11 ['change']\n" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"with open('data/private-cities.txt') as f:\n", | |
" text = f.read()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"'!' in text" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 16, | |
"text": [ | |
"False" | |
] | |
} | |
], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"'?' in text" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 17, | |
"text": [ | |
"True" | |
] | |
} | |
], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"text.splitlines()[:5]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 18, | |
"text": [ | |
"['Private Cities 101',\n", | |
" 'The 21st century will be the century of cities.',\n", | |
" 'Over the next 30 years, 1.8 billion people are expected to move to cities in developing countries. While some will add to existing cities, others will migrate to small towns, transforming them into the megapolises of tomorrow. Shenzhen, for example, was a small fishing village of 300,000 people in 1980. Since being designated a special economic zone that year, it has grown to over 10 million inhabitants.',\n", | |
" 'Understanding the best form of city governance will be crucial to ensuring that the emigrants lead good lives. However, even as economics has moved to focus on institutions, the literature on cities has focused instead on policy outcomes, rent control, zoning, and public transportation.',\n", | |
" 'The process of governance is important for two reasons. First, we cannot know what the ideal policy is. Constraints differ in time and place. Second, even with omniscient mayors knowing ideal policies, there is little reason to expect them to implement those ideal policies.']" | |
] | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print lines[5]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"So rather than focusing on outcomes, we should focus on how to achieve those outcomes. What conditions are necessary to produce the optimal amount of public goods in a city? Asking what is the ideal level of police, street sweepers, and garbage men is just as absurd as asking, \"what is the ideal amount of shoe production?\" We simply don\u2019t know. Markets constantly adjust between supply and demand, seeking this ideal level.\n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# sentences\n", | |
"def sentences(lines):\n", | |
" separations=(\n", | |
" ('. ', '.'),\n", | |
" ('? ', '?'),\n", | |
" )\n", | |
" for separator, tail in separations:\n", | |
" sentences = []\n", | |
" for line in lines:\n", | |
" sentences += split_by_separator(line, separator, tail)\n", | |
" lines = sentences\n", | |
" return lines\n", | |
"\n", | |
"def split_by_separator(line, separator, tail):\n", | |
" fragments = line.split(separator)\n", | |
" sentences = [\n", | |
" s + tail\n", | |
" for s in fragments[:-1]\n", | |
" ] + fragments[-1:]\n", | |
" return sentences\n", | |
" \n", | |
"sentences(lines)[:20]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 20, | |
"text": [ | |
"['Private Cities 101\\n',\n", | |
" 'The 21st century will be the century of cities.\\n',\n", | |
" 'Over the next 30 years, 1.8 billion people are expected to move to cities in developing countries.',\n", | |
" 'While some will add to existing cities, others will migrate to small towns, transforming them into the megapolises of tomorrow.',\n", | |
" 'Shenzhen, for example, was a small fishing village of 300,000 people in 1980.',\n", | |
" 'Since being designated a special economic zone that year, it has grown to over 10 million inhabitants.\\n',\n", | |
" 'Understanding the best form of city governance will be crucial to ensuring that the emigrants lead good lives.',\n", | |
" 'However, even as economics has moved to focus on institutions, the literature on cities has focused instead on policy outcomes, rent control, zoning, and public transportation.\\n',\n", | |
" 'The process of governance is important for two reasons.',\n", | |
" 'First, we cannot know what the ideal policy is.',\n", | |
" 'Constraints differ in time and place.',\n", | |
" 'Second, even with omniscient mayors knowing ideal policies, there is little reason to expect them to implement those ideal policies.\\n',\n", | |
" 'So rather than focusing on outcomes, we should focus on how to achieve those outcomes.',\n", | |
" 'What conditions are necessary to produce the optimal amount of public goods in a city?',\n", | |
" 'Asking what is the ideal level of police, street sweepers, and garbage men is just as absurd as asking, \"what is the ideal amount of shoe production?\" We simply don\\xe2\\x80\\x99t know.',\n", | |
" 'Markets constantly adjust between supply and demand, seeking this ideal level.\\n',\n", | |
" 'Of course, cities are not like shoes.',\n", | |
" 'They are far more complex.',\n", | |
" 'Disentangling the marginal benefits of public transportation, the police, or garbage disposal is extremely difficult.',\n", | |
" 'Further, as cities are spatially oriented, the application of the laws of economics differs from how we usually think of economic goods.']" | |
] | |
} | |
], | |
"prompt_number": 20 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Homework" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- Reformat the text, so that there are two spaces before sections, one after section start, and one between paragraphs.\n", | |
"- Report the number of sentences per section.\n", | |
"- Write sections into file named *{two-digit-section-number}-{section-name}.txt* e.g. *01-Private-Cities-101.txt*" | |
] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
# Script export of an IPython notebook (Python 2 syntax: `print` is a
# statement).  Every "# In[n]:" marker starts one notebook cell; a bare
# expression at the end of a cell is what the notebook displayed as output.

# In[1]:
# Show the working directory and its contents, to check that the relative
# path 'data/private-cities.txt' used below can be resolved.
import os
print os.getcwd()
os.listdir('.')

# In[2]:
# Read the whole text into a list of lines (line endings are kept).
with open('data/private-cities.txt') as f:
    lines = f.readlines()

# In[3]:
# headlines 1
# Heuristic: treat every line shorter than 80 characters as a headline.
[line for line in lines if len(line) < 80]

# In[4]:
# headlines 2
# Alternative heuristic: a headline is a line that contains no period.
[line for line in lines if '.' not in line]

# In[5]:
# reading line by line, another way
# Iterating over the file object yields its lines without readlines().
with open('data/private-cities.txt') as f:
    sections = [line for line in f if len(line) < 80]

# In[6]:
sections

# In[7]:
# number of characters
# NOTE: in Python 2 read() returns a byte string, so this is a byte count.
with open('data/private-cities.txt') as f:
    print len(f.read())

# In[8]:
# number of lines
with open('data/private-cities.txt') as f:
    print len(f.readlines())

# In[9]:
# number of words
# split() without arguments splits on any run of whitespace.
with open('data/private-cities.txt') as f:
    words = f.read().split()

print len(words)

# In[10]:
# length of the longest word
max_length = max(len(w) for w in words)
print max_length

# In[11]:
# longest word(s)
[word for word in words if len(word) == max_length]

# In[12]:
# is the previous solution correct?
# Look at the words that are 4 shorter than the maximum; punctuation stays
# attached to the words because they were only split on whitespace.
long_words4 = [word for word in words if len(word) == max_length - 4]
long_words4

# In[13]:
# duplicates!
# A set keeps each distinct word only once.
set(long_words4)
# In[14]: | |
# leggyakoribb szavak | |
word_count = {} | |
for word in words: | |
if word in word_count: | |
previous_count = word_count[word] | |
else: | |
previous_count = 0 | |
word_count[word] = previous_count + 1 | |
count_to_words = {} | |
for word, count in word_count.items(): | |
if count in count_to_words: | |
word_list = count_to_words[count] | |
else: | |
word_list = [] | |
count_to_words[count] = word_list | |
word_list.append(word) | |
highest_count = max(count_to_words) | |
print highest_count, count_to_words[highest_count] | |
high_counts = sorted(count_to_words, reverse=True)[:20] | |
print high_counts | |
for count in high_counts: | |
print count, count_to_words[count] | |
# In[15]:
# Read the whole file into a single string.
with open('data/private-cities.txt') as f:
    text = f.read()

# In[16]:
# Check which sentence-ending punctuation marks occur in the text at all.
'!' in text

# In[17]:
'?' in text

# In[18]:
# splitlines() drops the line endings, unlike readlines().
text.splitlines()[:5]

# In[19]:
# Print one line of the text (index 5 of the list read in cell 2).
print lines[5]
# In[20]: | |
# sentences | |
def sentences(lines, separations=(('. ', '.'), ('? ', '?'))):
    """Split every line into sentence-like fragments.

    Each ``(separator, tail)`` pair in *separations* is applied in turn to
    all fragments produced so far: a fragment is cut wherever *separator*
    occurs, and *tail* is put back at the end of every piece except the last
    one (see ``split_by_separator``).

    Only literal occurrences of a separator are recognised, so with the
    defaults a sentence ending in ``?" `` is not split, and line endings
    present in *lines* are kept on the fragments.

    Args:
        lines: iterable of strings, e.g. the result of ``file.readlines()``.
        separations: iterable of ``(separator, tail)`` string pairs.  The
            default splits after ``'. '`` and ``'? '``.

    Returns:
        A new list with the fragments in their original order.
    """
    # Work on our own list so the caller's object is never returned as-is.
    fragments = list(lines)
    for separator, tail in separations:
        # Each pass re-splits the output of the previous pass.
        split_pass = []
        for fragment in fragments:
            split_pass.extend(split_by_separator(fragment, separator, tail))
        fragments = split_pass
    return fragments
def split_by_separator(line, separator, tail):
    """Cut *line* at every *separator* and re-attach *tail* to the pieces.

    Every piece except the last gets *tail* appended (restoring the
    punctuation that ``str.split`` removed); the last piece is returned
    unchanged, even when it is empty.

    Returns:
        A list of strings; it always has at least one element.
    """
    pieces = line.split(separator)
    result = []
    for piece in pieces[:-1]:
        result.append(piece + tail)
    # The final piece was not followed by a separator, so it gets no tail.
    result.append(pieces[-1])
    return result
# Preview: the first 20 fragments produced from the lines read in cell 2.
sentences(lines)[:20]
## Homework | |
# - Reformat the text, so that there are two spaces before sections, one after section start, and one between paragraphs. | |
# - Report the number of sentences per section. | |
# - Write sections into file named *{two-digit-section-number}-{section-name}.txt* e.g. *01-Private-Cities-101.txt* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
tools | |
ipython | |
- live environment | |
- completion on names | |
- help on names | |
ipython notebook | |
- ipython in browser | |
- later in course | |
http://pythontutor.com/visualize.html#mode=edit | |
dict (map, mapping) | |
key -> value | |
- keys are immutable - tuples vs lists | |
- value - anything | |
- create | |
- empty: {} | |
- {'a': 1, 1: 'a'} | |
- dict(a=2, b=4) | |
- dict( | |
( | |
(1, 2), | |
(3, 4), | |
(5, 6) | |
) | |
) | |
- value access - normal indexing: | |
d[key] | |
- setting value: | |
d[key] = value | |
- deleting value: | |
del d[key] | |
- key existence check: | |
key in d | |
- get with default value: | |
d.get(key, default_if_key_unknown) | |
- all keys: | |
d.keys() | |
- iterate over keys: | |
for key in d: | |
d[key] | |
# keys are NOT ordered: | |
for key in dict(a=1, b=2, c=3, d=4): print key | |
- can be thought of as a finite function | |
set | |
- values are immutable! | |
.add | |
.union | |
.difference | |
.intersection | |
set vs dict | |
sorting: | |
- inplace | |
list.sort() | |
- new list | |
sorted(iterable) | |
File IO | |
reading: | |
f.read() | |
f.readlines() | |
for line in f: | |
... | |
writing: | |
f.write(what) | |
f.flush() | |
Predefined files | |
sys.stdin | |
sys.stdout | |
sys.stderr | |
FileSystem | |
os.listdir(dir) -> filenames | |
open(filename, mode) -> file | |
file.read() -> text | |
file.readlines() -> [lines] | |
file.write(what) | |
file.close() | |
file.close() | |
with open() as f: | |
f.read() | |
with open() as f: | |
f.write() | |
os.remove() | |
shutil.rmtree() | |
??? where to get filenames ??? | |
- scripts embed in the source | |
- tools | |
- from command line | |
sys.argv | |
- known configuration file name |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from random import shuffle | |
# Multi-character sequences that convert_text_to_list keeps together as one
# unit (presumably the Hungarian digraphs and trigraph -- TODO confirm).
# Entries are lowercase; lookups lowercase the text before comparing.
DOUBLE = (u'cs', u'dz', u'gy', u'ly', u'ny', u'sz', u'ty', u'zs')
TRIPLE = (u'dzs',)
def typoglicemia(text):
    """Return *text* with every word scrambled by ``word_typoglicemia``.

    The text is cut into words with ``split_to_words`` and the scrambled
    words are joined back together with single spaces.
    """
    scrambled = []
    for word in split_to_words(text):
        scrambled.append(word_typoglicemia(word))
    return u' '.join(scrambled)
def word_typoglicemia(word):
    """Scramble one word and return the result as a string.

    The word is first broken into units by ``convert_text_to_list``, the
    units are rearranged by ``list_typoglicemia``, and the outcome is glued
    back together without any separator.
    """
    letters = list_typoglicemia(convert_text_to_list(word))
    return u''.join(letters)
def list_typoglicemia(characters):
    """Shuffle the interior of a list, keeping its first and last items fixed.

    Lists with fewer than 4 items are returned as they are (the very same
    object), because they have at most two interior items.  Longer lists
    yield a new list; the argument itself is not modified.
    """
    if len(characters) < 4:
        return characters
    inner = characters[1:-1]
    # shuffle() rearranges its argument in place and returns None.
    shuffle(inner)
    scrambled = characters[:1]
    scrambled.extend(inner)
    scrambled.append(characters[-1])
    return scrambled
def convert_text_to_list(text):
    """Split *text* into units, keeping known multi-character units whole.

    Scanning left to right, the longest match wins: a 3-character sequence
    listed in ``TRIPLE`` is taken first, then a 2-character sequence listed
    in ``DOUBLE``, otherwise a single character.  Matching ignores case, but
    the returned units keep the original casing.

    The scan is iterative, so arbitrarily long input works without hitting
    the interpreter's recursion limit.

    Returns:
        A list of substrings of *text* whose concatenation equals *text*;
        an empty list for empty input.
    """
    units = []
    position = 0
    length = len(text)
    while position < length:
        # important: try triple before double before single, so that 'dzs'
        # does not become 'dz' + 's', and 'dz' does not become 'd' + 'z'
        if text[position:position + 3].lower() in TRIPLE:
            width = 3
        elif text[position:position + 2].lower() in DOUBLE:
            width = 2
        else:
            width = 1
        units.append(text[position:position + width])
        position += width
    return units
def split_to_words(text):
    """Return the whitespace-separated words of *text* as a list.

    Runs of whitespace count as one separator and leading/trailing
    whitespace is ignored, so no empty words are produced.
    """
    words = text.split()
    return words
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment