e3krisztian/class6.ipynb

## class6.ipynb
{
 "metadata": {
  "name": "",
  "signature": "sha256:3b36f2d6af5111b8c1ec8af58cc6a5a3e4fb34050d2c2ad4cf4f1d1a4236673f"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import os\n",
      "print os.getcwd()\n",
      "os.listdir('.')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "/home/kr/ceu/adatmesterseg\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 1,
       "text": [
        "['.ipynb_checkpoints',\n",
        " 'data',\n",
        " 'README.md',\n",
        " 'code',\n",
        " 'LICENSE',\n",
        " 'class6.ipynb',\n",
        " 'assignment',\n",
        " '.git']"
       ]
      }
     ],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "with open('data/private-cities.txt') as f:\n",
      "    lines = f.readlines()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# headlines 1\n",
      "[line for line in lines if len(line) < 80]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "['Private Cities 101\\n',\n",
        " 'The 21st century will be the century of cities.\\n',\n",
        " 'Proprietary communities\\n',\n",
        " 'The problem\\n',\n",
        " 'Police and justice provision\\n',\n",
        " 'Institutional change\\n']"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# headlines 2\n",
      "[line for line in lines if '.' not in line]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "['Private Cities 101\\n',\n",
        " 'Proprietary communities\\n',\n",
        " 'The problem\\n',\n",
        " 'Police and justice provision\\n',\n",
        " 'Institutional change\\n']"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# beolvasas soronkent mashogy\n",
      "with open('data/private-cities.txt') as f:\n",
      "    sections = [line for line in f if len(line) < 80]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sections"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 6,
       "text": [
        "['Private Cities 101\\n',\n",
        " 'The 21st century will be the century of cities.\\n',\n",
        " 'Proprietary communities\\n',\n",
        " 'The problem\\n',\n",
        " 'Police and justice provision\\n',\n",
        " 'Institutional change\\n']"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# karakterek szama\n",
      "with open('data/private-cities.txt') as f:\n",
      "    print len(f.read())"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "15897\n"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# sorok szama\n",
      "with open('data/private-cities.txt') as f:\n",
      "    print len(f.readlines())"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "46\n"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# szavak szama\n",
      "with open('data/private-cities.txt') as f:\n",
      "    words = f.read().split()\n",
      "\n",
      "print len(words)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "2497\n"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# leghosszabb szo hossza\n",
      "max_length = max(len(w) for w in words)\n",
      "print max_length"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "17\n"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# leghosszabb szo/szavak\n",
      "[word for word in words if len(word) == max_length]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 11,
       "text": [
        "['self-replication.']"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# jo az elozo megoldas?\n",
      "long_words4 = [word for word in words if len(word) == max_length - 4]\n",
      "long_words4"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 12,
       "text": [
        "['Understanding',\n",
        " 'institutions,',\n",
        " 'Disentangling',\n",
        " 'institutional',\n",
        " 'intervention.',\n",
        " 'traditionally',\n",
        " 'developer\\xe2\\x80\\x99s',\n",
        " 'demonstrating',\n",
        " 'alternatives.',\n",
        " 'Schools\\xe2\\x80\\x94not',\n",
        " 'corresponding',\n",
        " 'international',\n",
        " 'international',\n",
        " 'Institutional',\n",
        " 'institutional',\n",
        " 'expropriation',\n",
        " 'institutional',\n",
        " 'privatization',\n",
        " 'independently',\n",
        " 'institutional',\n",
        " 'Institutional',\n",
        " 'concentrated.',\n",
        " 'concentrating',\n",
        " 'institutional',\n",
        " '\\xe2\\x80\\x9cAbrogating',\n",
        " 'institutional']"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# ismetlodesek!\n",
      "set(long_words4)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 13,
       "text": [
        "{'Disentangling',\n",
        " 'Institutional',\n",
        " 'Schools\\xe2\\x80\\x94not',\n",
        " 'Understanding',\n",
        " 'alternatives.',\n",
        " 'concentrated.',\n",
        " 'concentrating',\n",
        " 'corresponding',\n",
        " 'demonstrating',\n",
        " 'developer\\xe2\\x80\\x99s',\n",
        " 'expropriation',\n",
        " 'independently',\n",
        " 'institutional',\n",
        " 'institutions,',\n",
        " 'international',\n",
        " 'intervention.',\n",
        " 'privatization',\n",
        " 'traditionally',\n",
        " '\\xe2\\x80\\x9cAbrogating'}"
       ]
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# leggyakoribb szavak\n",
      "word_count = {}\n",
      "for word in words:\n",
      "    if word in word_count:\n",
      "        previous_count = word_count[word]\n",
      "    else:\n",
      "        previous_count = 0\n",
      "    word_count[word] = previous_count + 1\n",
      "\n",
      "count_to_words = {}\n",
      "\n",
      "for word, count in word_count.items():\n",
      "    if count in count_to_words:\n",
      "        word_list = count_to_words[count]\n",
      "    else:\n",
      "        word_list = []\n",
      "        count_to_words[count] = word_list\n",
      "    word_list.append(word)\n",
      "\n",
      "highest_count = max(count_to_words)\n",
      "print highest_count, count_to_words[highest_count]\n",
      "high_counts = sorted(count_to_words, reverse=True)[:20]\n",
      "print high_counts\n",
      "for count in high_counts:\n",
      "    print count, count_to_words[count]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "129 ['the']\n",
        "[129, 95, 74, 56, 51, 41, 35, 30, 28, 27, 26, 22, 20, 19, 17, 15, 14, 13, 12, 11]\n",
        "129 ['the']\n",
        "95 ['of']\n",
        "74 ['to']\n",
        "56 ['a']\n",
        "51 ['is']\n",
        "41 ['and']\n",
        "35 ['are']\n",
        "30 ['in']\n",
        "28 ['that']\n",
        "27 ['cities']\n",
        "26 ['public']\n",
        "22 ['private']\n",
        "20 ['they']\n",
        "19 ['for']\n",
        "17 ['by', 'on']\n",
        "15 ['would']\n",
        "14 ['not', 'The', 'people']\n",
        "13 ['economic', 'as']\n",
        "12 ['more']\n",
        "11 ['change']\n"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "with open('data/private-cities.txt') as f:\n",
      "    text = f.read()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "'!' in text"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 16,
       "text": [
        "False"
       ]
      }
     ],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "'?' in text"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 17,
       "text": [
        "True"
       ]
      }
     ],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "text.splitlines()[:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 18,
       "text": [
        "['Private Cities 101',\n",
        " 'The 21st century will be the century of cities.',\n",
        " 'Over the next 30 years, 1.8 billion people are expected to move to cities in developing countries. While some will add to existing cities, others will migrate to small towns, transforming them into the megapolises of tomorrow. Shenzhen, for example, was a small fishing village of 300,000 people in 1980. Since being designated a special economic zone that year, it has grown to over 10 million inhabitants.',\n",
        " 'Understanding the best form of city governance will be crucial to ensuring that the emigrants lead good lives. However, even as economics has moved to focus on institutions, the literature on cities has focused instead on policy outcomes, rent control, zoning, and public transportation.',\n",
        " 'The process of governance is important for two reasons. First, we cannot know what the ideal policy is. Constraints differ in time and place. Second, even with omniscient mayors knowing ideal policies, there is little reason to expect them to implement those ideal policies.']"
       ]
      }
     ],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print lines[5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "So rather than focusing on outcomes, we should focus on how to achieve those outcomes. What conditions are necessary to produce the optimal amount of public goods in a city? Asking what is the ideal level of police, street sweepers, and garbage men is just as absurd as asking, \"what is the ideal amount of shoe production?\" We simply don\u2019t know. Markets constantly adjust between supply and demand, seeking this ideal level.\n",
        "\n"
       ]
      }
     ],
     "prompt_number": 19
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# sentences\n",
      "def sentences(lines):\n",
      "    separations=(\n",
      "        ('. ', '.'),\n",
      "        ('? ', '?'),\n",
      "    )\n",
      "    for separator, tail in separations:\n",
      "        sentences = []\n",
      "        for line in lines:\n",
      "            sentences += split_by_separator(line, separator, tail)\n",
      "        lines = sentences\n",
      "    return lines\n",
      "\n",
      "def split_by_separator(line, separator, tail):\n",
      "    fragments = line.split(separator)\n",
      "    sentences = [\n",
      "        s + tail\n",
      "        for s in fragments[:-1]\n",
      "    ] + fragments[-1:]\n",
      "    return sentences\n",
      "    \n",
      "sentences(lines)[:20]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 20,
       "text": [
        "['Private Cities 101\\n',\n",
        " 'The 21st century will be the century of cities.\\n',\n",
        " 'Over the next 30 years, 1.8 billion people are expected to move to cities in developing countries.',\n",
        " 'While some will add to existing cities, others will migrate to small towns, transforming them into the megapolises of tomorrow.',\n",
        " 'Shenzhen, for example, was a small fishing village of 300,000 people in 1980.',\n",
        " 'Since being designated a special economic zone that year, it has grown to over 10 million inhabitants.\\n',\n",
        " 'Understanding the best form of city governance will be crucial to ensuring that the emigrants lead good lives.',\n",
        " 'However, even as economics has moved to focus on institutions, the literature on cities has focused instead on policy outcomes, rent control, zoning, and public transportation.\\n',\n",
        " 'The process of governance is important for two reasons.',\n",
        " 'First, we cannot know what the ideal policy is.',\n",
        " 'Constraints differ in time and place.',\n",
        " 'Second, even with omniscient mayors knowing ideal policies, there is little reason to expect them to implement those ideal policies.\\n',\n",
        " 'So rather than focusing on outcomes, we should focus on how to achieve those outcomes.',\n",
        " 'What conditions are necessary to produce the optimal amount of public goods in a city?',\n",
        " 'Asking what is the ideal level of police, street sweepers, and garbage men is just as absurd as asking, \"what is the ideal amount of shoe production?\" We simply don\\xe2\\x80\\x99t know.',\n",
        " 'Markets constantly adjust between supply and demand, seeking this ideal level.\\n',\n",
        " 'Of course, cities are not like shoes.',\n",
        " 'They are far more complex.',\n",
        " 'Disentangling the marginal benefits of public transportation, the police, or garbage disposal is extremely difficult.',\n",
        " 'Further, as cities are spatially oriented, the application of the laws of economics differs from how we usually think of economic goods.']"
       ]
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Homework"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "- Reformat the text, so that there are two spaces before sections, one after section start, and one between paragraphs.\n",
      "- Report the number of sentences per section.\n",
      "- Write sections into file named *{two-digit-section-number}-{section-name}.txt* e.g. *01-Private-Cities-101.txt*"
     ]
    }
   ],
   "metadata": {}
  }
 ]
}

## class6.py

# coding: utf-8

# In[1]:

# Show the current working directory and its entries, so the relative
# 'data/...' paths used by the following cells can be checked.
import os
print os.getcwd()
os.listdir('.')


# In[2]:

# Read the whole article into a list of lines; readlines() keeps the
# trailing newline of every line.
with open('data/private-cities.txt') as f:
    lines = f.readlines()


# In[3]:

# headlines 1: treat every line shorter than 80 characters as a headline
# (the length includes the trailing newline).
# A bare expression like this is only displayed when run as a notebook cell.
[line for line in lines if len(line) < 80]


# In[4]:

# headlines 2: treat every line that contains no '.' as a headline
[line for line in lines if '.' not in line]


# In[5]:

# reading line by line, another way: iterate over the file object directly
# instead of calling readlines() first
with open('data/private-cities.txt') as f:
    sections = [line for line in f if len(line) < 80]


# In[6]:

# Display the lines selected by the "shorter than 80" rule above.
sections


# In[7]:

# number of characters
# NOTE(review): the recorded notebook output shows UTF-8 byte escapes, so under
# Python 2 this presumably counts bytes rather than characters - confirm.
with open('data/private-cities.txt') as f:
    print len(f.read())


# In[8]:

# number of lines
with open('data/private-cities.txt') as f:
    print len(f.readlines())


# In[9]:

# number of words: split() without arguments splits on any run of whitespace
with open('data/private-cities.txt') as f:
    words = f.read().split()

print len(words)


# In[10]:

# length of the longest word
max_length = max(len(w) for w in words)
print max_length


# In[11]:

# longest word(s): every word whose length equals the maximum
[word for word in words if len(word) == max_length]


# In[12]:

# is the previous solution correct?
# The "words" still carry adjacent punctuation, because they were split on
# whitespace only; look at slightly shorter words (max_length - 4) as well.
long_words4 = [word for word in words if len(word) == max_length - 4]
long_words4


# In[13]:

# duplicates! - a set keeps each distinct word only once
set(long_words4)


# In[14]:

# leggyakoribb szavak
word_count = {}
for word in words:
    if word in word_count:
        previous_count = word_count[word]
    else:
        previous_count = 0
    word_count[word] = previous_count + 1

count_to_words = {}

for word, count in word_count.items():
    if count in count_to_words:
        word_list = count_to_words[count]
    else:
        word_list = []
        count_to_words[count] = word_list
    word_list.append(word)

highest_count = max(count_to_words)
print highest_count, count_to_words[highest_count]
high_counts = sorted(count_to_words, reverse=True)[:20]
print high_counts
for count in high_counts:
    print count, count_to_words[count]


# In[15]:

# Read the whole article as one string.
with open('data/private-cities.txt') as f:
    text = f.read()


# In[16]:

# Does the text contain any exclamation mark?
'!' in text


# In[17]:

# Does the text contain any question mark?
'?' in text


# In[18]:

# First five lines; unlike readlines(), splitlines() drops the line endings.
text.splitlines()[:5]


# In[19]:

# Print the line at index 5 (the sixth line) of the article.
print lines[5]


# In[20]:

# sentences
def sentences(lines):
    """Split every line into sentence-like fragments and return them as a list.

    The lines are cut after each '. ' and then after each '? '. The space
    that follows the punctuation is dropped, the punctuation itself is kept
    at the end of the preceding fragment. Anything else (including a trailing
    newline) stays untouched in the fragment it belongs to.
    """
    separations = (
        ('. ', '.'),
        ('? ', '?'),
    )
    pieces = lines
    # One pass per separator: the output of a pass is the input of the next.
    for separator, tail in separations:
        refined = []
        for piece in pieces:
            refined.extend(split_by_separator(piece, separator, tail))
        pieces = refined
    return pieces

def split_by_separator(line, separator, tail):
    """Split *line* at every *separator* and return the fragments as a list.

    Each fragment that was followed by a separator gets *tail* appended in
    place of the removed separator; the last fragment is returned as it is.
    """
    pieces = line.split(separator)
    # Everything except the last piece was terminated by the separator.
    closed = [piece + tail for piece in pieces[:-1]]
    closed.extend(pieces[-1:])
    return closed

# Preview: the first 20 fragments produced from the lines read earlier.
sentences(lines)[:20]


## Homework

# - Reformat the text, so that there are two blank lines before each section heading, one blank line after it, and one blank line between paragraphs.
# - Report the number of sentences per section.
# - Write sections into file named *{two-digit-section-number}-{section-name}.txt* e.g. *01-Private-Cities-101.txt*

## plan.txt
tools
	ipython
		- live environment
		- completion on names
		- help on names

	ipython notebook
		- ipython in browser
		- later in course

	http://pythontutor.com/visualize.html#mode=edit


dict (map, mapping)
	key -> value

	- keys must be immutable (hashable) - e.g. tuples can be keys, lists cannot
	- value - anything

	- create
		- empty: {}
		- {'a': 1, 1: 'a'}
		- dict(a=2, b=4)
		- dict(
				(
				(1, 2),
				(3, 4),
				(5, 6)
				)
			)
	- value access - normal indexing:
		d[key]
	- setting value:
		d[key] = value
	- deleting value:
		del d[key]
	- key existence check:
		key in d
	- get with default value:
		d.get(key, default_if_key_unknown)
	- all keys:
		d.keys()
	- iterate over keys:
		for key in d:
			d[key]
		# keys are NOT ordered:
		for key in dict(a=1, b=2, c=3, d=4): print key
	- a dict can be thought of as a finite function (key -> value)


set
	- elements must be immutable (hashable)!

	.add
	.union
	.difference
	.intersection

set vs dict

sorting:
	- inplace
		list.sort()
	- new list
		sorted(iterable)

File IO
	reading:
		f.read()
		f.readlines()
		for line in f:
			...

	writing:
		f.write(what)
		f.flush()


Predefined files
	sys.stdin
	sys.stdout
	sys.stderr


FileSystem
	os.listdir(dir) -> filenames

	open(filename, mode) -> file
		file.read() -> text
		file.readlines() -> [lines]
		file.write(what)
		file.close()

	file.close()

	with open() as f:
		f.read()

	with open() as f:
		f.write()

	os.remove()
		shutil.rmtree()


??? where to get filenames ???
	- scripts: file names are embedded in the source
	- tools
		- from command line
			sys.argv
		- known configuration file name

## typoglicemia.py
from random import shuffle

# Multi-character sequences that convert_text_to_list keeps together as a
# single unit instead of splitting them into individual characters.
# NOTE(review): these look like the Hungarian digraphs and trigraph - confirm.
DOUBLE = (u'cs', u'dz', u'gy', u'ly', u'ny', u'sz', u'ty', u'zs')
TRIPLE = (u'dzs',)


def typoglicemia(text):
    """Return *text* with the inside of every word scrambled.

    The text is split into words, each word is scrambled on its own and the
    results are joined with single spaces, so the original whitespace between
    the words is not preserved.
    """
    scrambled_words = []
    for word in split_to_words(text):
        scrambled_words.append(word_typoglicemia(word))
    return u' '.join(scrambled_words)


def word_typoglicemia(word):
    """Return *word* with its inner units shuffled, first and last kept.

    The word is first broken into units by convert_text_to_list, the unit
    list is scrambled by list_typoglicemia and then glued back together.
    """
    return u''.join(list_typoglicemia(convert_text_to_list(word)))


def list_typoglicemia(characters):
    """Return the list with its interior randomly reordered.

    The first and the last element stay where they are. Lists with fewer
    than four elements are handed back unchanged (the very same object);
    longer input is left unmodified and a new list is returned.
    """
    # nothing worth reshuffling in a short list
    if len(characters) < 4:
        return characters

    inner = characters[1:-1]
    # 'shuffle' works in place on the copy and returns None
    shuffle(inner)
    return [characters[0]] + inner + [characters[-1]]


def convert_text_to_list(text):
    """Break *text* into a list of units, keeping multi-character units whole.

    At every position the longest matching unit wins: a sequence listed in
    TRIPLE, otherwise one listed in DOUBLE, otherwise a single character.
    Matching ignores case, but the units are returned exactly as they appear
    in *text*. Empty text gives an empty list.

    The text is scanned iteratively, so the input length is not limited by
    the interpreter's recursion depth and no intermediate copies of the
    remaining text are made.
    """
    units = []
    position = 0
    while position < len(text):
        # important: try triple before double before single
        # so that 'dzs' does not become 'dz' + 's', or 'dz' to 'd' + 'z'
        for width, known_units in ((3, TRIPLE), (2, DOUBLE)):
            unit = text[position:position + width]
            if unit.lower() in known_units:
                break
        else:
            # no multi-character unit starts here
            unit = text[position]
        units.append(unit)
        position += len(unit)
    return units


def split_to_words(text):
    """Return the words of *text*, split on runs of whitespace."""
    words = text.split()
    return words
	{
	"metadata": {
	"name": "",
	"signature": "sha256:3b36f2d6af5111b8c1ec8af58cc6a5a3e4fb34050d2c2ad4cf4f1d1a4236673f"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import os\n",
	"print os.getcwd()\n",
	"os.listdir('.')"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"/home/kr/ceu/adatmesterseg\n"
	]
	},
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 1,
	"text": [
	"['.ipynb_checkpoints',\n",
	" 'data',\n",
	" 'README.md',\n",
	" 'code',\n",
	" 'LICENSE',\n",
	" 'class6.ipynb',\n",
	" 'assignment',\n",
	" '.git']"
	]
	}
	],
	"prompt_number": 1
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"with open('data/private-cities.txt') as f:\n",
	" lines = f.readlines()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# headlines 1\n",
	"[line for line in lines if len(line) < 80]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 3,
	"text": [
	"['Private Cities 101\\n',\n",
	" 'The 21st century will be the century of cities.\\n',\n",
	" 'Proprietary communities\\n',\n",
	" 'The problem\\n',\n",
	" 'Police and justice provision\\n',\n",
	" 'Institutional change\\n']"
	]
	}
	],
	"prompt_number": 3
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# headlines 2\n",
	"[line for line in lines if '.' not in line]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 4,
	"text": [
	"['Private Cities 101\\n',\n",
	" 'Proprietary communities\\n',\n",
	" 'The problem\\n',\n",
	" 'Police and justice provision\\n',\n",
	" 'Institutional change\\n']"
	]
	}
	],
	"prompt_number": 4
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# beolvasas soronkent mashogy\n",
	"with open('data/private-cities.txt') as f:\n",
	" sections = [line for line in f if len(line) < 80]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 5
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"sections"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 6,
	"text": [
	"['Private Cities 101\\n',\n",
	" 'The 21st century will be the century of cities.\\n',\n",
	" 'Proprietary communities\\n',\n",
	" 'The problem\\n',\n",
	" 'Police and justice provision\\n',\n",
	" 'Institutional change\\n']"
	]
	}
	],
	"prompt_number": 6
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# karakterek szama\n",
	"with open('data/private-cities.txt') as f:\n",
	" print len(f.read())"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"15897\n"
	]
	}
	],
	"prompt_number": 7
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# sorok szama\n",
	"with open('data/private-cities.txt') as f:\n",
	" print len(f.readlines())"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"46\n"
	]
	}
	],
	"prompt_number": 8
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# szavak szama\n",
	"with open('data/private-cities.txt') as f:\n",
	" words = f.read().split()\n",
	"\n",
	"print len(words)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"2497\n"
	]
	}
	],
	"prompt_number": 9
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# leghosszabb szo hossza\n",
	"max_length = max(len(w) for w in words)\n",
	"print max_length"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"17\n"
	]
	}
	],
	"prompt_number": 10
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# leghosszabb szo/szavak\n",
	"[word for word in words if len(word) == max_length]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 11,
	"text": [
	"['self-replication.']"
	]
	}
	],
	"prompt_number": 11
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# jo az elozo megoldas?\n",
	"long_words4 = [word for word in words if len(word) == max_length - 4]\n",
	"long_words4"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 12,
	"text": [
	"['Understanding',\n",
	" 'institutions,',\n",
	" 'Disentangling',\n",
	" 'institutional',\n",
	" 'intervention.',\n",
	" 'traditionally',\n",
	" 'developer\\xe2\\x80\\x99s',\n",
	" 'demonstrating',\n",
	" 'alternatives.',\n",
	" 'Schools\\xe2\\x80\\x94not',\n",
	" 'corresponding',\n",
	" 'international',\n",
	" 'international',\n",
	" 'Institutional',\n",
	" 'institutional',\n",
	" 'expropriation',\n",
	" 'institutional',\n",
	" 'privatization',\n",
	" 'independently',\n",
	" 'institutional',\n",
	" 'Institutional',\n",
	" 'concentrated.',\n",
	" 'concentrating',\n",
	" 'institutional',\n",
	" '\\xe2\\x80\\x9cAbrogating',\n",
	" 'institutional']"
	]
	}
	],
	"prompt_number": 12
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# ismetlodesek!\n",
	"set(long_words4)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 13,
	"text": [
	"{'Disentangling',\n",
	" 'Institutional',\n",
	" 'Schools\\xe2\\x80\\x94not',\n",
	" 'Understanding',\n",
	" 'alternatives.',\n",
	" 'concentrated.',\n",
	" 'concentrating',\n",
	" 'corresponding',\n",
	" 'demonstrating',\n",
	" 'developer\\xe2\\x80\\x99s',\n",
	" 'expropriation',\n",
	" 'independently',\n",
	" 'institutional',\n",
	" 'institutions,',\n",
	" 'international',\n",
	" 'intervention.',\n",
	" 'privatization',\n",
	" 'traditionally',\n",
	" '\\xe2\\x80\\x9cAbrogating'}"
	]
	}
	],
	"prompt_number": 13
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# leggyakoribb szavak\n",
	"word_count = {}\n",
	"for word in words:\n",
	" if word in word_count:\n",
	" previous_count = word_count[word]\n",
	" else:\n",
	" previous_count = 0\n",
	" word_count[word] = previous_count + 1\n",
	"\n",
	"count_to_words = {}\n",
	"\n",
	"for word, count in word_count.items():\n",
	" if count in count_to_words:\n",
	" word_list = count_to_words[count]\n",
	" else:\n",
	" word_list = []\n",
	" count_to_words[count] = word_list\n",
	" word_list.append(word)\n",
	"\n",
	"highest_count = max(count_to_words)\n",
	"print highest_count, count_to_words[highest_count]\n",
	"high_counts = sorted(count_to_words, reverse=True)[:20]\n",
	"print high_counts\n",
	"for count in high_counts:\n",
	" print count, count_to_words[count]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"129 ['the']\n",
	"[129, 95, 74, 56, 51, 41, 35, 30, 28, 27, 26, 22, 20, 19, 17, 15, 14, 13, 12, 11]\n",
	"129 ['the']\n",
	"95 ['of']\n",
	"74 ['to']\n",
	"56 ['a']\n",
	"51 ['is']\n",
	"41 ['and']\n",
	"35 ['are']\n",
	"30 ['in']\n",
	"28 ['that']\n",
	"27 ['cities']\n",
	"26 ['public']\n",
	"22 ['private']\n",
	"20 ['they']\n",
	"19 ['for']\n",
	"17 ['by', 'on']\n",
	"15 ['would']\n",
	"14 ['not', 'The', 'people']\n",
	"13 ['economic', 'as']\n",
	"12 ['more']\n",
	"11 ['change']\n"
	]
	}
	],
	"prompt_number": 14
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"with open('data/private-cities.txt') as f:\n",
	" text = f.read()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 15
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"'!' in text"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 16,
	"text": [
	"False"
	]
	}
	],
	"prompt_number": 16
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"'?' in text"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 17,
	"text": [
	"True"
	]
	}
	],
	"prompt_number": 17
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"text.splitlines()[:5]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 18,
	"text": [
	"['Private Cities 101',\n",
	" 'The 21st century will be the century of cities.',\n",
	" 'Over the next 30 years, 1.8 billion people are expected to move to cities in developing countries. While some will add to existing cities, others will migrate to small towns, transforming them into the megapolises of tomorrow. Shenzhen, for example, was a small fishing village of 300,000 people in 1980. Since being designated a special economic zone that year, it has grown to over 10 million inhabitants.',\n",
	" 'Understanding the best form of city governance will be crucial to ensuring that the emigrants lead good lives. However, even as economics has moved to focus on institutions, the literature on cities has focused instead on policy outcomes, rent control, zoning, and public transportation.',\n",
	" 'The process of governance is important for two reasons. First, we cannot know what the ideal policy is. Constraints differ in time and place. Second, even with omniscient mayors knowing ideal policies, there is little reason to expect them to implement those ideal policies.']"
	]
	}
	],
	"prompt_number": 18
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"print lines[5]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"So rather than focusing on outcomes, we should focus on how to achieve those outcomes. What conditions are necessary to produce the optimal amount of public goods in a city? Asking what is the ideal level of police, street sweepers, and garbage men is just as absurd as asking, \"what is the ideal amount of shoe production?\" We simply don\u2019t know. Markets constantly adjust between supply and demand, seeking this ideal level.\n",
	"\n"
	]
	}
	],
	"prompt_number": 19
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# sentences\n",
	"def sentences(lines):\n",
	" separations=(\n",
	" ('. ', '.'),\n",
	" ('? ', '?'),\n",
	" )\n",
	" for separator, tail in separations:\n",
	" sentences = []\n",
	" for line in lines:\n",
	" sentences += split_by_separator(line, separator, tail)\n",
	" lines = sentences\n",
	" return lines\n",
	"\n",
	"def split_by_separator(line, separator, tail):\n",
	" fragments = line.split(separator)\n",
	" sentences = [\n",
	" s + tail\n",
	" for s in fragments[:-1]\n",
	" ] + fragments[-1:]\n",
	" return sentences\n",
	" \n",
	"sentences(lines)[:20]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 20,
	"text": [
	"['Private Cities 101\\n',\n",
	" 'The 21st century will be the century of cities.\\n',\n",
	" 'Over the next 30 years, 1.8 billion people are expected to move to cities in developing countries.',\n",
	" 'While some will add to existing cities, others will migrate to small towns, transforming them into the megapolises of tomorrow.',\n",
	" 'Shenzhen, for example, was a small fishing village of 300,000 people in 1980.',\n",
	" 'Since being designated a special economic zone that year, it has grown to over 10 million inhabitants.\\n',\n",
	" 'Understanding the best form of city governance will be crucial to ensuring that the emigrants lead good lives.',\n",
	" 'However, even as economics has moved to focus on institutions, the literature on cities has focused instead on policy outcomes, rent control, zoning, and public transportation.\\n',\n",
	" 'The process of governance is important for two reasons.',\n",
	" 'First, we cannot know what the ideal policy is.',\n",
	" 'Constraints differ in time and place.',\n",
	" 'Second, even with omniscient mayors knowing ideal policies, there is little reason to expect them to implement those ideal policies.\\n',\n",
	" 'So rather than focusing on outcomes, we should focus on how to achieve those outcomes.',\n",
	" 'What conditions are necessary to produce the optimal amount of public goods in a city?',\n",
	" 'Asking what is the ideal level of police, street sweepers, and garbage men is just as absurd as asking, \"what is the ideal amount of shoe production?\" We simply don\\xe2\\x80\\x99t know.',\n",
	" 'Markets constantly adjust between supply and demand, seeking this ideal level.\\n',\n",
	" 'Of course, cities are not like shoes.',\n",
	" 'They are far more complex.',\n",
	" 'Disentangling the marginal benefits of public transportation, the police, or garbage disposal is extremely difficult.',\n",
	" 'Further, as cities are spatially oriented, the application of the laws of economics differs from how we usually think of economic goods.']"
	]
	}
	],
	"prompt_number": 20
	},
	{
	"cell_type": "heading",
	"level": 1,
	"metadata": {},
	"source": [
	"Homework"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- Reformat the text, so that there are two spaces before sections, one after section start, and one between paragraphs.\n",
	"- Report the number of sentences per section.\n",
	"- Write sections into file named {two-digit-section-number}-{section-name}.txt e.g. 01-Private-Cities-101.txt"
	]
	}
	],
	"metadata": {}
	}
	]
	}

	# coding: utf-8

	# In[1]:

	# Show the current working directory and list what it contains.
	import os
	print(os.getcwd())
	os.listdir('.')


	# In[2]:

	# Load the article as a list of lines.
	with open('data/private-cities.txt') as f:
	    lines = f.readlines()


	# In[3]:

	# headlines 1: treat every line shorter than 80 characters as a headline
	[line for line in lines if len(line) < 80]


	# In[4]:

	# headlines 2: treat every line that contains no '.' as a headline
	[line for line in lines if '.' not in line]


	# In[5]:

	# reading line by line, another way: iterate over the file object itself
	with open('data/private-cities.txt') as f:
	    sections = [line for line in f if len(line) < 80]


	# In[6]:

	sections


	# In[7]:

	# number of characters
	with open('data/private-cities.txt') as f:
	    print(len(f.read()))


	# In[8]:

	# number of lines
	with open('data/private-cities.txt') as f:
	    print(len(f.readlines()))


	# In[9]:

	# number of words (whitespace-separated tokens)
	with open('data/private-cities.txt') as f:
	    words = f.read().split()

	print(len(words))


	# In[10]:

	# length of the longest word
	max_length = max(map(len, words))
	print(max_length)


	# In[11]:

	# the longest word(s)
	[word for word in words if len(word) == max_length]


	# In[12]:

	# is the previous solution good? check words that are 4 characters shorter
	long_words4 = [word for word in words if len(word) == max_length - 4]
	long_words4


	# In[13]:

	# duplicates! a set keeps every word only once
	set(long_words4)


	# In[14]:

	# most frequent words
	# Step 1: word -> number of occurrences.
	word_count = {}
	for word in words:
	    word_count[word] = word_count.get(word, 0) + 1

	# Step 2: invert the mapping, occurrence count -> list of words with that count.
	count_to_words = {}

	for word, count in word_count.items():
	    count_to_words.setdefault(count, []).append(word)

	# Step 3: report the single highest count, then the 20 highest counts.
	highest_count = max(count_to_words)
	print('%s %s' % (highest_count, count_to_words[highest_count]))
	high_counts = sorted(count_to_words, reverse=True)[:20]
	print(high_counts)
	for count in high_counts:
	    print('%s %s' % (count, count_to_words[count]))


	# In[15]:

	# Read the whole article into a single string.
	with open('data/private-cities.txt') as f:
	    text = f.read()


	# In[16]:

	# Does the text contain an exclamation mark?
	'!' in text


	# In[17]:

	# Does the text contain a question mark?
	'?' in text


	# In[18]:

	# First five lines, without their line terminators.
	text.splitlines()[:5]


	# In[19]:

	# Sixth line of the article, taken from the list read earlier.
	print(lines[5])


	# In[20]:

	# sentences
	def sentences(lines):
	    """Break every line into sentence-like pieces and return them as one flat list.

	    The lines are first split at '. ' and the resulting pieces are then split
	    at '? '; the punctuation mark removed by each split is put back by
	    ``split_by_separator``.  Nothing is stripped, so a piece taken from the end
	    of a line keeps whatever trailing characters that line had.
	    """
	    separations = (
	        ('. ', '.'),
	        ('? ', '?'),
	    )
	    for separator, tail in separations:
	        # One pass per separator: the output of a pass is the input of the next.
	        lines = [
	            piece
	            for line in lines
	            for piece in split_by_separator(line, separator, tail)
	        ]
	    return lines

	def split_by_separator(line, separator, tail):
	    """Split *line* at every *separator*, appending *tail* to all pieces but the last.

	    The last piece is returned unchanged because no separator followed it.
	    """
	    pieces = line.split(separator)
	    last_index = len(pieces) - 1
	    restored = []
	    for index, piece in enumerate(pieces):
	        if index == last_index:
	            restored.append(piece)
	        else:
	            restored.append(piece + tail)
	    return restored

	# Preview: the first 20 pieces produced by sentences() for the loaded lines.
	sentences(lines)[:20]


	## Homework

	# - Reformat the text, so that there are two spaces before sections, one after section start, and one between paragraphs.
	# - Report the number of sentences per section.
	# - Write sections into file named {two-digit-section-number}-{section-name}.txt e.g. 01-Private-Cities-101.txt
	tools
	ipython
	- live environment
	- completion on names
	- help on names

	ipython notebook
	- ipython in browser
	- later in course

	http://pythontutor.com/visualize.html#mode=edit


	dict (map, mapping)
	key -> value

	- keys must be immutable (hashable) - a tuple can be a key, a list cannot
	- value - anything

	- create
	- empty: {}
	- {'a': 1, 1: 'a'}
	- dict(a=2, b=4)
	- dict(
	(
	(1, 2),
	(3, 4),
	(5, 6)
	)
	)
	- value access - normal indexing:
	d[key]
	- setting value:
	d[key] = value
	- deleting value:
	del d[key]
	- key existence check:
	key in d
	- get with default value:
	d.get(key, default_if_key_unknown)
	- all keys:
	d.keys()
	- iterate over keys:
	for key in d:
	d[key]
	# keys are NOT ordered:
	for key in dict(a=1, b=2, c=3, d=4): print key
	- can be thought of as a finite function


	set
	- elements must be immutable (hashable)!

	.add
	.union
	.difference
	.intersection

	set vs dict

	sorting:
	- inplace
	list.sort()
	- new list
	sorted(iterable)

	File IO
	reading:
	f.read()
	f.readlines()
	for line in f:
	...

	writing:
	f.write(what)
	f.flush()


	Predefined files
	sys.stdin
	sys.stdout
	sys.stderr


	FileSystem
	os.listdir(dir) -> filenames

	open(filename, mode) -> file
	file.read() -> text
	file.readlines() -> [lines]
	file.write(what)
	file.close()

	file.close()

	with open() as f:
	f.read()

	with open() as f:
	f.write()

	os.remove()
	shutil.rmtree()


	??? where to get filenames ???
	- embedded (hard-coded) in the script's source
	- tools
	- from command line
	sys.argv
	- known configuration file name
	from random import shuffle

	# Two-letter groups that convert_text_to_list keeps together as one list
	# element (presumably Hungarian digraphs - TODO confirm).
	DOUBLE = (u'cs', u'dz', u'gy', u'ly', u'ny', u'sz', u'ty', u'zs')
	# Three-letter group kept together the same way; it is tested before DOUBLE
	# so that 'dzs' is not broken into 'dz' + 's'.
	TRIPLE = (u'dzs',)


	def typoglicemia(text):
	    """Return *text* with every word scrambled by ``word_typoglicemia``.

	    The text is cut into words by ``split_to_words`` and the scrambled words
	    are joined back with single spaces.
	    """
	    scrambled_words = []
	    for word in split_to_words(text):
	        scrambled_words.append(word_typoglicemia(word))
	    return u' '.join(scrambled_words)


	def word_typoglicemia(word):
	    """Scramble a single word and return it as one string.

	    The word is broken into list elements by ``convert_text_to_list``, the
	    list is rearranged by ``list_typoglicemia`` and the elements are glued
	    back together.
	    """
	    scrambled = list_typoglicemia(convert_text_to_list(word))
	    return u''.join(scrambled)


	def list_typoglicemia(characters):
	    """Return the list with its inner elements shuffled, first and last kept in place.

	    Lists with fewer than 4 elements are returned as they are.  For longer
	    lists a new list is built; the argument itself is not modified.
	    """
	    # Guard: with fewer than 4 elements there is nothing worth reshuffling.
	    if len(characters) < 4:
	        return characters

	    first, inner, last = characters[0], characters[1:-1], characters[-1]
	    # shuffle() works in place on the sliced copy and returns None.
	    shuffle(inner)
	    return [first] + inner + [last]


	def convert_text_to_list(text):
	    """Break *text* into a list of elements, keeping multi-letter groups together.

	    At each position the longest matching group wins: a three-letter group
	    from TRIPLE, otherwise a two-letter group from DOUBLE, otherwise a single
	    character.  Matching ignores case, but the elements keep the original
	    casing.  An empty text gives an empty list.

	    The scan is iterative, so the length of the input is not limited by the
	    interpreter's recursion depth and the remaining text is not re-copied at
	    every step.
	    """
	    elements = []
	    position = 0
	    end = len(text)
	    while position < end:
	        # important: try triple before double before single
	        # so that 'dzs' does not become 'dz' + 's', or 'dz' to 'd' + 'z'
	        if text[position:position + 3].lower() in TRIPLE:
	            width = 3
	        elif text[position:position + 2].lower() in DOUBLE:
	            width = 2
	        else:
	            width = 1
	        elements.append(text[position:position + width])
	        position += width
	    return elements


	def split_to_words(text):
	    """Return the whitespace-separated words of *text* as a list."""
	    words = text.split()
	    return words