Skip to content

Instantly share code, notes, and snippets.

@e3krisztian
Last active August 29, 2015 14:17
Show Gist options
  • Save e3krisztian/458e486d5d937246906e to your computer and use it in GitHub Desktop.
Save e3krisztian/458e486d5d937246906e to your computer and use it in GitHub Desktop.
amc6
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:3b36f2d6af5111b8c1ec8af58cc6a5a3e4fb34050d2c2ad4cf4f1d1a4236673f"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import os\n",
"print os.getcwd()\n",
"os.listdir('.')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"/home/kr/ceu/adatmesterseg\n"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 1,
"text": [
"['.ipynb_checkpoints',\n",
" 'data',\n",
" 'README.md',\n",
" 'code',\n",
" 'LICENSE',\n",
" 'class6.ipynb',\n",
" 'assignment',\n",
" '.git']"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with open('data/private-cities.txt') as f:\n",
" lines = f.readlines()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# headlines 1\n",
"[line for line in lines if len(line) < 80]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"['Private Cities 101\\n',\n",
" 'The 21st century will be the century of cities.\\n',\n",
" 'Proprietary communities\\n',\n",
" 'The problem\\n',\n",
" 'Police and justice provision\\n',\n",
" 'Institutional change\\n']"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# headlines 2\n",
"[line for line in lines if '.' not in line]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"['Private Cities 101\\n',\n",
" 'Proprietary communities\\n',\n",
" 'The problem\\n',\n",
" 'Police and justice provision\\n',\n",
" 'Institutional change\\n']"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# beolvasas soronkent mashogy\n",
"with open('data/private-cities.txt') as f:\n",
" sections = [line for line in f if len(line) < 80]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sections"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"['Private Cities 101\\n',\n",
" 'The 21st century will be the century of cities.\\n',\n",
" 'Proprietary communities\\n',\n",
" 'The problem\\n',\n",
" 'Police and justice provision\\n',\n",
" 'Institutional change\\n']"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# karakterek szama\n",
"with open('data/private-cities.txt') as f:\n",
" print len(f.read())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"15897\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sorok szama\n",
"with open('data/private-cities.txt') as f:\n",
" print len(f.readlines())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"46\n"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# szavak szama\n",
"with open('data/private-cities.txt') as f:\n",
" words = f.read().split()\n",
"\n",
"print len(words)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"2497\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# leghosszabb szo hossza\n",
"max_length = max(len(w) for w in words)\n",
"print max_length"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"17\n"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# leghosszabb szo/szavak\n",
"[word for word in words if len(word) == max_length]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
"['self-replication.']"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# jo az elozo megoldas?\n",
"long_words4 = [word for word in words if len(word) == max_length - 4]\n",
"long_words4"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
"['Understanding',\n",
" 'institutions,',\n",
" 'Disentangling',\n",
" 'institutional',\n",
" 'intervention.',\n",
" 'traditionally',\n",
" 'developer\\xe2\\x80\\x99s',\n",
" 'demonstrating',\n",
" 'alternatives.',\n",
" 'Schools\\xe2\\x80\\x94not',\n",
" 'corresponding',\n",
" 'international',\n",
" 'international',\n",
" 'Institutional',\n",
" 'institutional',\n",
" 'expropriation',\n",
" 'institutional',\n",
" 'privatization',\n",
" 'independently',\n",
" 'institutional',\n",
" 'Institutional',\n",
" 'concentrated.',\n",
" 'concentrating',\n",
" 'institutional',\n",
" '\\xe2\\x80\\x9cAbrogating',\n",
" 'institutional']"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# ismetlodesek!\n",
"set(long_words4)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": [
"{'Disentangling',\n",
" 'Institutional',\n",
" 'Schools\\xe2\\x80\\x94not',\n",
" 'Understanding',\n",
" 'alternatives.',\n",
" 'concentrated.',\n",
" 'concentrating',\n",
" 'corresponding',\n",
" 'demonstrating',\n",
" 'developer\\xe2\\x80\\x99s',\n",
" 'expropriation',\n",
" 'independently',\n",
" 'institutional',\n",
" 'institutions,',\n",
" 'international',\n",
" 'intervention.',\n",
" 'privatization',\n",
" 'traditionally',\n",
" '\\xe2\\x80\\x9cAbrogating'}"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# leggyakoribb szavak\n",
"word_count = {}\n",
"for word in words:\n",
" if word in word_count:\n",
" previous_count = word_count[word]\n",
" else:\n",
" previous_count = 0\n",
" word_count[word] = previous_count + 1\n",
"\n",
"count_to_words = {}\n",
"\n",
"for word, count in word_count.items():\n",
" if count in count_to_words:\n",
" word_list = count_to_words[count]\n",
" else:\n",
" word_list = []\n",
" count_to_words[count] = word_list\n",
" word_list.append(word)\n",
"\n",
"highest_count = max(count_to_words)\n",
"print highest_count, count_to_words[highest_count]\n",
"high_counts = sorted(count_to_words, reverse=True)[:20]\n",
"print high_counts\n",
"for count in high_counts:\n",
" print count, count_to_words[count]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"129 ['the']\n",
"[129, 95, 74, 56, 51, 41, 35, 30, 28, 27, 26, 22, 20, 19, 17, 15, 14, 13, 12, 11]\n",
"129 ['the']\n",
"95 ['of']\n",
"74 ['to']\n",
"56 ['a']\n",
"51 ['is']\n",
"41 ['and']\n",
"35 ['are']\n",
"30 ['in']\n",
"28 ['that']\n",
"27 ['cities']\n",
"26 ['public']\n",
"22 ['private']\n",
"20 ['they']\n",
"19 ['for']\n",
"17 ['by', 'on']\n",
"15 ['would']\n",
"14 ['not', 'The', 'people']\n",
"13 ['economic', 'as']\n",
"12 ['more']\n",
"11 ['change']\n"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with open('data/private-cities.txt') as f:\n",
" text = f.read()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"'!' in text"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 16,
"text": [
"False"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"'?' in text"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 17,
"text": [
"True"
]
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"text.splitlines()[:5]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 18,
"text": [
"['Private Cities 101',\n",
" 'The 21st century will be the century of cities.',\n",
" 'Over the next 30 years, 1.8 billion people are expected to move to cities in developing countries. While some will add to existing cities, others will migrate to small towns, transforming them into the megapolises of tomorrow. Shenzhen, for example, was a small fishing village of 300,000 people in 1980. Since being designated a special economic zone that year, it has grown to over 10 million inhabitants.',\n",
" 'Understanding the best form of city governance will be crucial to ensuring that the emigrants lead good lives. However, even as economics has moved to focus on institutions, the literature on cities has focused instead on policy outcomes, rent control, zoning, and public transportation.',\n",
" 'The process of governance is important for two reasons. First, we cannot know what the ideal policy is. Constraints differ in time and place. Second, even with omniscient mayors knowing ideal policies, there is little reason to expect them to implement those ideal policies.']"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print lines[5]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"So rather than focusing on outcomes, we should focus on how to achieve those outcomes. What conditions are necessary to produce the optimal amount of public goods in a city? Asking what is the ideal level of police, street sweepers, and garbage men is just as absurd as asking, \"what is the ideal amount of shoe production?\" We simply don\u2019t know. Markets constantly adjust between supply and demand, seeking this ideal level.\n",
"\n"
]
}
],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sentences\n",
"def sentences(lines):\n",
" separations=(\n",
" ('. ', '.'),\n",
" ('? ', '?'),\n",
" )\n",
" for separator, tail in separations:\n",
" sentences = []\n",
" for line in lines:\n",
" sentences += split_by_separator(line, separator, tail)\n",
" lines = sentences\n",
" return lines\n",
"\n",
"def split_by_separator(line, separator, tail):\n",
" fragments = line.split(separator)\n",
" sentences = [\n",
" s + tail\n",
" for s in fragments[:-1]\n",
" ] + fragments[-1:]\n",
" return sentences\n",
" \n",
"sentences(lines)[:20]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 20,
"text": [
"['Private Cities 101\\n',\n",
" 'The 21st century will be the century of cities.\\n',\n",
" 'Over the next 30 years, 1.8 billion people are expected to move to cities in developing countries.',\n",
" 'While some will add to existing cities, others will migrate to small towns, transforming them into the megapolises of tomorrow.',\n",
" 'Shenzhen, for example, was a small fishing village of 300,000 people in 1980.',\n",
" 'Since being designated a special economic zone that year, it has grown to over 10 million inhabitants.\\n',\n",
" 'Understanding the best form of city governance will be crucial to ensuring that the emigrants lead good lives.',\n",
" 'However, even as economics has moved to focus on institutions, the literature on cities has focused instead on policy outcomes, rent control, zoning, and public transportation.\\n',\n",
" 'The process of governance is important for two reasons.',\n",
" 'First, we cannot know what the ideal policy is.',\n",
" 'Constraints differ in time and place.',\n",
" 'Second, even with omniscient mayors knowing ideal policies, there is little reason to expect them to implement those ideal policies.\\n',\n",
" 'So rather than focusing on outcomes, we should focus on how to achieve those outcomes.',\n",
" 'What conditions are necessary to produce the optimal amount of public goods in a city?',\n",
" 'Asking what is the ideal level of police, street sweepers, and garbage men is just as absurd as asking, \"what is the ideal amount of shoe production?\" We simply don\\xe2\\x80\\x99t know.',\n",
" 'Markets constantly adjust between supply and demand, seeking this ideal level.\\n',\n",
" 'Of course, cities are not like shoes.',\n",
" 'They are far more complex.',\n",
" 'Disentangling the marginal benefits of public transportation, the police, or garbage disposal is extremely difficult.',\n",
" 'Further, as cities are spatially oriented, the application of the laws of economics differs from how we usually think of economic goods.']"
]
}
],
"prompt_number": 20
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Homework"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Reformat the text, so that there are two spaces before sections, one after section start, and one between paragraphs.\n",
"- Report the number of sentences per section.\n",
"- Write sections into file named *{two-digit-section-number}-{section-name}.txt* e.g. *01-Private-Cities-101.txt*"
]
}
],
"metadata": {}
}
]
}
# coding: utf-8
# In[1]:
# Show the current working directory and list its contents.
import os
print os.getcwd()
os.listdir('.')
# In[2]:
# Read the whole text as a list of lines; readlines() keeps each trailing newline.
with open('data/private-cities.txt') as f:
lines = f.readlines()
# In[3]:
# headlines, attempt 1: treat lines shorter than 80 characters as headlines
# (the length includes the trailing newline)
[line for line in lines if len(line) < 80]
# In[4]:
# headlines, attempt 2: treat lines that contain no period as headlines
[line for line in lines if '.' not in line]
# In[5]:
# another way to read line by line: iterate over the file object itself
with open('data/private-cities.txt') as f:
sections = [line for line in f if len(line) < 80]
# In[6]:
sections
# In[7]:
# number of characters
# NOTE(review): under Python 2 read() returns a byte string, so this is
# presumably a byte count rather than a character count - confirm.
with open('data/private-cities.txt') as f:
print len(f.read())
# In[8]:
# number of lines
with open('data/private-cities.txt') as f:
print len(f.readlines())
# In[9]:
# number of words (split() with no argument splits on runs of whitespace)
with open('data/private-cities.txt') as f:
words = f.read().split()
print len(words)
# In[10]:
# length of the longest word
max_length = max(len(w) for w in words)
print max_length
# In[11]:
# the longest word(s)
[word for word in words if len(word) == max_length]
# In[12]:
# is the previous solution good?
# Look at the words that are four characters shorter than the longest one;
# words produced by split() still carry any attached punctuation.
long_words4 = [word for word in words if len(word) == max_length - 4]
long_words4
# In[13]:
# repetitions! - a set keeps each distinct word only once
set(long_words4)
# In[14]:
# most frequent words
# Pass 1: count how many times each word occurs.
word_count = {}
for word in words:
if word in word_count:
previous_count = word_count[word]
else:
previous_count = 0
word_count[word] = previous_count + 1
# Pass 2: invert the mapping into count -> list of words having that count.
count_to_words = {}
for word, count in word_count.items():
if count in count_to_words:
word_list = count_to_words[count]
else:
word_list = []
count_to_words[count] = word_list
word_list.append(word)
# max() over a dict iterates its keys, so this is the largest count.
highest_count = max(count_to_words)
print highest_count, count_to_words[highest_count]
# The (at most) 20 largest distinct counts, in descending order.
high_counts = sorted(count_to_words, reverse=True)[:20]
print high_counts
for count in high_counts:
print count, count_to_words[count]
# In[15]:
# Read the whole file into a single string.
with open('data/private-cities.txt') as f:
text = f.read()
# In[16]:
# Does the text contain an exclamation mark?
'!' in text
# In[17]:
# Does the text contain a question mark?
'?' in text
# In[18]:
# First five lines; splitlines() drops the line endings.
text.splitlines()[:5]
# In[19]:
# Print a single line (index 5) of the list read in cell 2.
print lines[5]
# In[20]:
# sentences
def sentences(lines, separations=(('. ', '.'), ('? ', '?'))):
    """Split every line into sentence-like pieces.

    The lines are split on each separator in turn: the output of one pass is
    the input of the next, so a line is first cut at '. ' and the resulting
    pieces are then cut at '? '.

    Parameters:
        lines: iterable of strings. Nothing is stripped, so a trailing
            newline stays attached to the last piece of its line.
        separations: sequence of (separator, tail) pairs. `separator` is the
            text to split on and `tail` is what gets appended back to every
            piece that was followed by the separator. The default reproduces
            the original hard-coded table.

    Returns:
        A list of strings, in the original text order.

    Only the literal separators are recognised, so a sentence end that is
    followed by a closing quote (e.g. '?" ') is not a split point.
    """
    for separator, tail in separations:
        # Collect into a fresh list on every pass; the local is deliberately
        # not called `sentences` so it does not shadow this function.
        pieces = []
        for line in lines:
            pieces.extend(split_by_separator(line, separator, tail))
        lines = pieces
    return lines
def split_by_separator(line, separator, tail):
    """Split `line` at every `separator`, keeping `tail` on each cut piece.

    Parameters:
        line: the string to split.
        separator: the text to split on (e.g. '. ').
        tail: text appended to every piece that was followed by `separator`
            (e.g. '.'), so the sentence-ending punctuation is not lost.

    Returns:
        A list of strings. A line without the separator is returned as a
        one-element list containing the line unchanged.
    """
    fragments = line.split(separator)
    # The last fragment is whatever follows the final separator; it was not
    # followed by a separator itself, so it gets no tail.
    remainder = fragments.pop()
    pieces = [fragment + tail for fragment in fragments]
    # When the line ends with the separator the remainder is empty; emitting
    # it would add a bogus empty "sentence". Keep it only when it has content
    # or when nothing was split off at all (input returned as-is).
    if remainder or not pieces:
        pieces.append(remainder)
    return pieces
# Preview the first 20 pieces produced from the lines read in cell 2.
sentences(lines)[:20]
## Homework
# - Reformat the text, so that there are two spaces before sections, one after section start, and one between paragraphs.
#   NOTE(review): "spaces" presumably means blank lines here - confirm.
# - Report the number of sentences per section.
# - Write sections into files named *{two-digit-section-number}-{section-name}.txt* e.g. *01-Private-Cities-101.txt*
tools
ipython
- live environment
- completion on names
- help on names
ipython notebook
- ipython in browser
- later in course
http://pythontutor.com/visualize.html#mode=edit
dict (map, mapping)
key -> value
- keys must be immutable (hashable) - tuples work as keys, lists do not
- value - anything
- create
- empty: {}
- {'a': 1, 1: 'a'}
- dict(a=2, b=4)
- dict(
(
(1, 2),
(3, 4),
(5, 6)
)
)
- value access - normal indexing:
d[key]
- setting value:
d[key] = value
- deleting value:
del d[key]
- key existence check:
key in d
- get with default value:
d.get(key, default_if_key_unknown)
- all keys:
d.keys()
- iterate over keys:
for key in d:
d[key]
# keys are NOT ordered:
for key in dict(a=1, b=2, c=3, d=4): print key
- can be thought of as a finite function
set
- elements must be immutable (hashable)!
.add
.union
.difference
.intersection
set vs dict
sorting:
- in place
list.sort()
- new list
sorted(iterable)
File IO
reading:
f.read()
f.readlines()
for line in f:
...
writing:
f.write(what)
f.flush()
Predefined files
sys.stdin
sys.stdout
sys.stderr
FileSystem
os.listdir(dir) -> filenames
open(filename, mode) -> file
file.read() -> text
file.readlines() -> [lines]
file.write(what)
file.close()
file.close()
with open() as f:
f.read()
with open() as f:
f.write()
os.remove()
shutil.rmtree()
??? where to get filenames ???
- scripts: embedded in the source
- tools
- from command line
sys.argv
- known configuration file name
from random import shuffle
# Two-character sequences that convert_text_to_list keeps together as one
# list item (presumably the Hungarian digraph letters - confirm). They are
# compared against the lower-cased text, so matching is case-insensitive.
DOUBLE = (u'cs', u'dz', u'gy', u'ly', u'ny', u'sz', u'ty', u'zs')
# Three-character sequence kept together as one list item; it is checked
# before DOUBLE so that u'dzs' is not split into u'dz' + u's'.
TRIPLE = (u'dzs',)
def typoglicemia(text):
    """Return `text` with every word scrambled by word_typoglicemia.

    The text is cut into words by split_to_words and the scrambled words are
    joined back together with single spaces.
    """
    scrambled = []
    for word in split_to_words(text):
        scrambled.append(word_typoglicemia(word))
    return u' '.join(scrambled)
def word_typoglicemia(word):
    """Scramble one word and return it as a single string.

    The word is broken into letter units by convert_text_to_list, the units
    are rearranged by list_typoglicemia, and the result is glued back
    together without any separator.
    """
    return u''.join(list_typoglicemia(convert_text_to_list(word)))
def list_typoglicemia(characters):
    """Shuffle the interior items of a list, keeping the first and last fixed.

    Lists with fewer than four items are returned unchanged (the very same
    object). Longer lists yield a new list; the input is not modified.
    """
    # Shorter lists need not be reshuffled.
    if len(characters) < 4:
        return characters
    first, last = characters[0], characters[-1]
    interior = characters[1:-1]
    # shuffle() reorders its argument in place and returns nothing, so it
    # has to be applied to the sliced copy before building the result.
    shuffle(interior)
    return [first] + interior + [last]
def convert_text_to_list(text):
    """Break `text` into a list of letter units.

    A unit is a three-character sequence listed in TRIPLE, otherwise a
    two-character sequence listed in DOUBLE, otherwise a single character.
    Matching is done on the lower-cased text, but the returned units keep
    their original case. An empty text gives an empty list.

    The scan is iterative: the earlier recursive form used one call frame
    per unit and re-sliced the remaining text on every step, which is
    quadratic and fails with a recursion error on long inputs.
    """
    units = []
    position = 0
    end = len(text)
    while position < end:
        # important: try triple before double before single
        # so that 'dzs' does not become 'dz' + 's', or 'dz' to 'd' + 'z'
        if text[position:position + 3].lower() in TRIPLE:
            width = 3
        elif text[position:position + 2].lower() in DOUBLE:
            width = 2
        else:
            width = 1
        units.append(text[position:position + width])
        position += width
    return units
def split_to_words(text):
    """Return the whitespace-separated words of `text` as a list."""
    words = text.split()
    return words
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment