Python Snippets
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# converts csv to libsvm format
# usage: python convert_to_libsvm.py (after modifying the input and output filenames below)
import codecs
# change input filename here
filename = "features.csv"
fh = codecs.open(filename, "r", "utf-8")
lines = fh.readlines()
fh.close()
# change output filename here
out = "features.libsvm"
fh = codecs.open(out, "w", "utf-8")
for l in lines:
    l = l.split(";")
    lineout = l[0] + " "
    values = l[1:-2]
    for i in range(len(values)):
        # libsvm feature indices start at 1, not at 0
        lineout += str(i + 1) + ":" + values[i] + " "
    lineout += "\n"
    fh.write(lineout)
fh.close()
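# A worked example of the conversion (hedged guess at the intended CSV layout: the class
# label sits in the first column, and the l[1:-2] slice drops the last two split fields):
#   input line:  "1;0.5;0.3;unused;\n"  -> fields ['1', '0.5', '0.3', 'unused', '\n']
#   output line: "1 1:0.5 2:0.3 \n"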
#download the free audio trainer German / French from Deutsche Welle
import urllib
for i in range(1, 10):
    urllib.urlretrieve('http://radio-download.dw.de/Events/dwelle/deutschkurse/audiotrainer/fra/Audiotrainer_Franzoesisch_Lektion00' + str(i) + '_dwdownload.mp3', '/home/me/filename_' + str(i) + '.mp3')
for i in range(10, 100):
    urllib.urlretrieve('http://radio-download.dw.de/Events/dwelle/deutschkurse/audiotrainer/fra/Audiotrainer_Franzoesisch_Lektion0' + str(i) + '_dwdownload.mp3', '/home/me/filename_' + str(i) + '.mp3')
urllib.urlretrieve('http://radio-download.dw.de/Events/dwelle/deutschkurse/audiotrainer/fra/Audiotrainer_Franzoesisch_Lektion100_dwdownload.mp3', '/home/me/filename_100.mp3')
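# The three calls above only differ in how the lesson number is zero-padded; a single
# loop with str.zfill produces the same URLs (sketch, Python 2 like the rest of the gist):
for i in range(1, 101):
    lesson = str(i).zfill(3)  # 001 ... 100
    url = ('http://radio-download.dw.de/Events/dwelle/deutschkurse/audiotrainer/fra/'
           'Audiotrainer_Franzoesisch_Lektion' + lesson + '_dwdownload.mp3')
    urllib.urlretrieve(url, '/home/me/filename_' + str(i) + '.mp3')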
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import urllib
# performs a Google search for the title and returns the first URLs of the query result
def search_web(title):
    words = " ".join(title.split())  # collapse whitespace; urlencode escapes it for the URL
    query = urllib.urlencode({'q': words})
    # the Google AJAX API only returns the first few results, but that is enough for our purpose
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json_response = json.loads(response)
    if json_response and json_response['responseData']:
        results = json_response['responseData']['results']
    else:
        return []
    urls = [result['url'] for result in results]
    print('Found ' + str(len(urls)) + ' URLs.')
    return urls
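# Minimal usage sketch (the query string is just a placeholder; note that this old AJAX
# endpoint may no longer answer):
if __name__ == '__main__':
    for url in search_web('some article title'):
        print url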
{'Estonian': 'et', 'Bicol': 'bcl', 'Zande': 'zne', 'Uruund': 'rnd', 'Khoekhoegowab': 'naq', u'Newari': 'new', 'Krio': 'kri', 'Nuer': 'nus', 'Cinyanja': 'nya', u'Bulgarian': 'bg', 'Norwegian': 'no', u'Yoruba': 'yo', u'French': 'fr', u'Otomi': 'oto', 'Ateso': 'teo', 'Iloko': 'ilo', 'Wolaita': 'wal', 'Tsonga': 'ts', 'Tzotzil': 'tzo', u'Tamil': 'ta', u'Haitian': 'ht', 'Samoan': 'sm', 'Aukan': 'djk', 'Finnish': 'fi', 'Rutoro': 'ttj', 'Dangme': 'ada', 'Albanian': 'sq', 'Mbunda': 'mck', 'Solomon': 'pis', 'Hiligaynon': 'hil', 'Tagalog': 'tl', u'Serbian': 'sr_latn', u'Efik': 'efi', 'Pangasinan': 'pag', 'Italian': 'it', 'Miskito': 'miq', 'Lhukonzo': 'koo', 'Lamba': 'lam', u'Kongo': 'kg', 'Mazatec': 'mau', u'Tarascan': 'tsz', u'Amharic': 'am', u'Czech': 'cs', u'Papiamento': 'pap', u'Nahuatl': 'ncj', 'Ga': 'gaa', 'Polish': 'pl', 'Tongan': 'to', 'Xhosa': 'xh', 'Swedish': 'sv', u'Marathi': 'mr', 'Luganda': 'lg', u'Slovenian': 'sl', 'Ewe': 'ee', u'Azerbaijani': 'az_cyrl', u'Kikuyu': 'ki', 'Luo': 'luo', 'Tankarana': 'xmv', 'Danish': 'da', 'Indonesian': 'id', 'Frafra': 'gur', 'Zulu': 'zu', 'Lenje': 'leh', 'Cakchiquel': 'cak', u'Georgian': 'ka', 'Mayangna': 'yan', 'Tetum': 'tdt', u'Tigrinya': 'ti', 'Nzema': 'nzi', 'Niuean': 'niu', u'Slovak': 'sk', u'Thai': 'th', 'Afrikaans': 'af', u'Lahu': 'lhu', u'Guarani': 'gug', 'Sidama': 'sid', u'Punjabi': 'pa', 'Kalenjin': 'kln', 'Herero': 'hz', u'Kekchi': 'kek', 'Kisonge': 'sop', u'Latvian': 'lv', 'English': 'en', 'Mambwe-Lungu': 'mgr', 'Lingala': 'ln', u'Faeroese': 'fo', u'Chinese': 'zh_hant', 'Wayuunaiki': 'guc', 'Quichua': 'qus', 'Huave': 'huv', u'Tatar': 'tt', 'Kabyle': 'kab', 'Chin': 'cnh', u'Quiche': 'quc', 'Rapa': 'rap', 'Venda': 've', 'Tojolabal': 'toj', 'Swahili': 'sw', u'Icelandic': 'is', u'Turkish': 'tr', 'Kalanga': 'kck', 'Twi': 'tw', 'Waray-Waray': 'war', u'Kirghiz': 'ky', 'Guna': 'cuk', u'Gujarati': 'gu', u'Hindi': 'hi', 'Zapotec': 'zpg', u'Korean': 'ko', 'Malagasy': 'mg', 'Hungarian': 'hu', 'Igbo': 'ig', u'Lithuanian': 'lt', 'Greenlandic': 'kl', 'Tzeltal': 'tzh', 'Acholi': 'ach', u'Russian': 'ru', 'Romany': 'rmn', 'Croatian': 'hr', u'Kazakh': 'kk_cyrl', 'Tiv': 'tiv', 'Cebuano': 'ceb', u'Armenian': 'hy_armn', 'Sarnami': 'hns', 'Kikamba': 'kam', 'Toba': 'tob', 'Chol': 'ctu', 'Luvale': 'lue', 'Sepedi': 'nso', 'Mixe': 'mco', u'Greek': 'el', 'Sesotho': 'st', 'Hausa': 'ha', 'Isoko': 'iso', 'Irish': 'ga', 'Seychelles': 'crs', 'German': 'de', 'Runyankore': 'nyn', 'Kwanyama': 'kj', u'Macedonian': 'mk', u'Mongolian': 'mn', 'Aymara': 'ay', u'Mapudungun': 'arn', u'Sinhala': 'si', 'Ndonga': 'ng', u'Vietnamese': 'vi', u'Romanian': 'ro', 'Shona': 'sn', 'Dutch': 'nl', 'Swati': 'ss', 'Somali': 'so', 'Garifuna': 'cab', u'Nepali': 'ne', 'Tokelauan': 'tkl', 'Maya': 'yua', u'Ukrainian': 'uk', 'Welsh': 'cy', u'Mauritian': 'mfe', u'Mayo': 'mfy', 'Kisi': 'kiz', 'Tahitian': 'ty', u'Baoule': 'bci', u'Pilag\xe1': 'plg', 'Rarotongan': 'rar', 'Maltese': 'mt', 'Mam': 'mam', u'Cambodian': 'km', u'Kurdish': 'kmr_cyrl', u'Spanish': 'es', 'Tswana': 'tn', 'Kikaonde': 'kqn', 'Sango': 'sg', 'Oromo': 'om', u'Portuguese': 'pt', u'Huastec': 'hus', u'Myanmar': 'mya', u'Saramaccan': 'srm', 'Sranantongo': 'srn', 'Kiluba': 'lub', u'Japanese': 'ja', 'Kinyarwanda': 'rw', 'Lugbara': 'lgg', 'Ndebele': 'nr', 'Quechua': 'que', 'Kwangali': 'kwn', u'Tajiki': 'tg', u'Ossetian': 'os'}
# loads the HTML content of a URL, extracts the text and returns it as a unicode string
import urllib2
from bs4 import BeautifulSoup
import nltk

def get_text_from_url(url):
    opener = urllib2.build_opener()
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:19.0) Gecko/20100101 Firefox/19.0')
    html = opener.open(request).read()
    # pass the raw HTML to BeautifulSoup first: it detects the character encoding of the website and decodes to Unicode
    soup = BeautifulSoup(html)
    html_unicode = unicode(soup)
    text_unicode = nltk.clean_html(html_unicode)
    return text_unicode
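# nltk.clean_html() was removed in NLTK 3; a rough equivalent that lets BeautifulSoup do
# the tag stripping itself (the function name is illustrative, request setup copied from above):
def get_text_from_url_bs4(url):
    opener = urllib2.build_opener()
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:19.0) Gecko/20100101 Firefox/19.0')
    html = opener.open(request).read()
    soup = BeautifulSoup(html)
    return soup.get_text()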
#! /usr/bin/python
# -*- coding: utf-8 -*-
import sys
import time
import string
import logging
import urllib
import requests
import codecs
from bs4 import BeautifulSoup
# schema of URLs:
# http://www.jw.org/de/publikationen/bibel/nwt/bibelbuecher/prediger/4/#v21004007
# replace this dict and adapt baseurl to receive texts in another language
biblebooks = {"1. Korinther": 46, "Habakuk": 35, "Epheser": 49, "2. Timotheus": 55, "Haggai": 37, "1. Samuel": 9, "Johannes": 43, "Jona": 32, "Daniel": 27, "Zephanja": 36, "1. Petrus": 60, "2. Chronika": 14, "Ruth": 8, "Judas": 65, "1. Mose": 1, "Esther": 17, "Jakobus": 59, "Maleachi": 39, "1. Johannes": 62, "Klagelieder": 25, "2. Mose": 2, "Kolosser": 51, "2. Korinther": 47, "1. Könige": 11, "Prediger": 21, "Micha": 33, "Philipper": 50, "Galater": 48, "Josua": 6, "Markus": 41, "Joel": 29, "Lukas": 42, "Hohes Lied": 22, "Jeremia": 24, "Hosea": 28, "Hiob": 18, "1. Timotheus": 54, "Psalm": 19, "2. Thessalonicher": 53, "Nehemia": 16, "5. Mose": 5, "Amos": 30, "Obadja": 31, "Apostelgeschichte": 44, "1. Chronika": 13, "Richter": 7, "4. Mose": 4, "Nahum": 34, "Matthäus": 40, "Römer": 45, "Sprüche": 20, "3. Johannes": 64, "Jesaja": 23, "Hesekiel": 26, "Hebräer": 58, "Sacharja": 38, "Titus": 56, "Philemon": 57, "Esra": 15, "Offenbarung": 66, "2. Könige": 12, "3. Mose": 3, "2. Johannes": 63, "2. Samuel": 10, "2. Petrus": 61, "1. Thessalonicher": 52}
one_chapter_books = ['Obadja', 'Philemon', 'Judas', '2. Johannes', '3. Johannes']
# convert the dict's byte-string keys to unicode
for k in biblebooks.keys():
    biblebooks[unicode(k.decode('utf-8'))] = biblebooks.pop(k)
# for German bible texts only
baseurl = 'http://www.jw.org/de/publikationen/bibel/nwt/bibelbuecher/'
"""Takes as input the text's "address" as string or unicode and returns the text's content as unicode, downloaded from
the German New World Translation published by JW on jw.org.
Example call: nwt_textfinder.get_bible_text_german(u'Sprüche 18:10')
Returns: u'10\xa0\xa0Der Name Jehovas ist ein starker Turm.+ Der Gerechte l\xe4uft hinein und wird besch\xfctzt.*+\n\n'
Also downloads multiple texts like "1. Mose 1:1, 2" or "1. Mose 1:1-3"
"""
def get_bible_text_german(text):
text = unicode(text.strip())
text = text.replace(u'\xa0', u' ')
while not text[-1].isdigit():
text = text[:-1]
logging.debug('Now starting to look up this text: %s' %text)
if ':' in text:
book_chapter, verse = text.split(':', 1)
try:
book, chapter = book_chapter.rsplit(' ', 1)
except ValueError:
logging.error('No valid bible text - there must be a space between book and chapter: %s' %text)
return ''
else:
book_found = False
for b in one_chapter_books:
if b in text:
logging.warning('This is a text from a book with one chapter: %s' %text)
try:
book, verse = text.rsplit(' ', 1)
except ValueError:
logging.error('No valid bible text - there must be a space between book and verse: %s' %text)
chapter = '1'
book_found = True
if not book_found:
logging.error('No valid text! There must be a colon between the chapter and the verse: %s' %text)
return ''
logging.debug('Book: %s' %book)
logging.debug('Chapter: %s' %chapter)
logging.debug('Verse: %s' %verse)
if '-' in verse:
start_verse, end_verse = verse.split('-')
verselist = range(int(start_verse.strip()), int(end_verse.strip())+1)
verselist = [str(v) for v in verselist]
elif ',' in verse:
a, b = verse.split(',')
verselist = [a.strip(), b.strip()]
else:
verselist = [verse.replace(';','').replace(',','')]
verselist = [v for v in verselist if v]
logging.debug('Will now look up these verse(s): %s' %str(verselist))
book = book.replace(u'\xa0', ' ')
texturl = book.replace('. ', '-') + '/' + chapter + '/'
r = requests.get(baseurl + texturl)
soup = BeautifulSoup(r.content)
try:
bookid = str(biblebooks[book])
except KeyError as err:
logging.error('Book %s was not found! %s' %(book, err))
result = ''
for v in verselist:
textid = 'v' + bookid + (3 - len(chapter)) * '0' + chapter + (3 - len(v)) * '0' + v
logging.debug('ID of the texts element: %s' %textid)
souptext = soup.find(id=textid)
if souptext:
result += souptext.text + '\n'
else:
logging.error('No text found for this verse: %s' %v)
return result
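# Hedged usage sketch, matching the example call in the docstring (the returned text of
# course depends on what jw.org serves at request time):
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    print get_bible_text_german(u'Sprüche 18:10').encode('utf-8')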
# various Python code snippets
#!/usr/bin/env python
# scrape Wikipedia table and export as .csv file
import urllib, urllib2
import unicodecsv as csv
import json
import codecs
from BeautifulSoup import BeautifulSoup
def main(url, table_names):
    all_data = list()
    fh = urllib.urlopen(url)
    content = fh.read()
    soup = BeautifulSoup(content)
    tables = soup.findAll('table', 'wikitable')  # excludes vertical navigation box with article summary
    for table_no, table in enumerate(tables[:-1]):
        if table_no == 0:
            heads = [th.text for th in table.findAll('th')]
            all_data.append(heads + [u'Continent'])
        for row in table.findAll('tr'):
            row_data = []
            if row.findAll('th'):
                continue
            for table_data in row.findAll('td'):
                if table_data.a:
                    row_data.append(table_data.a.text)
                else:
                    row_data.append(table_data.text)
            row_data.append(table_names[table_no])
            all_data.append(row_data)
    # csv module cannot handle unicode (shame on you)
    all_data_utf8 = []
    for line in all_data:
        all_data_utf8.append([unicode(s).encode('utf-8') for s in line])
    out = open('all_data.csv', 'w')
    writer = csv.writer(out, dialect='excel', encoding='utf-8')
    for row in all_data_utf8:
        print row
        writer.writerow(row)
    out.close()

if __name__ == '__main__':
    url = r"https://en.wikipedia.org/wiki/Jehovah's_Witnesses_by_country"
    # TODO
    table_names = [u'Africa', u'North America', u'Caribbean', u'South America', u'Asia', u'Europe', u'Oceania', u'Other']
    main(url, table_names)
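# Hedged note: this snippet imports the old BeautifulSoup 3 package; with bs4 (used in the
# other snippets above) the equivalent would be
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup(content, 'html.parser')
# and findAll() is spelled find_all(), although the camel-case alias still works.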
'''Send .mobi files to the Kindle email address with Python's email and smtplib packages.'''
import os
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
from email.utils import formatdate

# smtpserver, port, sender_email, sender_password and kindles_email are expected to be
# defined at module level (placeholder values follow below)
def send2kindle(f):
    msg = MIMEMultipart()
    msg['Subject'] = "Neue Zeitschriftenlieferung!"  # "New magazine delivery!"
    msg['From'] = sender_email  # assumption: the sender address was left blank in the original
    msg['To'] = sender_email
    msg['Date'] = formatdate(localtime=True)
    message = "Neue Zeitschriften im Anhang"  # "New magazines attached"
    msg.attach(MIMEText(message))
    part = MIMEApplication(open(f, 'rb').read(), _subtype='x-mobipocket-ebook')
    part.add_header('Content-Disposition', 'attachment; filename="%s"' % os.path.basename(f))
    msg.attach(part)
    # connect to Google's SMTP server
    mailserver = smtplib.SMTP(smtpserver, port)
    mailserver.ehlo()
    mailserver.starttls()
    mailserver.ehlo()
    mailserver.login(sender_email, sender_password)
    mailserver.sendmail(sender_email, kindles_email, msg.as_string())
    mailserver.close()
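# Hedged usage sketch: the module-level settings the function relies on are not part of
# the gist, so these are placeholder values (Gmail shown only as an example host):
smtpserver = 'smtp.gmail.com'
port = 587
sender_email = 'me@example.com'
sender_password = 'my-app-password'
kindles_email = 'me_123@kindle.com'
send2kindle('/path/to/magazine.mobi')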
# choose a key from a dictionary at random - the probability of a key being chosen is proportional to its value
# d is a dictionary mapping keys to counts
import random

def simpleProbDist(d):
    i = 1
    d2 = dict()
    for k in d.keys():
        r = (i, i + d[k])
        i += d[k]
        d2[k] = r
    x = random.randint(1, i - 1)
    for k in d2.keys():
        if x in range(*d2[k]):
            chosen = k
            break
    return chosen
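# Quick sanity check (illustration only): with counts {'a': 1, 'b': 3}, 'b' should be
# chosen roughly three times out of four.
counts = {'a': 1, 'b': 3}
samples = [simpleProbDist(counts) for _ in range(1000)]
print samples.count('b') / float(len(samples))  # ~0.75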
# remove punctuation (Python 2: string.maketrans plus the deletechars argument of str.translate)
import string
table = string.maketrans("", "")

def rem_punct(s):
    return s.translate(table, string.punctuation)
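# Sanity check plus a hedged note: the maketrans/translate combination above is Python 2
# only; in Python 3 the rough equivalent is str.maketrans('', '', string.punctuation).
print rem_punct("Hello, world!")  # -> "Hello world"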
# lemmatize nouns and verbs with the NodeBox Linguistics library and WordNet
# perform POS tagging beforehand to improve precision
import en  # NodeBox Linguistics English module

def lemmatize(words):
    lemmas = []
    for w in words:
        if en.is_verb(w):
            lemmas.append(en.verb.infinitive(w.lower()))
        elif en.is_noun(w):
            lemmas.append(en.noun.singular(w.lower()))
        else:
            lemmas.append(w.lower())
    return lemmas
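# Illustrative call (hedged: needs the NodeBox 'en' module on the path, and the exact
# output depends on its word lists):
print lemmatize(['cats', 'running', 'quickly'])  # something like ['cat', 'run', 'quickly']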
# hopefully I won't forget them anymore now that I have written them down here :-)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# current timestamp as a string
import datetime
str(datetime.datetime.now())
if __name__ == '__main__':
# print to stderr (Python 2 syntax only, not Python 3)
import sys
try:
    pass  # whatever I/O goes here
except IOError as e:
    print >> sys.stderr, "IOError: %s" % e.strerror
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
# execute a bash command
import subprocess
subprocess.Popen(['/bin/bash', '-c', cmd])
# get the output of a bash command in Python
output = subprocess.check_output(['/bin/bash', '-c', cmd])
# also capture stderr
output = subprocess.check_output('ls', stderr=subprocess.STDOUT)
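# Concrete example of the pattern above (hedged: 'ls -l /tmp' is just a stand-in command):
cmd = 'ls -l /tmp'
print subprocess.check_output(['/bin/bash', '-c', cmd])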
# Pylab / Numpy etc.
import numpy as np
import matplotlib.pyplot as plt
data = np.genfromtxt('all_absolute.csv', delimiter=';')
data = np.array([[1, 2, 3], [4, 5, 6]])
plt.imshow(data, interpolation='nearest', cmap=plt.cm.bone)
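# Hedged reminder: in a plain (non-interactive) script the figure only appears after
plt.show()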
import wikipedia  # https://github.com/goldsmith/Wikipedia
import urllib
import re

# Illustrative wrapper (the function name and the default baseurl are assumptions):
# resolves a Wikipedia page by title and, on a disambiguation error, falls back to the
# pageid scraped from the page's HTML.
def get_wikipedia_page(page_title, page_link, baseurl='https://en.wikipedia.org/wiki/'):
    msg = 'Error!'
    try:
        p = wikipedia.page(page_title)
    # if you only know the page title and URL but not the pageid, you have to
    # find it in the HTML code of the Wikipedia page before you can use
    # the API functions
    except wikipedia.DisambiguationError as err:
        print err
        fh = urllib.urlopen(baseurl + page_link)
        if fh:
            html = fh.read()
            r = re.search(r'"wgArticleId":(\d+),', html)
            groups = r.groups()
            if groups:
                pageid = groups[0]
                p = wikipedia.page(pageid=pageid)
            else:
                print msg
                return
        else:
            print msg
            return
    return p
#!/usr/bin/python
import codecs
import string
import re
import logging
import nltk
import gensim
import en

# Before starting the script, install Cython with `pip install cython` to use the optimized word2vec training (70x speedup).
# Creates and saves a word2vec model from a big (several MB) file of raw text.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

table = string.maketrans("", "")

def remove_punct(s):
    return s.translate(table, string.punctuation)

def lemmatize(words):
    lemmas = []
    for w in words:
        if en.is_verb(w):
            lemmas.append(en.verb.infinitive(w.lower()))
        elif en.is_noun(w):
            lemmas.append(en.noun.singular(w.lower()))
        else:
            lemmas.append(w.lower())
    return lemmas

class WordPreparer(object):
    """Iterates over the sentences of a raw text file, punctuation-stripped and lemmatized."""
    def __init__(self, fname):
        self.fname = fname
        fh = codecs.open(fname, mode='r', encoding='utf-8')
        text = fh.read()
        self.sents = nltk.sent_tokenize(text)
        self.sents = [remove_punct(s.encode('utf-8')) for s in self.sents]

    def __iter__(self):
        for s in self.sents:
            yield lemmatize(remove_punct(s).split())

if __name__ == '__main__':
    words = WordPreparer('raw_text.txt')
    model = gensim.models.Word2Vec(words, min_count=3, workers=2)
    model.save('word2vec_model')
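    # Hedged follow-up: reload the saved model and query it (whether a given word is in
    # the vocabulary depends entirely on raw_text.txt, so 'house' is only an illustration):
    model = gensim.models.Word2Vec.load('word2vec_model')
    print model.most_similar('house', topn=5)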