Skip to content

Instantly share code, notes, and snippets.

@bernikr
Last active April 12, 2018 11:45
Show Gist options
  • Save bernikr/02340d43bb0a6ab495818e9b961961cd to your computer and use it in GitHub Desktop.
*.csv
*.pyc
.Rhistory
.RData
import requests
from bs4 import BeautifulSoup
import umlaut
class CitybikeAccount:
    """Session-backed scraper for a Citybike Wien user account.

    Logs in once on construction and keeps the session cookie in
    ``self.s`` for the subsequent page requests.
    """

    def __init__(self, username, password):
        """Log into citybikewien.at with the given credentials.

        Exits the process (``exit()``) if the login is rejected.
        On success, ``self.username`` holds the display name scraped
        from the logged-in page.
        """
        login_data = {"username": username, "password": password}
        # start a request session to store the login cookie
        self.s = requests.Session()
        # get the hidden login fields (form tokens) needed to login
        frontpage = self.s.get("https://www.citybikewien.at/de")
        fp = BeautifulSoup(frontpage.content, 'html.parser')
        login = fp.find('form', id='mloginfrm')
        hiddeninputs = login.find_all('input', type='hidden')
        for i in hiddeninputs:
            login_data[i['name']] = i['value']
        # login to the site and save the cookie to the session
        login_url = "https://www.citybikewien.at/de/component/users/?task=user.login&Itemid=101"
        logedin = self.s.post(login_url, data=login_data)
        soup = BeautifulSoup(logedin.content, 'html.parser')
        user_name = soup.select(".user-name-data")
        if len(user_name) < 1:
            print("invalid login")
            exit()
        # NOTE(review): index 1 assumes the page renders the name element
        # at least twice — confirm against the live markup. [:-1] drops a
        # trailing character from the scraped text.
        self.username = user_name[1].get_text()[:-1]

    def get_page_count(self):
        """Return the number of ride-listing pages (5 rides per page)."""
        # get the number of existing rows from the website
        page = self.s.get("https://www.citybikewien.at/en/my-rides")
        soup = BeautifulSoup(page.content, 'html.parser')
        tab = soup.select('#content div + p')[0]
        # the paragraph starts with the total number of rides
        line_num = int(tab.get_text().split(' ')[0])
        # use floor division: plain `/` yields a float on Python 3,
        # which breaks range() in callers (identical result on Python 2)
        return line_num // 5 + 1

    def load_page(self, i):
        """Scrape page *i* (1-based) of the ride history table.

        Returns a list of rows, each a list of normalized strings:
        [date, start_station, start_time, end_station, end_time,
        price, elevation].
        """
        data_url = "https://www.citybikewien.at/de/meine-fahrten?start=" + str((i - 1) * 5)
        page = self.s.get(data_url)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.select('#content table tbody')[0]
        rows = []
        for row in table.find_all('tr'):
            output_row = []
            # go through every cell in a row
            for cell in row.find_all('td'):
                # check if it is a 'normal' cell with only one data field
                children = cell.findChildren()
                if len(children) <= 1:
                    output_row.append(cell.get_text())
                else:
                    # if it contains a location and a date split it into two
                    output_row.append(children[0].get_text())
                    output_row.append(children[1].get_text() + ' ' + children[2].get_text())
            # skip rows without data cells (e.g. a stray header row) —
            # the original would raise IndexError below on such rows
            if len(output_row) < 7:
                continue
            # cut off the Euro sign from the price and the 'm' from the elevation
            output_row[5] = output_row[5][2:]
            output_row[6] = output_row[6][:-2]
            # remove newlines and replace umlauts
            output_row = [umlaut.normalize(t.replace('\n', ' ').strip()) for t in output_row]
            rows.append(output_row)
        return rows
# Plot the cumulative elevation climbed over time from the exported rides.
library(ggplot2)

rides <- read.csv("rides.csv")[, c('start_time', 'end_time', 'elevation')]
rides$start_time <- as.POSIXct(rides$start_time, format = "%d.%m.%Y %H:%M")
rides$end_time <- as.POSIXct(rides$end_time, format = "%d.%m.%Y %H:%M")
# optionally restrict the plot to a single day:
#rides <- subset(rides, start_time > as.POSIXct("2017-03-28"))
#rides <- subset(rides, end_time < as.POSIXct("2017-03-29"))

# every ride contributes 0 m at its start time and its climbed metres at its end
departures <- data.frame(time = rides$start_time,
                         elevation = rep(0, length(rides$start_time)))
arrivals <- data.frame(time = rides$end_time, elevation = rides$elevation)
events <- rbind(departures, arrivals)
events <- events[order(events$time), ]
events$total_elevation <- cumsum(events$elevation)

plot(ggplot(aes(x = time, y = total_elevation), data = events) + geom_line())
import requests
from bs4 import BeautifulSoup
import csv
import getpass
import os.path
from datetime import datetime
from citybike import CitybikeAccount
# Incrementally download the user's rides into rides.csv: only rides newer
# than the last one already in the file are fetched and appended.
outputfile = 'rides.csv'
last_existing_time = datetime.min  # time of the newest ride already in the csv

# get the login data from the user
username = raw_input("Username: ")
password = getpass.getpass("Password: ")

# look if a csv file already exists so only new rides will be loaded;
# if anything goes wrong (missing file, empty file, wrong column content)
# just assume the file needs to be (re)created
try:
    with open(outputfile, 'r') as f:
        last_existing_time = datetime.strptime(list(csv.reader(f))[-1][2], '%d.%m.%Y %H:%M')
# narrowed from a bare `except:`, which would also swallow
# KeyboardInterrupt and SystemExit
except (IOError, OSError, IndexError, ValueError):
    print("Error in reading existing file. It will be created or overwritten.")

# log in and keep the session cookie
print("logging in")
my_acc = CitybikeAccount(username, password)
print("loged in as: " + my_acc.username)

# append the output rows to this array
output = []
pages = my_acc.get_page_count()
print(str(pages) + " pages found")

newdata = True  # helper for aborting the double loop
# load all pages and add them to the outputs
for i in range(1, int(pages) + 1):
    if not newdata:  # check if the inner loop was aborted
        break
    # load the current table
    print("Loading page " + str(i) + "/" + str(pages))
    # read the rows
    for output_row in my_acc.load_page(i):
        # check if the row is newer than the last ride from the csv
        time = datetime.strptime(output_row[2], '%d.%m.%Y %H:%M')
        if time > last_existing_time:
            # add the row to the output array
            output.append(output_row)
        else:
            # stop the data collection if the ride already exists
            print("All new data loaded. Abort data collection")
            newdata = False
            break

# reverse the output array so the newest rides come last
output.reverse()

# write the output array to the csv
print("writing csv")
with open(outputfile, 'ab') as f:
    writer = csv.writer(f)
    # if it is a new file or has an error, delete the content and write a header
    if last_existing_time == datetime.min:
        # truncate(0) empties the file regardless of the current position;
        # a bare truncate() cuts at the file position, which is unspecified
        # right after opening in append mode
        f.truncate(0)
        writer.writerow(['date', 'start_station', 'start_time', 'end_station', 'end_time', 'price', 'elevation'])
    writer.writerows(output)
import requests
import json
import csv
import umlaut as uml
# Fetch the Citybike Wien station list from the Vienna open-data WFS
# endpoint and write it to stations.csv as (station, lat, lon).
print('get data from data.wien.gv.at')
resp = requests.get('https://data.wien.gv.at/daten/geo?service=WFS&request=GetFeature&version=1.1.0&typeName=ogdwien:CITYBIKEOGD&srsName=EPSG:4326&outputFormat=json')
geodata = json.loads(resp.content)

print('convert data')
# GeoJSON stores coordinates as [lon, lat]; emit name, lat, lon
rows = [[uml.normalize(feat['properties']['STATION']),
         feat['geometry']['coordinates'][1],
         feat['geometry']['coordinates'][0]]
        for feat in geodata['features']]

print('writing csv')
with open('stations.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(['station', 'lat', 'lon'])
    writer.writerows(rows)
library(ggmap)
library(ggplot2)
# Load station coordinates and the start/end station of every ride.
stations <- read.csv("stations.csv")
rides <- read.csv("rides.csv")[,c('start_station', 'end_station')]
# Bounding box around Vienna's Citybike coverage area (lon/lat, EPSG:4326).
location_box = c(right = 16.428252, bottom = 48.173532, left = 16.297446,top = 48.254632)
# NOTE(review): fetches a Stamen toner tile background via ggmap —
# tile-source availability may change; confirm get_stamenmap still works.
map <- get_stamenmap(bbox = location_box, zoom = 13, maptype = "toner-background")
map <- ggmap(map)
# Draw every distinct ride connection as a red segment between its start
# and end station on the base map; segment opacity encodes how often that
# exact connection appears in the rides table.
plot_rides <- function(){
# collapse identical (start, end) pairs into one row with their frequency
rides_uniq <- subset(data.frame(table(rides)), Freq>0)
# attach coordinates by matching station names against the stations table
rides_uniq$start_lat <- stations$lat[match(rides_uniq$start_station, stations$station)]
rides_uniq$start_lon <- stations$lon[match(rides_uniq$start_station, stations$station)]
rides_uniq$end_lat <- stations$lat[match(rides_uniq$end_station, stations$station)]
rides_uniq$end_lon <- stations$lon[match(rides_uniq$end_station, stations$station)]
# one segment per unique connection, alpha scaled by ride count
lines <- geom_segment(aes(x = start_lon,
y = start_lat,
xend=end_lon,
yend=end_lat,
alpha=Freq
),
data = rides_uniq,
color="red",
size = 1
)
# strip axes and margins so only the map with the overlay remains
plot(map + lines
+ xlab("") + ylab("")
+ theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.margin = unit(c(0, 0, -1, -1), 'lines')
)
)
}
# Draw a 2D density heatmap of station usage: every start and end station
# of every ride counts as one observation, so busy stations dominate.
plot_heatmap <- function(){
# stack start and end stations into a single observation list
all_used_stations = data.frame(station=unlist(list(rides$start_station, rides$end_station)))
# attach coordinates by matching station names against the stations table
all_used_stations$lat <- stations$lat[match(all_used_stations$station, stations$station)]
all_used_stations$lon <- stations$lon[match(all_used_stations$station, stations$station)]
# NOTE(review): ..level.. is the pre-ggplot2-3.4 spelling of after_stat(level);
# confirm the installed ggplot2 version still accepts it
heatmap <- stat_density2d(aes(x = lon,
y = lat,
fill = ..level..,
alpha = ..level..
),
all_used_stations,
size = 0.1,
bins = 16,
geom = "polygon",
show.legend=F
)
# green-to-red fill, translucent overlay, axes and margins stripped
plot(map + heatmap
+ scale_fill_gradient(low = "green", high = "red")
+ scale_alpha(range = c(0, 0.5))
+ xlab("") + ylab("")
+ theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.margin = unit(c(0, 0, -1, -1), 'lines'
)
)
)
}
# -*- coding: utf-8 -*-
def normalize(text):
    """Replace German umlauts and sharp s with their ASCII digraphs.

    Handles both lower- and upper-case umlauts (the original only
    covered lower case, so e.g. u'Österreich' passed through
    unchanged). Any other characters are returned as-is.
    """
    chars = {u'ö': 'oe', u'ä': 'ae', u'ü': 'ue', u'ß': 'ss',
             u'Ö': 'Oe', u'Ä': 'Ae', u'Ü': 'Ue'}
    for char in chars:
        text = text.replace(char, chars[char])
    return text
# Bar chart of the number of rides per weekday.
rides <- read.csv("rides.csv")[, c('date', 'start_time', 'end_time', 'elevation')]
rides$date <- as.Date(rides$date, format = "%d.%m.%Y")
day_order <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
# fix the factor levels so the bars appear in calendar order, not alphabetically
rides$weekday <- factor(weekdays(rides$date), levels = day_order)
barplot(table(rides$weekday))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment