Last active
April 12, 2018 11:45
-
-
Save bernikr/02340d43bb0a6ab495818e9b961961cd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.csv | |
*.pyc | |
.Rhistory | |
.RData |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import umlaut | |
class CitybikeAccount:
    """Session-backed access to a citybikewien.at user account.

    Logs in on construction (storing the session cookie in a
    requests.Session) and exposes helpers to read the "my rides"
    table from the website.
    """
    def __init__(self, username, password):
        login_data = {}
        login_data["username"] = username
        login_data["password"] = password
        # start a request session to store the login cookie
        self.s = requests.Session()
        # get the hidden login fields (form tokens) needed to login
        frontpage = self.s.get("https://www.citybikewien.at/de")
        fp = BeautifulSoup(frontpage.content, 'html.parser')
        login = fp.find('form', id='mloginfrm')
        hiddeninputs = login.find_all('input', type='hidden')
        for i in hiddeninputs:
            login_data[i['name']] = i['value']
        # login to the site and save the cookie to the session
        login_url = "https://www.citybikewien.at/de/component/users/?task=user.login&Itemid=101"
        logedin = self.s.post(login_url, data=login_data)
        soup = BeautifulSoup(logedin.content, 'html.parser')
        user_name = soup.select(".user-name-data")
        # BUG FIX: the code below reads user_name[1], so fewer than TWO
        # matches means the login failed.  The old check (`< 1`) let a
        # single match through and crashed with an IndexError instead of
        # reporting the failed login.
        if len(user_name) < 2:
            print("invalid login")
            exit()
        # drop the trailing character of the displayed name
        # (presumably a separator glyph — TODO confirm against the markup)
        self.username = user_name[1].get_text()[:-1]

    def get_page_count(self):
        """Return the number of result pages of the ride table (5 rides/page)."""
        # get the number of existing rows from the website
        page = self.s.get("https://www.citybikewien.at/en/my-rides")
        soup = BeautifulSoup(page.content, 'html.parser')
        tab = soup.select('#content div + p')[0]
        line_num = int(tab.get_text().split(' ')[0])
        # BUG FIX: use floor division so the result stays an int under
        # Python 3 as well (callers feed it straight into range())
        return line_num // 5 + 1

    def load_page(self, i):
        """Return page *i* (1-based) of the ride table as a list of rows.

        Each row is a list of strings:
        [date, start_station, start_time, end_station, end_time, price, elevation]
        """
        data_url = "https://www.citybikewien.at/de/meine-fahrten?start=" + str((i-1)*5)
        page = self.s.get(data_url)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.select('#content table tbody')[0]
        rows = []
        for row in table.find_all('tr'):
            output_row = []
            # go through every cell in a row
            for cell in row.find_all('td'):
                # check if it is a 'normal' cell with only one data field
                children = cell.findChildren()
                if len(children) <= 1:
                    output_row.append(cell.get_text())
                else:
                    # if it contains a location and a date split it into two
                    output_row.append(children[0].get_text())
                    output_row.append(children[1].get_text() + ' ' + children[2].get_text())
            # cut off the Euro-sign from the price and the 'm' from the elevation
            output_row[5] = output_row[5][2:]
            output_row[6] = output_row[6][:-2]
            # remove newlines and replace umlaute
            output_row = [umlaut.normalize(t.replace('\n', ' ').strip()) for t in output_row]
            rows.append(output_row)
        return rows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plot the cumulative elevation climbed over time from the exported rides.csv.
library(ggplot2)
# Only the columns needed for the elevation timeline.
rides <- read.csv("rides.csv")[,c('start_time','end_time', 'elevation')]
# Timestamps are exported as "dd.mm.YYYY HH:MM" strings.
rides$start_time <- as.POSIXct(rides$start_time, format="%d.%m.%Y %H:%M")
rides$end_time <- as.POSIXct(rides$end_time, format="%d.%m.%Y %H:%M")
# Optionally restrict the plot to a single day (left disabled).
#rides <- subset(rides, start_time > as.POSIXct("2017-03-28"))
#rides <- subset(rides, end_time < as.POSIXct("2017-03-29"))
# Each ride contributes 0 m at its start time and its full elevation at its end time.
start_times = data.frame(time = rides$start_time, elevation=rep(0, length(rides$start_time)))
end_times = data.frame(time = rides$end_time, elevation=rides$elevation)
# Merge both event lists and sort chronologically.
elevations = rbind(start_times, end_times)
elevations = elevations[order(elevations$time),]
# Running total of elevation climbed so far.
elevations$total_elevation = cumsum(elevations$elevation)
plot(ggplot(aes(x = time, y = total_elevation), data = elevations) + geom_line())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests
from bs4 import BeautifulSoup
import csv
import getpass
import os.path
from datetime import datetime
from citybike import CitybikeAccount

# Incrementally export all rides of a Citybike Wien account to rides.csv:
# only rides newer than the last one already stored are fetched.
outputfile = 'rides.csv'
last_existing_time = datetime.min  # saves the time of the last known ride

# get the login data from the user
username = raw_input("Username: ")
password = getpass.getpass("Password: ")

# look if a csv file already exists so only new rides will be loaded.
# on any of the expected errors (file missing, empty file, malformed
# row or date) just assume the file needs to be (re)created -- but do
# not swallow unrelated exceptions like the old bare `except:` did.
try:
    with open(outputfile, 'r') as f:
        last_existing_time = datetime.strptime(list(csv.reader(f))[-1][2], '%d.%m.%Y %H:%M')
except (IOError, OSError, IndexError, ValueError):
    print("Error in reading existing file. It will be created or overwritten.")

# log in; the account object keeps the session cookie
print("logging in")
my_acc = CitybikeAccount(username, password)
print("loged in as: " + my_acc.username)

# append the output rows to this array
output = []
# force an int so range() below works even if the helper returns a
# float (e.g. true division under Python 3)
pages = int(my_acc.get_page_count())
print(str(pages) + " pages found")

newdata = True  # helper for aborting the double loop
# load all pages and add them to the outputs
for i in range(1, pages + 1):
    if not newdata:  # check if the inner loop was aborted
        break
    # load the current table
    print("Loading page " + str(i) + "/" + str(pages))
    # read the rows
    for output_row in my_acc.load_page(i):
        # check if the row is newer than the last ride from the csv
        time = datetime.strptime(output_row[2], '%d.%m.%Y %H:%M')
        if time > last_existing_time:
            # add the row to the output array
            output.append(output_row)
        else:
            # stop the data collection if the ride already exists
            print("All new data loaded. Abort data collection")
            newdata = False
            break

# reverse the output array so the newest rides come last
output.reverse()

# write the output array to the csv.
# BUG FIX: the old code opened in append mode and called f.truncate(),
# but in append mode the position is at EOF so truncate() never cleared
# a corrupted file.  Open in write mode (and emit the header) when
# starting fresh, append mode otherwise.
print("writing csv")
fresh_file = last_existing_time == datetime.min
with open(outputfile, 'wb' if fresh_file else 'ab') as f:
    writer = csv.writer(f)
    if fresh_file:
        writer.writerow(['date', 'start_station', 'start_time', 'end_station', 'end_time', 'price', 'elevation'])
    writer.writerows(output)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests
import json
import csv
import umlaut as uml

# Download all Citybike Wien station locations from the open-data WFS
# service and write them to stations.csv as (station, lat, lon) rows.
print('get data from data.wien.gv.at')
response = requests.get('https://data.wien.gv.at/daten/geo?service=WFS&request=GetFeature&version=1.1.0&typeName=ogdwien:CITYBIKEOGD&srsName=EPSG:4326&outputFormat=json')
station_data = json.loads(response.content)

print('convert data')
rows = []
for feature in station_data['features']:
    # GeoJSON stores coordinates as [lon, lat]; the csv wants lat first
    name = uml.normalize(feature['properties']['STATION'])
    coords = feature['geometry']['coordinates']
    rows.append([name, coords[1], coords[0]])

print('writing csv')
with open('stations.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(['station', 'lat', 'lon'])
    writer.writerows(rows)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Visualize rides on a map of Vienna (Stamen toner background via ggmap).
library(ggmap)
library(ggplot2)
# Station coordinates and ride endpoints exported by the Python scripts.
stations <- read.csv("stations.csv")
rides <- read.csv("rides.csv")[,c('start_station', 'end_station')]
# Bounding box around Vienna for the background map tiles.
location_box = c(right = 16.428252, bottom = 48.173532, left = 16.297446,top = 48.254632)
map <- get_stamenmap(bbox = location_box, zoom = 13, maptype = "toner-background")
map <- ggmap(map)
# Draw one red line per distinct (start, end) station pair;
# line opacity encodes how often that pair occurred.
plot_rides <- function(){
# count occurrences of each (start, end) pair and drop unused pairs
rides_uniq <- subset(data.frame(table(rides)), Freq>0)
# look up the coordinates of both endpoints of every pair
rides_uniq$start_lat <- stations$lat[match(rides_uniq$start_station, stations$station)]
rides_uniq$start_lon <- stations$lon[match(rides_uniq$start_station, stations$station)]
rides_uniq$end_lat <- stations$lat[match(rides_uniq$end_station, stations$station)]
rides_uniq$end_lon <- stations$lon[match(rides_uniq$end_station, stations$station)]
lines <- geom_segment(aes(x = start_lon,
y = start_lat,
xend=end_lon,
yend=end_lat,
alpha=Freq
),
data = rides_uniq,
color="red",
size = 1
)
# strip axes and margins so only the map itself is shown
plot(map + lines
+ xlab("") + ylab("")
+ theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.margin = unit(c(0, 0, -1, -1), 'lines')
)
)
}
# Draw a 2D density heatmap over all stations a ride started or ended at
# (stations are counted once per use, so busy stations weigh more).
plot_heatmap <- function(){
# pool start and end stations into one list of station usages
all_used_stations = data.frame(station=unlist(list(rides$start_station, rides$end_station)))
all_used_stations$lat <- stations$lat[match(all_used_stations$station, stations$station)]
all_used_stations$lon <- stations$lon[match(all_used_stations$station, stations$station)]
heatmap <- stat_density2d(aes(x = lon,
y = lat,
fill = ..level..,
alpha = ..level..
),
all_used_stations,
size = 0.1,
bins = 16,
geom = "polygon",
show.legend=F
)
# green = low density, red = high density; strip axes and margins
plot(map + heatmap
+ scale_fill_gradient(low = "green", high = "red")
+ scale_alpha(range = c(0, 0.5))
+ xlab("") + ylab("")
+ theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.margin = unit(c(0, 0, -1, -1), 'lines'
)
)
)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
def normalize(text):
    """Replace lowercase German umlauts and sharp s with ASCII digraphs.

    Returns *text* with every occurrence of oe/ae/ue/ss substituted for
    the corresponding special character; all other characters pass
    through unchanged.
    """
    replacements = ((u'ö', 'oe'), (u'ä', 'ae'), (u'ü', 'ue'), (u'ß', 'ss'))
    for special, ascii_equiv in replacements:
        text = text.replace(special, ascii_equiv)
    return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bar chart of how many rides fall on each weekday (Monday first).
rides <- read.csv("rides.csv")[,c('date','start_time','end_time', 'elevation')]
rides$date <- as.Date(rides$date, format="%d.%m.%Y")
# fix the factor levels so the bars appear in calendar order,
# not alphabetically
weekday_order <- c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
rides$weekday <- factor(weekdays(rides$date), levels=weekday_order)
barplot(table(rides$weekday))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment