Skip to content

Instantly share code, notes, and snippets.

@bernikr
Last active April 12, 2018 11:45
Show Gist options
  • Save bernikr/02340d43bb0a6ab495818e9b961961cd to your computer and use it in GitHub Desktop.
*.csv
*.pyc
.Rhistory
.RData
import requests
from bs4 import BeautifulSoup
import umlaut
class CitybikeAccount:
    """Session-backed scraper for a Citybike Wien user account.

    Logs in once on construction and keeps the session cookie in
    ``self.s`` for the subsequent page requests.
    """

    def __init__(self, username, password):
        """Log into citybikewien.at with the given credentials.

        Exits the process (``exit()``) if the login is rejected.
        On success, ``self.username`` holds the display name scraped
        from the logged-in page.
        """
        login_data = {"username": username, "password": password}
        # start a request session to store the login cookie
        self.s = requests.Session()
        # get the hidden login fields (form tokens) needed to login
        frontpage = self.s.get("https://www.citybikewien.at/de")
        fp = BeautifulSoup(frontpage.content, 'html.parser')
        login = fp.find('form', id='mloginfrm')
        hiddeninputs = login.find_all('input', type='hidden')
        for i in hiddeninputs:
            login_data[i['name']] = i['value']
        # login to the site and save the cookie to the session
        login_url = "https://www.citybikewien.at/de/component/users/?task=user.login&Itemid=101"
        logedin = self.s.post(login_url, data=login_data)
        soup = BeautifulSoup(logedin.content, 'html.parser')
        user_name = soup.select(".user-name-data")
        if len(user_name) < 1:
            print("invalid login")
            exit()
        # NOTE(review): index 1 assumes the page renders the name element
        # at least twice — confirm against the live markup. [:-1] drops a
        # trailing character from the scraped text.
        self.username = user_name[1].get_text()[:-1]

    def get_page_count(self):
        """Return the number of ride-listing pages (5 rides per page)."""
        # get the number of existing rows from the website
        page = self.s.get("https://www.citybikewien.at/en/my-rides")
        soup = BeautifulSoup(page.content, 'html.parser')
        tab = soup.select('#content div + p')[0]
        # the paragraph starts with the total number of rides
        line_num = int(tab.get_text().split(' ')[0])
        # use floor division: plain `/` yields a float on Python 3,
        # which breaks range() in callers (identical result on Python 2)
        return line_num // 5 + 1

    def load_page(self, i):
        """Scrape page *i* (1-based) of the ride history table.

        Returns a list of rows, each a list of normalized strings:
        [date, start_station, start_time, end_station, end_time,
        price, elevation].
        """
        data_url = "https://www.citybikewien.at/de/meine-fahrten?start=" + str((i - 1) * 5)
        page = self.s.get(data_url)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.select('#content table tbody')[0]
        rows = []
        for row in table.find_all('tr'):
            output_row = []
            # go through every cell in a row
            for cell in row.find_all('td'):
                # check if it is a 'normal' cell with only one data field
                children = cell.findChildren()
                if len(children) <= 1:
                    output_row.append(cell.get_text())
                else:
                    # if it contains a location and a date split it into two
                    output_row.append(children[0].get_text())
                    output_row.append(children[1].get_text() + ' ' + children[2].get_text())
            # skip rows without data cells (e.g. a stray header row) —
            # the original would raise IndexError below on such rows
            if len(output_row) < 7:
                continue
            # cut off the Euro sign from the price and the 'm' from the elevation
            output_row[5] = output_row[5][2:]
            output_row[6] = output_row[6][:-2]
            # remove newlines and replace umlauts
            output_row = [umlaut.normalize(t.replace('\n', ' ').strip()) for t in output_row]
            rows.append(output_row)
        return rows
# Plot the cumulative elevation climbed over time from the exported rides.
library(ggplot2)

rides <- read.csv("rides.csv")[, c('start_time', 'end_time', 'elevation')]
rides$start_time <- as.POSIXct(rides$start_time, format = "%d.%m.%Y %H:%M")
rides$end_time <- as.POSIXct(rides$end_time, format = "%d.%m.%Y %H:%M")
# optionally restrict the plot to a single day:
#rides <- subset(rides, start_time > as.POSIXct("2017-03-28"))
#rides <- subset(rides, end_time < as.POSIXct("2017-03-29"))

# every ride contributes 0 m at its start time and its climbed metres at its end
departures <- data.frame(time = rides$start_time,
                         elevation = rep(0, length(rides$start_time)))
arrivals <- data.frame(time = rides$end_time, elevation = rides$elevation)
events <- rbind(departures, arrivals)
events <- events[order(events$time), ]
events$total_elevation <- cumsum(events$elevation)

plot(ggplot(aes(x = time, y = total_elevation), data = events) + geom_line())
import requests
from bs4 import BeautifulSoup
import csv
import getpass
import os.path
from datetime import datetime
from citybike import CitybikeAccount
# Incrementally download the user's rides into rides.csv: only rides newer
# than the last one already in the file are fetched and appended.
outputfile = 'rides.csv'
last_existing_time = datetime.min  # time of the newest ride already in the csv

# get the login data from the user
username = raw_input("Username: ")
password = getpass.getpass("Password: ")

# look if a csv file already exists so only new rides will be loaded;
# if anything goes wrong (missing file, empty file, wrong column content)
# just assume the file needs to be (re)created
try:
    with open(outputfile, 'r') as f:
        last_existing_time = datetime.strptime(list(csv.reader(f))[-1][2], '%d.%m.%Y %H:%M')
# narrowed from a bare `except:`, which would also swallow
# KeyboardInterrupt and SystemExit
except (IOError, OSError, IndexError, ValueError):
    print("Error in reading existing file. It will be created or overwritten.")

# log in and keep the session cookie
print("logging in")
my_acc = CitybikeAccount(username, password)
print("loged in as: " + my_acc.username)

# append the output rows to this array
output = []
pages = my_acc.get_page_count()
print(str(pages) + " pages found")

newdata = True  # helper for aborting the double loop
# load all pages and add them to the outputs
for i in range(1, int(pages) + 1):
    if not newdata:  # check if the inner loop was aborted
        break
    # load the current table
    print("Loading page " + str(i) + "/" + str(pages))
    # read the rows
    for output_row in my_acc.load_page(i):
        # check if the row is newer than the last ride from the csv
        time = datetime.strptime(output_row[2], '%d.%m.%Y %H:%M')
        if time > last_existing_time:
            # add the row to the output array
            output.append(output_row)
        else:
            # stop the data collection if the ride already exists
            print("All new data loaded. Abort data collection")
            newdata = False
            break

# reverse the output array so the newest rides come last
output.reverse()

# write the output array to the csv
print("writing csv")
with open(outputfile, 'ab') as f:
    writer = csv.writer(f)
    # if it is a new file or has an error, delete the content and write a header
    if last_existing_time == datetime.min:
        # truncate(0) empties the file regardless of the current position;
        # a bare truncate() cuts at the file position, which is unspecified
        # right after opening in append mode
        f.truncate(0)
        writer.writerow(['date', 'start_station', 'start_time', 'end_station', 'end_time', 'price', 'elevation'])
    writer.writerows(output)
import requests
import json
import csv
import umlaut as uml
# Fetch the Citybike Wien station list from the Vienna open-data WFS
# endpoint and write it to stations.csv as (station, lat, lon).
print('get data from data.wien.gv.at')
resp = requests.get('https://data.wien.gv.at/daten/geo?service=WFS&request=GetFeature&version=1.1.0&typeName=ogdwien:CITYBIKEOGD&srsName=EPSG:4326&outputFormat=json')
geodata = json.loads(resp.content)

print('convert data')
# GeoJSON stores coordinates as [lon, lat]; emit name, lat, lon
rows = [[uml.normalize(feat['properties']['STATION']),
         feat['geometry']['coordinates'][1],
         feat['geometry']['coordinates'][0]]
        for feat in geodata['features']]

print('writing csv')
with open('stations.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(['station', 'lat', 'lon'])
    writer.writerows(rows)
library(ggmap)
library(ggplot2)
# Load station coordinates and the start/end station of every ride.
stations <- read.csv("stations.csv")
rides <- read.csv("rides.csv")[,c('start_station', 'end_station')]
# Bounding box around Vienna's Citybike coverage area (lon/lat, EPSG:4326).
location_box = c(right = 16.428252, bottom = 48.173532, left = 16.297446,top = 48.254632)
# NOTE(review): fetches a Stamen toner tile background via ggmap —
# tile-source availability may change; confirm get_stamenmap still works.
map <- get_stamenmap(bbox = location_box, zoom = 13, maptype = "toner-background")
map <- ggmap(map)
# Draw every distinct ride connection as a red segment between its start
# and end station on the base map; segment opacity encodes how often that
# exact connection appears in the rides table.
plot_rides <- function(){
# collapse identical (start, end) pairs into one row with their frequency
rides_uniq <- subset(data.frame(table(rides)), Freq>0)
# attach coordinates by matching station names against the stations table
rides_uniq$start_lat <- stations$lat[match(rides_uniq$start_station, stations$station)]
rides_uniq$start_lon <- stations$lon[match(rides_uniq$start_station, stations$station)]
rides_uniq$end_lat <- stations$lat[match(rides_uniq$end_station, stations$station)]
rides_uniq$end_lon <- stations$lon[match(rides_uniq$end_station, stations$station)]
# one segment per unique connection, alpha scaled by ride count
lines <- geom_segment(aes(x = start_lon,
y = start_lat,
xend=end_lon,
yend=end_lat,
alpha=Freq
),
data = rides_uniq,
color="red",
size = 1
)
# strip axes and margins so only the map with the overlay remains
plot(map + lines
+ xlab("") + ylab("")
+ theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.margin = unit(c(0, 0, -1, -1), 'lines')
)
)
}
# Draw a 2D density heatmap of station usage: every start and end station
# of every ride counts as one observation, so busy stations dominate.
plot_heatmap <- function(){
# stack start and end stations into a single observation list
all_used_stations = data.frame(station=unlist(list(rides$start_station, rides$end_station)))
# attach coordinates by matching station names against the stations table
all_used_stations$lat <- stations$lat[match(all_used_stations$station, stations$station)]
all_used_stations$lon <- stations$lon[match(all_used_stations$station, stations$station)]
# NOTE(review): ..level.. is the pre-ggplot2-3.4 spelling of after_stat(level);
# confirm the installed ggplot2 version still accepts it
heatmap <- stat_density2d(aes(x = lon,
y = lat,
fill = ..level..,
alpha = ..level..
),
all_used_stations,
size = 0.1,
bins = 16,
geom = "polygon",
show.legend=F
)
# green-to-red fill, translucent overlay, axes and margins stripped
plot(map + heatmap
+ scale_fill_gradient(low = "green", high = "red")
+ scale_alpha(range = c(0, 0.5))
+ xlab("") + ylab("")
+ theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.margin = unit(c(0, 0, -1, -1), 'lines'
)
)
)
}
# -*- coding: utf-8 -*-
def normalize(text):
    """Replace German umlauts and sharp s with their ASCII digraphs.

    Handles both lower- and upper-case umlauts (the original only
    covered lower case, so e.g. u'Österreich' passed through
    unchanged). Any other characters are returned as-is.
    """
    chars = {u'ö': 'oe', u'ä': 'ae', u'ü': 'ue', u'ß': 'ss',
             u'Ö': 'Oe', u'Ä': 'Ae', u'Ü': 'Ue'}
    for char in chars:
        text = text.replace(char, chars[char])
    return text
# Bar chart of the number of rides per weekday.
rides <- read.csv("rides.csv")[, c('date', 'start_time', 'end_time', 'elevation')]
rides$date <- as.Date(rides$date, format = "%d.%m.%Y")
day_order <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
# fix the factor levels so the bars appear in calendar order, not alphabetically
rides$weekday <- factor(weekdays(rides$date), levels = day_order)
barplot(table(rides$weekday))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment