Skip to content

Instantly share code, notes, and snippets.

@AlexandraKapp
Last active October 25, 2022 10:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AlexandraKapp/296f79e77e3b7772f8af486a72689da0 to your computer and use it in GitHub Desktop.
# A Script to
# - download the GeoLife (https://www.microsoft.com/en-us/research/publication/geolife-gps-trajectory-dataset-user-guide/) data
# - transform it to a .csv file
# - and cut records to given outlines
# not very performant because of the preprocessing with pandas (takes about 30 minutes)
# produces a 1.8 GB output csv file
import os
from pathlib import Path
import csv
import numpy as np
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from tqdm.auto import tqdm
##### INPUT VARIABLES #####
# set path names
RAW_DATA_PATH="raw/geolife"  # where the downloaded GeoLife archive is extracted to
PROCESSED_DATA_PATH ="preprocessed"  # where the resulting csv files are written
# set geo boundaries
# Bounding box (longitude/latitude) used when CUT_RECORDS_TO_BOUNDARY is True;
# presumably covers the Beijing area of the GeoLife study — TODO confirm.
CUT_RECORDS_TO_BOUNDARY = True
LNG_MIN=116.08
LNG_MAX=116.69
LAT_MIN=39.66
LAT_MAX=40.27
############ Download data ###############
# GEOLIFE
if os.path.exists(RAW_DATA_PATH):
    print("Geolife data already exists. Download is skipped.")
else:
    url = "https://download.microsoft.com/download/F/4/8/F4894AA5-FDBC-481E-9285-D5F8C4C4F039/Geolife%20Trajectories%201.3.zip"
    with tqdm(total=1, desc="Download geolife data") as pbar:  # progress bar
        # The archive is downloaded fully into memory, then extracted.
        # RAW_DATA_PATH is created by extractall only AFTER the download
        # succeeded, so a failed/interrupted download no longer leaves an
        # empty directory behind that would make later runs skip the download.
        with urlopen(url) as zipresp:
            with ZipFile(BytesIO(zipresp.read())) as zfile:
                zfile.extractall(RAW_DATA_PATH)
        pbar.update()
############ Preprocess data ###############
#### FUNCTIONS ####
def geolife_clean_plt(root, user_id, input_filepath, traj_id):
    """Read a single GeoLife .plt file and return its data rows as lists.

    Every .plt file starts with a fixed 6-line header, which is skipped.
    Each remaining CSV row is prefixed with the user id and trajectory id,
    yielding rows of the form
    [user_id, traj_id, lat, lng, '-', alt, dayNo, date, time]
    (all trailing fields are strings as read from the file).

    Parameters:
        root: directory containing the per-user data folders.
        user_id: name of the user's directory (e.g. "000").
        input_filepath: file name of the .plt file inside the user's
            "Trajectory" folder.
        traj_id: trajectory identifier to tag every row with.

    Returns:
        A list of row lists, one per trajectory point (empty if the file
        holds nothing beyond the header).
    """
    plt_path = os.path.join(root, user_id, "Trajectory", input_filepath)
    with open(plt_path, "rt") as fin:
        reader = csv.reader(fin)
        # Skip the 6 header lines lazily instead of materializing the whole
        # file first and slicing ([...][6:] read everything into memory).
        for _ in range(6):
            next(reader, None)
        return [[user_id, traj_id] + row for row in reader]
def geolife_data_to_df(dir):
    """Collect all GeoLife trajectory points into a single DataFrame.

    Walks every user directory under `dir`, parses each .plt file in the
    user's "Trajectory" folder via geolife_clean_plt, and numbers the
    trajectories per user starting at 0.

    Parameters:
        dir: the GeoLife "Data" directory containing one folder per user.

    Returns:
        pandas.DataFrame with columns
        ["uid", "tid", "lat", "lng", "-", "Alt", "dayNo", "date", "time"].
    """
    col_names = ["uid", "tid", "lat", "lng", "-", "Alt", "dayNo", "date", "time"]
    user_id_dirs = sorted(
        name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))
    )
    data = []
    with tqdm(total=len(user_id_dirs), desc="Preprocess Geolife data") as pbar:  # progress bar
        for user_id in user_id_dirs:
            traj_dir = os.path.join(dir, user_id, "Trajectory")
            # Sort the file names so trajectory ids are deterministic across
            # runs and platforms (os.listdir order is arbitrary); drop macOS
            # .DS_Store artifacts.
            plt_files = sorted(
                f for f in os.listdir(traj_dir) if not f.endswith(".DS_Store")
            )
            for traj_id, plt_file in enumerate(plt_files):
                data += geolife_clean_plt(dir, user_id, plt_file, traj_id)
            pbar.update()
    return pd.DataFrame(data, columns=col_names)
#####
##### SCRIPT #####
geolife_csv_path = os.path.join(PROCESSED_DATA_PATH, "geolife.csv")
if Path(geolife_csv_path).exists():
    # Re-use the already preprocessed csv instead of re-parsing the raw data.
    print("Geolife data is already preprocessed. Processing is skipped.")
    df = pd.read_csv(geolife_csv_path)
else:
    if not Path(PROCESSED_DATA_PATH).exists():
        os.makedirs(PROCESSED_DATA_PATH)
    geolife_dir = os.path.join(RAW_DATA_PATH, "Geolife Trajectories 1.3", "Data")
    df = geolife_data_to_df(geolife_dir)
    # Merge the separate date and time columns into one datetime column.
    df["datetime"] = pd.to_datetime(df["date"] + " " + df["time"])
    df.drop(["date", "time"], inplace=True, axis=1)
    ## fix datetime timezone: the raw timestamps are treated as GMT and
    ## converted to Beijing local time, then stored timezone-naive again.
    df["datetime"] = (
        df["datetime"]
        .dt.tz_localize("GMT")
        .dt.tz_convert("Asia/Shanghai")
        .dt.tz_localize(None)
    )
    df.to_csv(geolife_csv_path, index=False)
############ Cut to outline of given boundary ###############
if CUT_RECORDS_TO_BOUNDARY:
    print("Records are cut to outline of given boundary.")
    # Coordinates are still strings when the data was freshly parsed
    # from the .plt files, so force them to float first.
    df["lat"] = df["lat"].astype(float)
    df["lng"] = df["lng"].astype(float)
    # Keep only records strictly inside the bounding box (exclusive bounds,
    # matching the original > / < comparisons).
    inside = df["lat"].between(LAT_MIN, LAT_MAX, inclusive="neither") & df[
        "lng"
    ].between(LNG_MIN, LNG_MAX, inclusive="neither")
    df = df[inside]
    df.to_csv(os.path.join(PROCESSED_DATA_PATH, "geolife_in_boundary.csv"), index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment