gitlog.sh - clones the Apache CloudStack repository and exports its git history to a CSV file
download-mboxes.sh - downloads the mbox archives of several CloudStack mailing lists (ML)
parse-mbox-data.sh - parses the downloaded mbox archives and exports the mailing-list data to CSV files
#!/bin/bash
#
# download-mboxes.sh - download monthly mbox archives of the CloudStack
# mailing lists into ./mbox/<list>/.
#
# For every list the monthly archive URLs of the configured periods are
# collected and handed to wget, up to 8 downloads in parallel.
#
# Requires: wget, xargs (with -P support).
#
# NOTE: wget is called without -nc/-N, so re-running the script in an already
# populated directory stores additional copies (<name>.1, <name>.2, ...)
# instead of overwriting; clean ./mbox first if a fresh data set is needed.
set -euo pipefail

readonly BASE_URL="http://mail-archives.apache.org/mod_mbox"

# Periods to fetch, encoded as "<year>:<last month>"; months 1..<last month>
# of each year are downloaded.
# NOTE(review): 2015 stops at September while 2016 starts in January, so
# 2015-10..2015-12 are never fetched - confirm that this gap is intended.
readonly PERIODS=("2014:12" "2015:9" "2016:6")

mkdir -p mbox
cd mbox
readonly ROOT=$PWD

for list in dev users users-cn marketing; do
  cd "$ROOT"
  mkdir -p "$list"
  cd "$list"

  # Collect the URLs in an array so that no word splitting is relied upon.
  urls=()
  for period in "${PERIODS[@]}"; do
    year=${period%%:*}
    last_month=${period##*:}
    for ((month = 1; month <= last_month; month++)); do
      printf -v padded_month '%02d' "$month"
      urls+=("${BASE_URL}/cloudstack-${list}/${year}${padded_month}.mbox")
    done
  done

  # Download in parallel: one URL per wget process, at most 8 at a time.
  # xargs returns non-zero if any wget fails, which aborts the script (set -e).
  echo "Downloading files: ${urls[*]}"
  printf '%s\n' "${urls[@]}" | xargs -n 1 -P 8 wget
done
#!/bin/bash
#
# gitlog.sh - clone the Apache CloudStack repository and export its commit
# history to ./gitlog.csv.
#
# Each commit becomes one ';'-separated line:
#   hash;author date;author date (unix timestamp);author name;author email;
#   committer name;committer email
# The values are written as git prints them, without quoting, so a name that
# contains ';' adds an extra column to its line.
#
# NOTE(review): verify that this repository host is still being served.
git clone https://git-wip-us.apache.org/repos/asf/cloudstack.git
# The log is exported only if entering the checkout succeeded; the output
# path is relative to the checkout, i.e. it lands next to the clone.
cd cloudstack && git log --format='%H;%ad;%at;%aN;%aE;%cN;%cE' > ../gitlog.csv
#!/usr/bin/env python
"""Extract (date, sender, message id) triples from downloaded mbox archives.

For every mailing list in ``lists`` all files below ``mboxLocation + <list>``
are scanned and one ``<list>.csv`` file is written to the current directory.
Each output line has the form ``date;sender;message-id``.
"""
from os import listdir

mboxLocation = "./mbox/"
lists = ["dev", "marketing", "users", "users-cn"]

# Header prefixes that are collected for every message. Matching is
# case-sensitive and expects exactly one space after the colon.
# NOTE(review): headers spelled differently (e.g. "Message-Id:") are not
# recognised, so such messages produce no row - confirm this is acceptable.
_DATE = "Date: "
_FROM = "From: "
_MSGID = "Message-ID: "
_PREFIXES = (_DATE, _FROM, _MSGID)


def _extract_address(value):
    """Return the text between the first '<' and the next '>', else value."""
    if "<" in value:
        return value.split("<")[1].split(">")[0]
    return value


def parse_mbox_lines(lines, ml):
    """Collect one [date, sender, message id] row per message.

    lines -- iterable of text lines of one mbox file
    ml    -- mailing list name; a message starts at a line beginning with
             "From " + ml

    Returns a list of three-element lists of strings. A message contributes
    a row as soon as all three headers have been seen; messages that lack
    one of them contribute nothing.
    """
    rows = []
    fields = None  # None while no message block is open
    for line in lines:
        if line.startswith("From " + ml):
            # A new message begins: forget everything collected so far.
            fields = {}
        if fields is None:
            continue
        for prefix in _PREFIXES:
            # Only the first occurrence of a header counts, so a repeated
            # line (e.g. quoted headers in the body) can neither overwrite
            # a value nor complete a row with data of another message.
            if line.startswith(prefix) and prefix not in fields:
                # Slice instead of split() so that a value containing the
                # prefix text again is kept intact.
                fields[prefix] = line[len(prefix):].strip()
                break
        if len(fields) == len(_PREFIXES):
            rows.append([fields[_DATE],
                         _extract_address(fields[_FROM]),
                         fields[_MSGID]])
            # Ignore the rest of the message until the next "From " line.
            fields = None
    return rows


def main():
    """Parse all archives of every list and write one CSV file per list."""
    for ml in lists:
        rows = []
        # Sorted so that the row order does not depend on directory order.
        for name in sorted(listdir(mboxLocation + ml)):
            # Text mode with the default encoding, as the script always did.
            with open(mboxLocation + ml + "/" + name, "r") as f:
                rows.extend(parse_mbox_lines(f, ml))
        with open(ml + ".csv", "w") as out:
            for row in rows:
                out.write(";".join(row) + "\n")


if __name__ == "__main__":
    main()