Skip to content

Instantly share code, notes, and snippets.

@fmassot
Last active December 30, 2021 14:48
Show Gist options
  • Save fmassot/f70810dee26597eab0b42a22e8c5cb02 to your computer and use it in GitHub Desktop.
Save fmassot/f70810dee26597eab0b42a22e8c5cb02 to your computer and use it in GitHub Desktop.
gh-archive dataset build script
#!/usr/bin/env bash
#
# Build the monthly gh-archive datasets and upload them to S3.
#
# Usage: <script> YEAR MONTH        (e.g. 2021 07)
#   YEAR   four-digit year
#   MONTH  two-digit, zero-padded month (01-12), exactly as it appears in the
#          GH Archive file names
#
# Steps:
#   1. Exit early (status 0) if the full dataset of that month is already in
#      the bucket.
#   2. Download the hourly GH Archive dumps of the month into the current
#      directory.
#   3. Flatten every event with jq into one JSON-lines file (full dataset).
#   4. Derive the "text-only" dataset (events whose body is not null),
#      compress it and move it to S3.
#   5. Compress the full dataset, move it to S3 and delete the hourly dumps.
#
# Requires: bash >= 4, aws, wget, gzip, jq, find, sort.

# Abort on the first unhandled failure so that a truncated dataset is never
# uploaded (and the downloaded dumps are never deleted) after a broken step.
set -euo pipefail

readonly GH_ARCHIVE_URL='https://data.gharchive.org'
readonly S3_PREFIX='s3://quickwit-datasets-public/gh-archive'

# Print an error message on stderr and stop the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

year_re='^[0-9]{4}$'
month_re='^(0[1-9]|1[0-2])$'
if (( $# != 2 )) || [[ ! $1 =~ ${year_re} || ! $2 =~ ${month_re} ]]; then
  printf 'Usage: %s YEAR MONTH   (e.g. %s 2021 07)\n' "${0##*/}" "${0##*/}" >&2
  exit 2
fi

year=$1
month=$2
ch_json_output_filename="gh-archive-${year}-${month}.json"
ch_gz_output_filename="gh-archive-${year}-${month}.json.gz"
quickwit_gz_output_filename="gh-archive-${year}-${month}-text-only.json.gz"

# An empty listing means the dataset still has to be built. `aws s3 ls` is
# expected to exit non-zero when nothing matches, so its status is
# deliberately ignored here; only its output is used.
s3_ls_result=$(aws s3 ls "${S3_PREFIX}/${ch_gz_output_filename}") || true
printf '%s\n' "${s3_ls_result}"
if [[ -n "${s3_ls_result}" ]]; then
  echo "Dataset already exists."
  exit 0
fi

echo "Fetch gh files..."
# Days 29-31 are requested for every month; the ones that do not exist make
# wget finish with status 8 (server error response), which is tolerated.
# Any other failure (network, disk, ...) is fatal.
wget_status=0
wget -nv --continue \
  "${GH_ARCHIVE_URL}/${year}-${month}-"{01..31}-{0..23}.json.gz \
  || wget_status=$?
case "${wget_status}" in
  0) ;;
  8) printf 'warning: some hourly dumps could not be fetched\n' >&2 ;;
  *) die "wget failed with status ${wget_status}" ;;
esac

echo "Build dataset for month ${year}-${month}..."
# Collect the dumps in chronological order: numerically by day (3rd
# dash-separated field), then by hour (4th field). The names only contain
# digits, dashes and dots, so newline-delimited handling is safe.
mapfile -t archives < <(
  find . -maxdepth 1 -type f -name "${year}-${month}-*-*.json.gz" \
    | LC_ALL=C sort -t '-' -k3,3n -k4,4n
)
if (( ${#archives[@]} == 0 )); then
  die "no GH Archive dump found for ${year}-${month}"
fi

# Build monthly dataset for clickhouse.
# The per-stage statuses are captured so that a decompression failure can be
# told apart from a jq failure.
pipe_status=(0 0)
gzip -cd -- "${archives[@]}" \
  | jq -c '
{
id: (.id|tonumber),
event_type: .type,
actor_login: (.actor.login? // .actor_attributes.login? // (.actor | strings) // null),
repo_name: (.repo.name? // (.repository.owner? + "/" + .repository.name?) // null),
created_at:(.created_at|fromdate),
action: .payload.action,
number: (.payload.issue.number? // .payload.pull_request.number? // .payload.number? // null),
title: (.payload.issue.title? // .payload.pull_request.title? // null),
labels: ([.payload.issue.labels?[]?.name // .payload.pull_request.labels?[]?.name]),
ref: (.payload.ref? // null),
additions: (.payload.pull_request.additions? // null),
deletions: (.payload.pull_request.deletions? // null),
commit_id: (.payload.comment.commit_id? // null),
body:(.payload.review.body // .payload.comment.body // .payload.issue.body? // .payload.pull_request.body? // .payload.release.body? // null)
}' \
  > "${ch_json_output_filename}" \
  || pipe_status=("${PIPESTATUS[@]}")

if (( pipe_status[0] != 0 )); then
  die "decompressing the hourly dumps failed (gzip status ${pipe_status[0]})"
fi
case "${pipe_status[1]}" in
  0) ;;
  # Status 5 is what jq reports when the filter raised a runtime error on an
  # event (e.g. an id or timestamp it cannot convert); jq drops that event
  # and carries on, so this is only worth a warning.
  5) printf 'warning: jq skipped events it could not convert\n' >&2 ;;
  *) die "jq failed with status ${pipe_status[1]}" ;;
esac

# Build monthly dataset for quickwit and upload directly.
echo "Build text-only dataset..."
jq -c '. | select( .body != null )' < "${ch_json_output_filename}" \
  | gzip > "${quickwit_gz_output_filename}"
aws s3 mv "${quickwit_gz_output_filename}" \
  "${S3_PREFIX}/${quickwit_gz_output_filename}"

# Upload ch dataset.
# -f: overwrite a stale archive left behind by an earlier, interrupted run
# instead of refusing to compress and then uploading the stale file.
gzip -f -- "${ch_json_output_filename}"
aws s3 mv "${ch_gz_output_filename}" "${S3_PREFIX}/${ch_gz_output_filename}"

# Remove gh-archive files (only reached once both uploads succeeded).
rm -- "${archives[@]}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment