Skip to content

Instantly share code, notes, and snippets.

@aymanfarhat
Last active August 9, 2022 12:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aymanfarhat/456cbf52adb9b252881e40f855cefb6d to your computer and use it in GitHub Desktop.
Save aymanfarhat/456cbf52adb9b252881e40f855cefb6d to your computer and use it in GitHub Desktop.
Example of extracting and flattening git logs and writing them into BigQuery
#Copyright 2022 Google LLC.
#SPDX-License-Identifier: Apache-2.0
# Load ./logs.jsonl (newline-delimited JSON) into your_dataset.your_table
# using the EU location, in replace mode and with schema auto-detection.
bq --location=EU load \
    --source_format=NEWLINE_DELIMITED_JSON \
    --autodetect \
    --replace \
    your_dataset.your_table \
    ./logs.jsonl
{"commit_hash": "1e0e4b74e2fad3a", "commit_subject": "Example commit 1", "author_name": "Author 1", "author_email": "author1@users.noreply.github.com", "author_date": "1655894118", "commiter_email": "noreply@github.com", "commiter_name": "GitHub", "file": "tools/sometool1/file.py\n"}
{"commit_hash": "e551dd384acc13f", "commit_subject": "Example commit 2", "author_name": "Author 2", "author_email": "author2@gmail.com", "author_date": "1655821884", "commiter_email": "noreply@github.com", "commiter_name": "GitHub", "file": "tools/sometool2/path/config.yaml\n"}
{"commit_hash": "f96748b7b3b8f54", "commit_subject": "Example commit 3", "author_name": "Author 3", "author_email": "author3@users.noreply.github.com", "author_date": "1655797941", "commiter_email": "noreply@github.com", "commiter_name": "GitHub", "file": "tools/sometool3/README.md\n"}
{"commit_hash": "f96748b7b3b8f54", "commit_subject": "Example commit 3", "author_name": "Author 3", "author_email": "author3@users.noreply.github.com", "author_date": "1655797941", "commiter_email": "noreply@github.com", "commiter_name": "GitHub", "file": "tools/sometool3/src/main/java/functions/SendNotification.java\n"}
{"commit_hash": "f96748b7b3b8f54", "commit_subject": "Example commit 3", "author_name": "Author 3", "author_email": "author3@users.noreply.github.com", "author_date": "1655797941", "commiter_email": "noreply@github.com", "commiter_name": "GitHub", "file": "tools/sometool3/src/main/java/functions/eventpojos/PubSubMessage.java\n"}
{"commit_hash": "f96748b7b3b8f54", "commit_subject": "Example commit 3", "author_name": "Author 3", "author_email": "author3@users.noreply.github.com", "author_date": "1655797941", "commiter_email": "noreply@github.com", "commiter_name": "GitHub", "file": "tools/sometool3/src/test/java/functions/SendNotificationTest.java\n"}
#Copyright 2022 Google LLC.
#SPDX-License-Identifier: Apache-2.0
# Run the extractor against the 'main' branch of the given repository and
# write the flattened log to ./logs.jsonl.
python extract.py \
    --branch=main \
    --git-dir=/path/to/your/repository/.git \
    --output=./logs.jsonl
#Copyright 2022 Google LLC.
#SPDX-License-Identifier: Apache-2.0
"""
A simple script for extracting git logs and flattening them into one record
per file per commit: file, commit, subject, author, committer, date. The
output is formatted as JSONL, suitable for loading into BigQuery.
"""
import io
import json
import re
import argparse
import subprocess
def get_logs(git_dir, fields, branch):
    """
    Run ``git log`` on ``origin/<branch>`` and yield its output line by line.

    Args:
        git_dir: Path handed to ``git --git-dir``.
        fields: Iterable of ``(name, placeholder)`` pairs. Each pair becomes
            one ``name: placeholder`` line of the ``--pretty=format:`` string.
        branch: Branch name; the log is read from ``origin/<branch>``.

    Yields:
        Every decoded output line, including its trailing newline.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status
            once its output has been fully consumed.
    """
    pretty_format = '%n'.join(
        f'{name}: {placeholder}' for name, placeholder in fields)
    command = [
        'git', '--git-dir', git_dir, 'log', f'origin/{branch}', '--name-only',
        f'--pretty=format:{pretty_format}'
    ]
    # The context managers guarantee the pipe is closed and the child is
    # reaped, even when the consumer stops iterating early.
    with subprocess.Popen(command, stdout=subprocess.PIPE) as proc:
        # errors='replace' keeps a single commit with legacy-encoded
        # metadata from aborting the whole extraction.
        with io.TextIOWrapper(proc.stdout, encoding='utf-8',
                              errors='replace') as stream:
            yield from stream
        returncode = proc.wait()
    # Surface git failures (unknown branch, bad --git-dir, ...) instead of
    # silently producing an empty result.
    if returncode:
        raise subprocess.CalledProcessError(returncode, command)
def parse_logs(logs, fields):
    """
    Flatten ``git log --name-only`` output into one record per changed file.

    Lines of the form ``<name>: <value>`` whose ``<name>`` is one of the
    requested fields update the current commit state. Every other non-blank
    line is treated as a file path and yields a copy of that state with the
    path stored under ``'file'``.

    Args:
        logs: Iterable of text lines (with or without line endings).
        fields: Iterable of ``(name, placeholder)`` pairs; only the names are
            used here, as the keys of every yielded record.

    Yields:
        dict: The field values of the commit being read plus ``'file'``.
    """
    state = {name: None for name, _ in fields}
    for raw_line in logs:
        # Drop the line ending so it does not leak into the 'file' value.
        text = raw_line.rstrip('\r\n')
        if not text.strip():
            # Blank separator between commits.
            continue
        # Split on the FIRST colon only, so values that themselves contain
        # ': ' (e.g. the subject "fix: typo") are kept intact.
        key, sep, value = text.partition(':')
        key = key.strip()
        if sep and key in state:
            state[key] = value.strip()
            continue
        # Anything else is a path reported by --name-only, including
        # root-level files and paths with characters such as a leading '.'.
        yield {**state, 'file': text}
def main():
    """
    Command-line entry point.

    Reads --git-dir, --branch and --output, runs the git log extraction for
    a fixed set of commit fields, and writes one JSON object per line to the
    output file.
    """
    cli = argparse.ArgumentParser(
        description='Utility to extract git logs into jsonl')
    # All three options are mandatory and differ only in name and help text.
    for flag, help_text in (
            ('--git-dir', 'Git directory to read logs from'),
            ('--branch', 'Git branch to read logs from'),
            ('--output', 'Target file to write transformed logs to')):
        cli.add_argument(flag, required=True, help=help_text)
    options = cli.parse_args()

    # (record key, git pretty-format placeholder) pairs.
    fields = [
        ('commit_hash', '%H'),
        ('commit_subject', '%s'),
        ('author_name', '%aN'),
        ('author_email', '%aE'),
        ('author_date', '%at'),
        ('commiter_email', '%ce'),
        ('commiter_name', '%cn'),
    ]

    raw_lines = get_logs(options.git_dir, fields, options.branch)
    records = parse_logs(raw_lines, fields)

    with open(options.output, mode='w', encoding='UTF-8') as sink:
        for record in records:
            sink.write(json.dumps(record) + '\n')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment