Skip to content

Instantly share code, notes, and snippets.

@colin-ho
Created April 21, 2025 20:32
Show Gist options
  • Save colin-ho/1426f1033ec1eca4488c1a3d0de357ba to your computer and use it in GitHub Desktop.
Save colin-ho/1426f1033ec1eca4488c1a3d0de357ba to your computer and use it in GitHub Desktop.
# Configure Daft to use a Ray cluster: this selects the Ray runner and points
# it at the cluster address below.
# NOTE(review): "ray://my-cluster:10001" looks like a placeholder address —
# replace it with the address of the real cluster.
daft.set_runner_ray(address="ray://my-cluster:10001")
def extract_commits_from_logs(logs):
    """Extract commits from the text produced by ``git log``.

    NOTE(review): the implementation is elided (``...``) in this snippet.
    The caller passes the string returned by ``repo.git.log(...)`` and feeds
    the result to ``list.extend``, so this presumably returns an iterable of
    commit records — confirm against the real implementation.
    """
    ...
# Clone repos and extract commits in parallel
# NOTE: `return_type=...` is a placeholder left by the snippet's author; the
# real Daft data type of the returned values must be supplied there.
@daft.udf(return_type=...)
def clone_and_extract_commits(repo_urls):
    """Clone every repository in ``repo_urls`` and collect its parsed commits.

    Each repository is cloned into its own temporary directory, its full
    ``git log`` is rendered with explicit start/end markers per commit, and
    the log text is handed to ``extract_commits_from_logs``.

    Args:
        repo_urls: The repository URLs to process; iterated one URL at a time.

    Returns:
        A single flat list holding the commits of all repositories, in the
        order the repositories were processed.

    NOTE(review): because the per-repository results are flattened with
    ``extend``, the returned list does not have one entry per input URL.
    Confirm this matches the declared ``return_type`` and what Daft expects
    from a UDF used in ``with_column``.
    """
    parsed_logs = []
    for repo_url in repo_urls:
        with tempfile.TemporaryDirectory() as temp_dir:
            # Clone the repo. Only the history is needed, so skip checking
            # out a working tree.
            repo = git.Repo.clone_from(
                repo_url, to_path=temp_dir, multi_options=["--no-checkout"]
            )
            try:
                # Get git logs: hash, author name, author email, author date
                # and raw message between explicit markers, followed by the
                # per-file added/deleted line counts from --numstat.
                logs = repo.git.log(
                    "--pretty=format:---COMMIT START---%n%H%n%an%n%ae%n%ai%n%B%n---COMMIT END---",
                    "--date=iso",
                    "--numstat",
                )
            finally:
                # Release the git helper processes and file handles held by
                # the Repo object before the temporary directory is removed.
                repo.close()
            # Extract commits from logs
            parsed_logs.extend(extract_commits_from_logs(logs))
    return parsed_logs
# Apply the UDF to the "repo_url" column and store its result in a new
# "commits" column.
# NOTE(review): `df` is not defined in this snippet — presumably a DataFrame
# with a "repo_url" column created earlier; confirm.
df = df.with_column("commits", clone_and_extract_commits(df["repo_url"]))
# Write the resulting DataFrame out as Parquet under the "commits" path.
df.write_parquet("commits")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment