Created
March 13, 2024 20:52
-
-
Save harrytormey/4d3aa3906061b6b9ec149a43f98b2c3d to your computer and use it in GitHub Desktop.
How to iterate over the rows of a Parquet file downloaded from Hugging Face; the example uses the SWE-bench dataset.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Print Django-related entries from a local Parquet file.

For every row whose ``repo`` column mentions "Django" (case-insensitive),
the script prints the problem statement, the creation timestamp, and the
size of the patch in lines.
"""

import pandas as pd

# Path of the Parquet file to read, relative to the working directory.
PARQUET_PATH = "./test-00000-of-00001-dc7762b94638c186.parquet"


def count_patch_lines(patch):
    """Return the number of lines in *patch*.

    A trailing newline terminates the last line rather than starting a new
    one, so ``"a\\nb\\n"`` and ``"a\\nb"`` both count as two lines. An empty
    (or missing) patch has zero lines.
    """
    if not isinstance(patch, str) or not patch:
        return 0
    newline_count = patch.count("\n")
    # Only add one when the final line is not newline-terminated.
    return newline_count if patch.endswith("\n") else newline_count + 1


def select_repo_entries(df, needle="Django"):
    """Return the rows of *df* whose ``repo`` value contains *needle*.

    The match is case-insensitive. Rows with a missing ``repo`` value are
    treated as non-matching (``na=False``) instead of poisoning the mask.
    """
    mask = df["repo"].str.contains(needle, case=False, na=False)
    return df[mask]


def report_entries(entries):
    """Print problem statement, creation time and patch size for each row."""
    columns = zip(
        entries["problem_statement"],
        entries["created_at"],
        entries["patch"],
    )
    for problem_statement, created_at, patch in columns:
        print(problem_statement, "\n")

        created_at_datetime = pd.to_datetime(created_at)
        created_at_readable = created_at_datetime.strftime("%Y-%m-%d %H:%M:%S")
        print(f"Created At: {created_at_readable}")

        # Assess the size of the patch
        patch_lines = count_patch_lines(patch)
        print(f"Patch Size: {patch_lines} lines\n")


def main():
    """Load the Parquet file and report on its Django entries."""
    df = pd.read_parquet(PARQUET_PATH)

    # Filter the DataFrame for entries from the Django repo
    django_entries = select_repo_entries(df, "Django")

    if django_entries.empty:
        print("No entries from the Django repository were found.")
        return

    report_entries(django_entries)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment