@D4Vinci
Created June 15, 2022 20:01
Using pandas to convert a `.json-lines` file to `.csv` quickly, at the cost of more memory
import pandas
#
## Settings ##
input_file = "13441370_meta.json-lines"  # Must be a .json-lines file
output_file = "All_reviews.csv"  # Must be a .csv file
new_columns = (  # The output file's columns
    'product_name',
    'product_brand',
    'product_category',
    'product_code',
    'product_url',
    'product_pros',
    'product_cons',
    'average_build_quality',
    'average_ease_of_use',
    'average_features',
    'average_rating',
    'average_value_for_money',
    'build_quality_score',
    'ease_of_use_score',
    'features_score',
    'number_of_reviews',
    'purchase_date',
    'review_date',
    'review_score',
    'reviewer_location',
    'reviewer_name',
    'value_for_money_score',
)
##
#
print(f"[+] Reading input file {input_file}...")
df = pandas.read_json(input_file, lines=True)
print(f"[+] Found {len(df)} lines")
column_number = 0
new_df = pandas.DataFrame([], columns=new_columns)
print("[+] Starting...")
# Here we map columns with different names
for new_column, old_column in [("product_name", "name"), ("product_code", "identifier"), ("product_brand", "brand"), ("product_url", "url")]:
    new_df[new_column] = df.apply(lambda row: row[old_column], axis=1)
    column_number += 1
    print(f"[~] Column {column_number}/{len(new_columns)} done\r", end="", flush=True)
# Here's a column whose value needs transforming first (category list -> "a>b>c" string)
new_df["product_category"] = df.apply(lambda row: ">".join(row["category"]), axis=1)
column_number += 1
print(f"[~] Column {column_number}/{len(new_columns)} done\r", end="", flush=True)
# Mapping columns from metadata with some checks on their values
for col in ("average_rating", "number_of_reviews", "average_ease_of_use", "average_features", "average_build_quality", "average_value_for_money", "review_date", "purchase_date", "review_score", "reviewer_name", "reviewer_location", "product_pros", "product_cons", "ease_of_use_score", "features_score", "build_quality_score", "value_for_money_score"):
    new_df[col] = df.apply(lambda row: row["metadata"][col] if row["metadata"][col] and row["metadata"][col] not in ("None", "none") else "", axis=1)
    column_number += 1
    print(f"[~] Column {column_number}/{len(new_columns)} done\r", end="", flush=True)
#
with open(output_file, 'w', encoding="utf-8") as f:
    # NB: pandas 1.5 renamed `line_terminator` to `lineterminator` (use the old name on older pandas);
    # the encoding is already handled by open() above
    new_df.to_csv(f, chunksize=100000, index=False, header=True, lineterminator='\n', escapechar='\\')
print(f"\n[+] Saved new file to {output_file}")
D4Vinci commented Jun 15, 2022

To use less memory, or to run the script on a low-spec machine, you would need to read the input file in chunks and write the output in chunks as well, but that takes a lot more time depending on the chunk size. I can create that version if anyone wants it.
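A minimal sketch of that chunked variant, assuming the same input shape as the script above (the chunk size is arbitrary, and only a couple of the column mappings are repeated here):

```python
import pandas

input_file = "13441370_meta.json-lines"
output_file = "All_reviews.csv"
chunk_size = 10_000  # arbitrary; tune to your machine's RAM

# read_json with chunksize returns an iterator of DataFrames
# instead of loading the whole file at once (pandas >= 1.2)
with pandas.read_json(input_file, lines=True, chunksize=chunk_size) as reader:
    for i, chunk in enumerate(reader):
        out = pandas.DataFrame()
        # Same renaming step as the full-memory script, per chunk
        for new_column, old_column in [("product_name", "name"), ("product_code", "identifier"), ("product_brand", "brand"), ("product_url", "url")]:
            out[new_column] = chunk[old_column]
        out["product_category"] = chunk["category"].apply(">".join)
        # ...the metadata columns would be mapped the same way as above...
        # Write the header once, then append every following chunk
        out.to_csv(output_file, mode="w" if i == 0 else "a", header=(i == 0), index=False)
```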

D4Vinci commented Jun 15, 2022

pandas `apply` is also the fastest option here; check this: https://realpython.com/fast-flexible-pandas/
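If you want to check that on your own data, here is a rough `timeit` comparison of `apply` against a plain `iterrows()` loop (the DataFrame below is made up for illustration):

```python
import timeit
import pandas

# Hypothetical frame standing in for the real data, just for timing
df = pandas.DataFrame({"category": [["a", "b", "c"]] * 50_000})

def with_apply():
    return df.apply(lambda row: ">".join(row["category"]), axis=1)

def with_iterrows():
    return [">".join(row["category"]) for _, row in df.iterrows()]

print("apply:   ", timeit.timeit(with_apply, number=5))
print("iterrows:", timeit.timeit(with_iterrows, number=5))
```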
