Skip to content

Instantly share code, notes, and snippets.

@wesslen
Created February 15, 2024 18:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wesslen/cd96ccaa3574b388c8b17db02035f7ce to your computer and use it in GitHub Desktop.
Save wesslen/cd96ccaa3574b388c8b17db02035f7ce to your computer and use it in GitHub Desktop.
Check for duplicate inputs (text) in .jsonl files
import json
import logging
import typer
from pathlib import Path
# NOTE(review): List is imported but never used in this script — candidate for removal.
from typing import List
# Prodigy's set_hashes adds "_input_hash"/"_task_hash" keys to a task dict.
from prodigy import set_hashes
# Log bare messages (no timestamp/level prefix) at INFO and above.
logging.basicConfig(format='%(message)s', level=logging.INFO)
def process_file(file_path: Path, all_hashes: dict) -> dict:
    """Hash every record in one .jsonl file and tally duplicate inputs.

    Mutates *all_hashes*, a mapping of Prodigy input hash ->
    {"text": ..., "meta": {"file": [paths]}} shared across files: a
    first-seen hash is inserted, and a repeated hash gets this file's
    path appended to its meta file list.

    Returns a stats dict with:
        raw_count      -- number of records read from this file
        unique_count   -- distinct input hashes within this file
        dup_count      -- records whose hash was already seen anywhere
                          (including earlier in this same file)
        self_dup_count -- duplicates occurring within this file only
    """
    dup_count = 0
    raw_count = 0
    file_hashes = set()
    with file_path.open('r', encoding='utf-8') as f:
        # Stream line-by-line instead of materializing the whole file.
        for line in f:
            if not line.strip():
                continue  # tolerate blank/trailing lines in the .jsonl
            record = set_hashes(json.loads(line))
            input_hash = record["_input_hash"]
            raw_count += 1
            if input_hash not in all_hashes:
                all_hashes[input_hash] = {
                    "text": record["text"],
                    "meta": {"file": [str(file_path)]},
                }
            else:
                all_hashes[input_hash]["meta"]["file"].append(str(file_path))
                dup_count += 1
            file_hashes.add(input_hash)
    # Within-file duplicates = raw records minus distinct hashes in this file.
    self_dup_count = raw_count - len(file_hashes)
    return {
        "raw_count": raw_count,
        "unique_count": len(file_hashes),
        "dup_count": dup_count,
        "self_dup_count": self_dup_count,
    }
def main(directory: Path):
    """Find duplicate inputs across all .jsonl files in *directory*.

    Logs per-file duplicate statistics and writes every record that
    appears more than once (in the same file or across files) to
    <directory>/duplicates.jsonl.
    """
    typer.echo(f"Processing files in directory: {directory}")
    typer.echo(" ")
    all_hashes = {}
    # Sort the glob: filesystem iteration order is unspecified, and the
    # file a duplicate is first attributed to depends on processing
    # order — sorting makes stats and output reproducible across runs.
    for file_path in sorted(directory.glob('*.jsonl')):
        stats = process_file(file_path, all_hashes)
        logging.info(
            f"{file_path.name} includes {stats['raw_count']} raw documents, "
            f"{stats['unique_count']} unique documents, "
            f"{stats['self_dup_count']} were duplicates found within this file "
            f"and {stats['dup_count'] - stats['self_dup_count']} found to be duplicated in other files"
        )
        # NOTE(review): original indentation was lost; this spacer most
        # plausibly sits inside the loop (one blank line per file) — confirm.
        logging.info(" ")
    # Keep only records seen in more than one file path entry.
    output_data = [
        record
        for record in all_hashes.values()
        if len(record["meta"]["file"]) > 1
    ]
    output_path = directory / 'duplicates.jsonl'
    with output_path.open('w', encoding='utf-8') as f:
        for record in output_data:
            f.write(json.dumps(record) + '\n')
    typer.echo(f"Completed processing. Duplicates written to {output_path}")


if __name__ == "__main__":
    typer.run(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment