# matthiasa4/vertex_ai_search_tuning_file_checks.py

## vertex_ai_search_tuning_file_checks.py
import sys
import jsonlines
import pandas as pd


def jsonl_to_df(file_path):
    """Load a JSON Lines file into a DataFrame.

    Every object yielded by the ``jsonlines`` reader becomes one entry of the
    list handed to ``pd.DataFrame``.

    Args:
        file_path: Location of the JSONL file, passed to ``jsonlines.open``.

    Returns:
        A ``pd.DataFrame`` built from the parsed records, in file order.
    """
    with jsonlines.open(file_path) as reader:
        records = list(reader)
    return pd.DataFrame(records)


def prep_full_dataset(query_data: pd.DataFrame, corpus_data: pd.DataFrame,
                      scoring_data: pd.DataFrame) -> pd.DataFrame:
    """Build the corpus-driven table that the documentation checks run on.

    The corpus rows are left-joined with the scoring rows, and that result is
    left-joined with the query rows. Every corpus segment is therefore kept,
    whether or not a label refers to it.

    Args:
        query_data: Query rows; needs a 'query-id' column.
        corpus_data: Corpus rows; needs a 'corpus-id' column.
        scoring_data: Label rows; needs 'corpus-id-scoring' and
            'query-id-scoring' columns.

    Returns:
        The joined DataFrame. Segments without a label carry NaN in the
        scoring columns; labels whose query id is not in ``query_data`` carry
        NaN in the query columns. Label rows that point at a segment missing
        from ``corpus_data`` are dropped by the left join.
    """
    # Corpus is the left side: a segment does not need a label, but a label
    # is only meaningful when its segment exists in the corpus.
    corpus_scoring_merged = corpus_data.merge(scoring_data,
                                              left_on='corpus-id',
                                              right_on='corpus-id-scoring',
                                              how='left')

    # Attach the query columns to each labelled row; unlabelled segments and
    # labels with an unknown query id simply keep NaN there.
    return corpus_scoring_merged.merge(query_data,
                                       left_on='query-id-scoring',
                                       right_on='query-id',
                                       how='left')


def check_training_queries(query_data: pd.DataFrame) -> bool:
    """Check that enough training queries were provided.

    [Training queries](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)

    The documentation asks for at least 100 queries, so exactly 100 passes.

    Args:
        query_data: One row per training query.

    Returns:
        True when ``query_data`` has 100 rows or more.
    """
    return len(query_data) >= 100


def check_extractive_segments(full_dataset: pd.DataFrame) -> bool:
    """Check the extractive segment requirements and print each subcheck.

    [Extractive segments](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)

    The documentation asks for at least one extractive segment per query and
    at least 10,000 additional extractive segments (used as random negatives).

    Args:
        full_dataset: Joined table with 'query-id', 'corpus-id' and 'score'
            columns.

    Returns:
        True when both subchecks pass.

    NOTE(review): when ``full_dataset`` is the corpus-driven left join built
    by ``prep_full_dataset``, a query without any segment has no row at all,
    so the first subcheck cannot flag it - confirm whether the query file
    should be consulted here as well.
    """
    # groupby ignores NaN keys, so only rows joined to a query are counted.
    # With no such rows min() is NaN and the comparison is False.
    segments_per_query = full_dataset.groupby('query-id').size()
    extractive_segments_per_query = bool(segments_per_query.min() > 0)
    print(
        f"|___ Subcheck: At least one extractive segment per query: {get_result(extractive_segments_per_query)}"
    )

    # Count distinct segments (one segment may carry several zero-score
    # labels) and accept exactly 10,000, as "at least" requires.
    zero_score_rows = full_dataset['score'] == 0
    additional_segment_count = full_dataset.loc[zero_score_rows,
                                                'corpus-id'].nunique()
    additional_extractive_segments = additional_segment_count >= 10000
    print(
        f"|___ Subcheck: At least 10 000 additional extractive segments: {get_result(additional_extractive_segments)}"
    )

    return extractive_segments_per_query and additional_extractive_segments


def check_relevance_scores(full_dataset: pd.DataFrame) -> bool:
    """Check that enough relevant (positive) scores were provided.

    [Relevance scores](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)

    The documentation asks for at least 100 relevant scores, so exactly 100
    passes. Rows whose score is zero or missing are not counted.

    Args:
        full_dataset: Joined table with a 'score' column.

    Returns:
        True when 100 or more rows have a score greater than zero.
    """
    relevant_rows = int((full_dataset['score'] > 0).sum())
    return relevant_rows >= 100


def check_datasets(query_data: pd.DataFrame, corpus_data: pd.DataFrame,
                   scoring_data: pd.DataFrame) -> None:
    """Print how many ids fail to line up between the three input files.

    The three frames are outer-joined so that unmatched rows from either side
    survive with NaN in the other side's id column. Each figure is the number
    of distinct ids, so an id that appears in several label rows is reported
    once.

    Args:
        query_data: Query rows; needs a 'query-id' column.
        corpus_data: Corpus rows; needs a 'corpus-id' column.
        scoring_data: Label rows; needs 'corpus-id-scoring' and
            'query-id-scoring' columns.

    Returns:
        None. The four counts and a trailing blank line go to stdout.
    """
    corpus_scoring_merged = pd.merge(corpus_data,
                                     scoring_data,
                                     left_on='corpus-id',
                                     right_on='corpus-id-scoring',
                                     how='outer')

    full_dataset = pd.merge(corpus_scoring_merged,
                            query_data,
                            left_on='query-id-scoring',
                            right_on='query-id',
                            how='outer')

    # After an outer join, a NaN id column means "absent from that file".
    in_corpus = full_dataset['corpus-id'].notna()
    in_labels_as_segment = full_dataset['corpus-id-scoring'].notna()
    in_queries = full_dataset['query-id'].notna()
    in_labels_as_query = full_dataset['query-id-scoring'].notna()

    corpus_without_label = full_dataset.loc[
        in_corpus & ~in_labels_as_segment, 'corpus-id'].nunique()
    label_without_corpus = full_dataset.loc[
        ~in_corpus & in_labels_as_segment, 'corpus-id-scoring'].nunique()
    query_without_label = full_dataset.loc[
        in_queries & ~in_labels_as_query, 'query-id'].nunique()
    label_without_query = full_dataset.loc[
        ~in_queries & in_labels_as_query, 'query-id-scoring'].nunique()

    print(
        f"Number of segments in Corpus file that don't have a match in Scoring file: {corpus_without_label}"
    )
    print(
        f"Number of segments in Scoring file that don't have a match in Corpus file: {label_without_corpus}"
    )
    print(
        f"Number of queries in Query file that don't have a match in Scoring file: {query_without_label}"
    )
    print(
        f"Number of queries in Scoring file that don't have a match in Query file: {label_without_query}"
    )
    print()


def check_corpus_file(full_dataset: pd.DataFrame) -> bool:
    """Check the corpus file requirements and print each subcheck.

    The [corpus file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#corpus)
    should hold at least 100 segments that contain query answers and at
    least 10,000 random segments used as negatives.

    Args:
        full_dataset: Joined table with 'query-id', 'corpus-id' and 'score'
            columns.

    Returns:
        True when the answer-segment and random-segment subchecks both pass.
        The zero-score subcheck is printed for information only and does not
        influence the result.
    """
    # Rows joined to a query are treated as answer segments, the rest as
    # random segments. Distinct 'corpus-id' values are counted because one
    # segment can be labelled for several queries.
    linked_to_query = full_dataset['query-id'].notna()

    answer_segment_count = full_dataset.loc[linked_to_query,
                                            'corpus-id'].nunique()
    query_answers = answer_segment_count >= 100
    print(f"|___ Subcheck: At least 100 segments that contain query answers: {get_result(query_answers)}")

    random_segment_count = full_dataset.loc[~linked_to_query,
                                            'corpus-id'].nunique()
    random_segments = random_segment_count >= 10000
    print(f"|___ Subcheck: At least 10 000 random segments: {get_result(random_segments)}")

    zero_score_count = full_dataset.loc[full_dataset['score'] == 0.0,
                                        'corpus-id'].nunique()
    zero_score_segments = zero_score_count >= 10000
    print(f"|___ Subcheck: At least 10 000  segments with 0 as score: {get_result(zero_score_segments)}")

    return query_answers and random_segments


def check_query_file(full_dataset: pd.DataFrame) -> bool:
    """Check the query file requirements.

    The [query file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#training-files)
    should give every query one or more extractive segments and contain at
    least 100 positive match queries.

    Args:
        full_dataset: Joined table with 'query-id', 'query-id-scoring',
            'corpus-id' and 'score' columns.

    Returns:
        True when at least one query is joined to a segment and 100 or more
        distinct queries have a label with a score above zero.
    """
    # Only rows where both the query and the segment were found count.
    # With no such rows min() is NaN and the comparison is False.
    matched_rows = full_dataset[full_dataset['query-id'].notna()
                                & full_dataset['corpus-id'].notna()]
    segments_per_query = matched_rows.groupby('query-id-scoring').size()
    min_one_extractive_segment_per_query = bool(segments_per_query.min() > 0)

    # Count queries, not label rows: a query answered by three segments is
    # still a single positive match query.
    positive_rows = full_dataset['score'] > 0
    positive_query_count = full_dataset.loc[positive_rows,
                                            'query-id'].nunique()
    positive_match_queries = positive_query_count >= 100

    return min_one_extractive_segment_per_query and positive_match_queries


def check_training_labels(query_data: pd.DataFrame,
                          scoring_data: pd.DataFrame) -> bool:
    """Check the training labels against the query file and print subchecks.

    The [training labels file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#training)
    links each query to extractive segments and gives every pair a score.

    Requirements verified here:
    - The set of query ids in the labels equals the set of ids in the query
      data (so each query has at least one label line).
    - Every value in the 'score' column is an ``int`` and is not negative.

    Args:
        query_data: Query rows with a 'query-id' column.
        scoring_data: Label rows with 'query-id-scoring' and 'score' columns.

    Returns:
        True when both subchecks pass.
    """
    ids_in_labels = set(scoring_data['query-id-scoring'])
    ids_in_queries = set(query_data['query-id'])
    ids_match = ids_in_queries == ids_in_labels

    print(f"|___ Subcheck: Same ids in query and scoring data: {get_result(ids_match)}")

    # Stop at the first value that is not a non-negative int.
    scores_valid = True
    for value in scoring_data['score']:
        if not isinstance(value, int) or value < 0:
            scores_valid = False
            break
    print(
        f"|___ Subcheck: Column 'score' contains non-negative integer values: {get_result(scores_valid)}"
    )

    if not ids_match:
        return ids_match
    return scores_valid


def get_result(result: bool) -> str:
    """Render a check outcome as the status label used in the report.

    Args:
        result: Outcome of a check; evaluated for truthiness.

    Returns:
        "✅ met" for a truthy ``result``, otherwise "❌ not met".
    """
    return "✅ met" if result else "❌ not met"


def main():
    """Run every check on the files named on the command line.

    Expects exactly three arguments, in this order: corpus JSONL path, query
    JSONL path, scoring TSV path. With any other argument count a usage line
    is printed and nothing else happens.

    The report goes to stdout: first the cross-file id mismatches, then one
    summary line per documented requirement (each preceded by the subcheck
    lines its check prints).
    """
    if len(sys.argv) != 4:
        print(
            "Usage: python check.py <corpus_path> <query_path> <scoring_path>")
        return

    corpus_path, query_path, scoring_path = sys.argv[1:4]

    # Read the query file
    query_data = jsonl_to_df(query_path).rename(columns={
        '_id': 'query-id',
        'text': 'query-text'
    })

    # Read the corpus file
    corpus_data = jsonl_to_df(corpus_path).rename(columns={
        '_id': 'corpus-id',
        'text': 'corpus-text'
    })

    # Read the scoring file. The id columns are forced to str: otherwise
    # numeric-looking ids are parsed as integers and can no longer be merged
    # with the ids read from the JSONL files.
    # NOTE(review): assumes the JSONL '_id' values are strings - confirm.
    scoring_data = pd.read_csv(scoring_path,
                               sep='\t',
                               dtype={
                                   'query-id': str,
                                   'corpus-id': str
                               })
    scoring_data = scoring_data.rename(columns={
        'corpus-id': 'corpus-id-scoring',
        'query-id': 'query-id-scoring'
    })

    print("\nGeneral dataset checks\n----------------------")
    check_datasets(query_data, corpus_data, scoring_data)

    full_dataset = prep_full_dataset(query_data, corpus_data, scoring_data)

    print("\nDocumentation dataset checks\n----------------------------")
    # Each check runs lazily inside the loop so its subcheck lines are
    # printed directly above its own summary line.
    checks = (
        ("Training query", lambda: check_training_queries(query_data)),
        ("Extractive segment",
         lambda: check_extractive_segments(full_dataset)),
        ("Relevance score", lambda: check_relevance_scores(full_dataset)),
        ("Corpus file", lambda: check_corpus_file(full_dataset)),
        ("Query file", lambda: check_query_file(full_dataset)),
        ("Training labels",
         lambda: check_training_labels(query_data, scoring_data)),
    )
    for label, run_check in checks:
        print(f"{label} requirements met: {get_result(run_check())}")

# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
	import sys
	import jsonlines
	import pandas as pd


	def jsonl_to_df(file_path):
		"""Load a JSON Lines file into a DataFrame.

		Args:
			file_path: Location of the JSONL file, passed to ``jsonlines.open``.

		Returns:
			A ``pd.DataFrame`` built from the parsed records, in file order.
		"""
		with jsonlines.open(file_path) as reader:
			records = list(reader)
		return pd.DataFrame(records)


	def prep_full_dataset(query_data: pd.DataFrame, corpus_data: pd.DataFrame,
			scoring_data: pd.DataFrame) -> pd.DataFrame:
		"""Build the corpus-driven table that the documentation checks run on.

		The corpus rows are left-joined with the scoring rows, and that result
		is left-joined with the query rows, so every corpus segment is kept.

		Args:
			query_data: Query rows; needs a 'query-id' column.
			corpus_data: Corpus rows; needs a 'corpus-id' column.
			scoring_data: Label rows; needs 'corpus-id-scoring' and
				'query-id-scoring' columns.

		Returns:
			The joined DataFrame. Segments without a label carry NaN in the
			scoring columns; labels with an unknown query id carry NaN in the
			query columns.
		"""
		# Corpus is the left side: a segment does not need a label, but a
		# label is only meaningful when its segment exists in the corpus.
		corpus_scoring_merged = corpus_data.merge(scoring_data,
			left_on='corpus-id',
			right_on='corpus-id-scoring',
			how='left')

		# Attach the query columns to each labelled row.
		return corpus_scoring_merged.merge(query_data,
			left_on='query-id-scoring',
			right_on='query-id',
			how='left')


	def check_training_queries(query_data: pd.DataFrame) -> bool:
		"""Check that enough training queries were provided.

		[Training queries](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)

		The documentation asks for at least 100 queries, so exactly 100 passes.

		Args:
			query_data: One row per training query.

		Returns:
			True when ``query_data`` has 100 rows or more.
		"""
		return len(query_data) >= 100


	def check_extractive_segments(full_dataset: pd.DataFrame) -> bool:
		"""Check the extractive segment requirements and print each subcheck.

		[Extractive segments](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)

		The documentation asks for at least one extractive segment per query
		and at least 10,000 additional extractive segments.

		Args:
			full_dataset: Joined table with 'query-id', 'corpus-id' and 'score'
				columns.

		Returns:
			True when both subchecks pass.
		"""
		# groupby ignores NaN keys, so only rows joined to a query are counted.
		# With no such rows min() is NaN and the comparison is False.
		segments_per_query = full_dataset.groupby('query-id').size()
		extractive_segments_per_query = bool(segments_per_query.min() > 0)
		print(
			f"|___ Subcheck: At least one extractive segment per query: {get_result(extractive_segments_per_query)}"
		)

		# Count distinct segments and accept exactly 10,000 ("at least").
		zero_score_rows = full_dataset['score'] == 0
		additional_segment_count = full_dataset.loc[zero_score_rows,
			'corpus-id'].nunique()
		additional_extractive_segments = additional_segment_count >= 10000
		print(
			f"|___ Subcheck: At least 10 000 additional extractive segments: {get_result(additional_extractive_segments)}"
		)

		return extractive_segments_per_query and additional_extractive_segments


	def check_relevance_scores(full_dataset: pd.DataFrame) -> bool:
		"""Check that enough relevant (positive) scores were provided.

		[Relevance scores](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)

		The documentation asks for at least 100 relevant scores, so exactly
		100 passes. Rows whose score is zero or missing are not counted.

		Args:
			full_dataset: Joined table with a 'score' column.

		Returns:
			True when 100 or more rows have a score greater than zero.
		"""
		relevant_rows = int((full_dataset['score'] > 0).sum())
		return relevant_rows >= 100


	def check_datasets(query_data: pd.DataFrame, corpus_data: pd.DataFrame,
			scoring_data: pd.DataFrame) -> None:
		"""Print how many ids fail to line up between the three input files.

		The three frames are outer-joined so that unmatched rows from either
		side survive with NaN in the other side's id column. Each figure is
		the number of distinct ids.

		Args:
			query_data: Query rows; needs a 'query-id' column.
			corpus_data: Corpus rows; needs a 'corpus-id' column.
			scoring_data: Label rows; needs 'corpus-id-scoring' and
				'query-id-scoring' columns.

		Returns:
			None. The four counts and a trailing blank line go to stdout.
		"""
		corpus_scoring_merged = pd.merge(corpus_data,
			scoring_data,
			left_on='corpus-id',
			right_on='corpus-id-scoring',
			how='outer')

		full_dataset = pd.merge(corpus_scoring_merged,
			query_data,
			left_on='query-id-scoring',
			right_on='query-id',
			how='outer')

		# After an outer join, a NaN id column means "absent from that file".
		in_corpus = full_dataset['corpus-id'].notna()
		in_labels_as_segment = full_dataset['corpus-id-scoring'].notna()
		in_queries = full_dataset['query-id'].notna()
		in_labels_as_query = full_dataset['query-id-scoring'].notna()

		corpus_without_label = full_dataset.loc[
			in_corpus & ~in_labels_as_segment, 'corpus-id'].nunique()
		label_without_corpus = full_dataset.loc[
			~in_corpus & in_labels_as_segment, 'corpus-id-scoring'].nunique()
		query_without_label = full_dataset.loc[
			in_queries & ~in_labels_as_query, 'query-id'].nunique()
		label_without_query = full_dataset.loc[
			~in_queries & in_labels_as_query, 'query-id-scoring'].nunique()

		print(
			f"Number of segments in Corpus file that don't have a match in Scoring file: {corpus_without_label}"
		)
		print(
			f"Number of segments in Scoring file that don't have a match in Corpus file: {label_without_corpus}"
		)
		print(
			f"Number of queries in Query file that don't have a match in Scoring file: {query_without_label}"
		)
		print(
			f"Number of queries in Scoring file that don't have a match in Query file: {label_without_query}"
		)
		print()


	def check_corpus_file(full_dataset: pd.DataFrame) -> bool:
		"""Check the corpus file requirements and print each subcheck.

		The [corpus file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#corpus)
		should hold at least 100 segments that contain query answers and at
		least 10,000 random segments used as negatives.

		Args:
			full_dataset: Joined table with 'query-id', 'corpus-id' and 'score'
				columns.

		Returns:
			True when the answer-segment and random-segment subchecks both
			pass. The zero-score subcheck is printed for information only.
		"""
		# Rows joined to a query are treated as answer segments, the rest as
		# random segments; distinct 'corpus-id' values are counted.
		linked_to_query = full_dataset['query-id'].notna()

		answer_segment_count = full_dataset.loc[linked_to_query,
			'corpus-id'].nunique()
		query_answers = answer_segment_count >= 100
		print(f"|___ Subcheck: At least 100 segments that contain query answers: {get_result(query_answers)}")

		random_segment_count = full_dataset.loc[~linked_to_query,
			'corpus-id'].nunique()
		random_segments = random_segment_count >= 10000
		print(f"|___ Subcheck: At least 10 000 random segments: {get_result(random_segments)}")

		zero_score_count = full_dataset.loc[full_dataset['score'] == 0.0,
			'corpus-id'].nunique()
		zero_score_segments = zero_score_count >= 10000
		print(f"|___ Subcheck: At least 10 000 segments with 0 as score: {get_result(zero_score_segments)}")

		return query_answers and random_segments


	def check_query_file(full_dataset: pd.DataFrame) -> bool:
		"""Check the query file requirements.

		The [query file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#training-files)
		should give every query one or more extractive segments and contain
		at least 100 positive match queries.

		Args:
			full_dataset: Joined table with 'query-id', 'query-id-scoring',
				'corpus-id' and 'score' columns.

		Returns:
			True when at least one query is joined to a segment and 100 or
			more distinct queries have a label with a score above zero.
		"""
		# Only rows where both the query and the segment were found count.
		# With no such rows min() is NaN and the comparison is False.
		matched_rows = full_dataset[full_dataset['query-id'].notna()
			& full_dataset['corpus-id'].notna()]
		segments_per_query = matched_rows.groupby('query-id-scoring').size()
		min_one_extractive_segment_per_query = bool(
			segments_per_query.min() > 0)

		# Count queries, not label rows.
		positive_rows = full_dataset['score'] > 0
		positive_query_count = full_dataset.loc[positive_rows,
			'query-id'].nunique()
		positive_match_queries = positive_query_count >= 100

		return min_one_extractive_segment_per_query and positive_match_queries


	def check_training_labels(query_data: pd.DataFrame,
			scoring_data: pd.DataFrame) -> bool:
		"""Check the training labels against the query file and print subchecks.

		The [training labels file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#training)
		links each query to extractive segments and gives every pair a score.

		Requirements verified here:
		- The set of query ids in the labels equals the set of ids in the
		  query data (so each query has at least one label line).
		- Every value in the 'score' column is an ``int`` and is not negative.

		Args:
			query_data: Query rows with a 'query-id' column.
			scoring_data: Label rows with 'query-id-scoring' and 'score'
				columns.

		Returns:
			True when both subchecks pass.
		"""
		query_ids = set(query_data['query-id'])
		scoring_ids = set(scoring_data['query-id-scoring'])
		same_items = query_ids == scoring_ids

		print(f"|___ Subcheck: Same ids in query and scoring data: {get_result(same_items)}")

		score_values = scoring_data['score']
		non_negative_integers = all(
			isinstance(score, int) and score >= 0 for score in score_values)
		print(
			f"|___ Subcheck: Column 'score' contains non-negative integer values: {get_result(non_negative_integers)}"
		)

		return same_items and non_negative_integers


	def get_result(result: bool) -> str:
		"""Render a check outcome as the status label used in the report.

		Args:
			result: Outcome of a check; evaluated for truthiness.

		Returns:
			"✅ met" for a truthy ``result``, otherwise "❌ not met".
		"""
		return "✅ met" if result else "❌ not met"


	def main():
		"""Run every check on the files named on the command line.

		Expects exactly three arguments, in this order: corpus JSONL path,
		query JSONL path, scoring TSV path. With any other argument count a
		usage line is printed and nothing else happens.

		The report goes to stdout: first the cross-file id mismatches, then
		one summary line per documented requirement.
		"""
		if len(sys.argv) != 4:
			print(
				"Usage: python check.py <corpus_path> <query_path> <scoring_path>")
			return

		corpus_path, query_path, scoring_path = sys.argv[1:4]

		# Read the query file
		query_data = jsonl_to_df(query_path).rename(columns={
			'_id': 'query-id',
			'text': 'query-text'
		})

		# Read the corpus file
		corpus_data = jsonl_to_df(corpus_path).rename(columns={
			'_id': 'corpus-id',
			'text': 'corpus-text'
		})

		# Read the scoring file. The id columns are forced to str: otherwise
		# numeric-looking ids are parsed as integers and can no longer be
		# merged with the ids read from the JSONL files.
		# NOTE(review): assumes the JSONL '_id' values are strings - confirm.
		scoring_data = pd.read_csv(scoring_path,
			sep='\t',
			dtype={
				'query-id': str,
				'corpus-id': str
			})
		scoring_data = scoring_data.rename(columns={
			'corpus-id': 'corpus-id-scoring',
			'query-id': 'query-id-scoring'
		})

		print("\nGeneral dataset checks\n----------------------")
		check_datasets(query_data, corpus_data, scoring_data)

		full_dataset = prep_full_dataset(query_data, corpus_data, scoring_data)

		print("\nDocumentation dataset checks\n----------------------------")
		# Each check runs lazily inside the loop so its subcheck lines are
		# printed directly above its own summary line.
		checks = (
			("Training query", lambda: check_training_queries(query_data)),
			("Extractive segment",
				lambda: check_extractive_segments(full_dataset)),
			("Relevance score", lambda: check_relevance_scores(full_dataset)),
			("Corpus file", lambda: check_corpus_file(full_dataset)),
			("Query file", lambda: check_query_file(full_dataset)),
			("Training labels",
				lambda: check_training_labels(query_data, scoring_data)),
		)
		for label, run_check in checks:
			print(f"{label} requirements met: {get_result(run_check())}")

	# Run the checks only when this file is executed as a script.
	if __name__ == "__main__":
		main()