Pierce Lamb (piercelamb) / GitHub Gists
piercelamb / copy_to_s3.py (Created December 19, 2022 20:28)
from typing import Callable, List, NamedTuple, Optional

# Bucket is the boto3 S3 Bucket resource type (e.g. as provided by the boto3-stubs package).
def copy_to_s3(
    bucket: Bucket,
    files_to_copy: List[str],
    row: NamedTuple,
    s3_artifact_path: str,
    process_func: Optional[Callable[[str, List, str, bool], None]] = None,
    reload: bool = False
):
    # Each row carries the GUID that identifies where its files live.
    guid = row.file_location
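The preview cuts off before the copy itself happens. As a point of reference, here is a minimal sketch of uploading a list of local files under an S3 prefix with the boto3 Bucket resource; the helper name, bucket name, and key layout are placeholders for illustration, not the gist's actual logic.

import os

import boto3

def upload_files(bucket, files_to_copy, s3_prefix):
    # Upload each local file under the given prefix, keyed by its basename.
    for local_path in files_to_copy:
        key = f"{s3_prefix}/{os.path.basename(local_path)}"
        bucket.upload_file(Filename=local_path, Key=key)

# Example usage (placeholder bucket name and paths):
# bucket = boto3.resource("s3").Bucket("my-example-bucket")
# upload_files(bucket, ["/tmp/a.json", "/tmp/b.json"], "raw_training_data/guid-123")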
piercelamb / copy_s3_data_in_parallel.py (Created December 19, 2022 20:25)
from typing import Callable, Dict, List, Optional

import numpy as np
import pandas as pd

def copy_s3_data_in_parallel(
    df: pd.DataFrame,
    bucket: str,
    raw_training_data_paths: Dict[str, List[str]],
    s3_artifact_path: str,
    num_processes: int,
    process_func: Optional[Callable[[str, List, str, bool], None]] = None,
    reload: bool = False):
    # Record which artifacts already exist under the target S3 prefix so they can be skipped.
    existing_artifacts_state = get_raw_data_paths(bucket, path_to_filter_for=s3_artifact_path)
    # Split the DataFrame into one chunk per worker process.
    split_df = np.array_split(df, num_processes)
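The preview stops right after the DataFrame is split. A minimal sketch of how chunks like these could be fanned out across worker processes with multiprocessing.Pool follows; copy_chunk and its arguments are hypothetical stand-ins for the gist's real per-chunk worker, not its actual implementation.

from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd

def copy_chunk(chunk: pd.DataFrame, bucket_name: str) -> int:
    # Hypothetical worker: in the real code each row's files would be copied to S3 here.
    copied = 0
    for row in chunk.itertuples(index=False):
        copied += 1  # placeholder for the per-row copy
    return copied

def fan_out(df: pd.DataFrame, bucket_name: str, num_processes: int) -> int:
    chunks = np.array_split(df, num_processes)
    with Pool(processes=num_processes) as pool:
        results = pool.map(partial(copy_chunk, bucket_name=bucket_name), chunks)
    return sum(results)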
piercelamb / inference_folder_structure (Created December 20, 2022 18:04)
EXP-3333-longformer/
    data/
        reconciled_artifacts/
            <all raw training data>
        prepared_data/
            <all encoded training data>
    run_1/
        tuning/ (or training/)
            <all files emitted by training>
        inference/
piercelamb / process_statistics.py (Created December 20, 2022 18:03)
def process_statistics(model_containers, config):
    for model_name, model_data in model_containers.items():
        run_statistics = {}
        for label, counts in model_data['stats'].items():
            # Precision = TP / (TP + FP); recall = TP / (TP + FN).
            recall_denom = counts['false_negatives'] + counts['true_positives']
            precision_denom = counts['false_positives'] + counts['true_positives']
            if (recall_denom > 0) and (precision_denom > 0):
                precision = counts['true_positives'] / precision_denom
                recall = counts['true_positives'] / recall_denom
                if precision + recall > 0:
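The preview truncates just before the score is computed. For reference, a self-contained sketch of the standard per-label F1 from the same counters the gist tracks; the helper name is mine, and returning 0.0 when the score is undefined is an assumption about how the gist handles empty denominators.

def per_label_f1(counts: dict) -> float:
    # F1 is the harmonic mean of precision and recall; return 0.0 when it is undefined.
    tp = counts['true_positives']
    precision_denom = tp + counts['false_positives']
    recall_denom = tp + counts['false_negatives']
    if precision_denom == 0 or recall_denom == 0:
        return 0.0
    precision = tp / precision_denom
    recall = tp / recall_denom
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)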
piercelamb / get_multi_class_statistics.py (Created December 20, 2022 18:02)
def get_multi_class_stats(statistics_counts, predicted_label_id, actual_label_id, id2label):
    actual_label = id2label[actual_label_id]
    statistics_counts[actual_label]['total'] += 1
    predicted_label = id2label[predicted_label_id]
    if predicted_label == actual_label:
        statistics_counts[actual_label]['true_positives'] += 1
    else:
        # Wrong prediction: it counts as a false positive for the predicted label
        # and as a false negative for the actual label.
        statistics_counts[predicted_label]['false_positives'] += 1
        statistics_counts[actual_label]['false_negatives'] += 1  # implied by the comment above; cut off in the preview
piercelamb / get_statistics.py (Created December 20, 2022 18:01)
predicted_label_id = str(result[0])
if config.is_comparison:
    predictions[model_name] = id2label[predicted_label_id]
print(f"{model_name} Predicted Label: {str(id2label[predicted_label_id])}")
f1_metric = model_data['metrics']['f1']
acc_metric = model_data['metrics']['acc']
# Accumulate this prediction/reference pair into the Hugging Face evaluate metrics.
f1_metric.add_batch(
    predictions=[int(predicted_label_id)],
    references=[int(actual_label_id)]
)
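The preview ends before the accuracy metric is updated and the scores are finalized. Below is a minimal, self-contained sketch of how evaluate metrics accumulated with add_batch are typically resolved with compute; the placeholder prediction pairs and the average="weighted" setting for multi-class F1 are assumptions, not necessarily what the gist does.

import evaluate

f1_metric = evaluate.load("f1")
acc_metric = evaluate.load("accuracy")

# Accumulate one (prediction, reference) pair per test artifact.
for predicted_id, actual_id in [(1, 1), (2, 0)]:  # placeholder pairs
    f1_metric.add_batch(predictions=[predicted_id], references=[actual_id])
    acc_metric.add_batch(predictions=[predicted_id], references=[actual_id])

# Resolve the accumulated batches into final scores.
f1_score = f1_metric.compute(average="weighted")  # averaging mode is an assumption
accuracy = acc_metric.compute()
print(f1_score, accuracy)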
piercelamb / collect_inference.py (Created December 20, 2022 17:59)
if config.is_comparison:
    comparison_stats = init_comparison_stats(id2label, config)
for i, raw_instance in enumerate(test_data):
    print("Testing Artifact: " + str(i + 1))
    actual_label_id = str(raw_instance['labels'].item())
    ground_truth_label = id2label[actual_label_id]
    print("\n--------------------------------------------")
    print("Ground Truth Label: " + ground_truth_label)
    if config.is_comparison:
        # When comparing models, track how many test artifacts fall under each ground-truth label.
        comparison_stats[ground_truth_label]['count'] += 1
piercelamb / test_data.py (Created December 20, 2022 17:58)
from datasets import load_from_disk

# Load the encoded test split from local disk and have it return PyTorch tensors.
test_data = load_from_disk(SAGEMAKER_LOCAL_INFERENCE_DATA_DIR)
print(f"Running inference over {test_data.num_rows} samples")
test_data.set_format(type="torch")
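For context, here is a minimal sketch of running a classification model over a torch-formatted datasets object like this one; the data path, the checkpoint path, and the assumption that each record carries input_ids, attention_mask, and labels are illustrative only, not the gist's actual inference code.

import torch
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification

dataset = load_from_disk("path/to/encoded/test/data")  # hypothetical path
dataset.set_format(type="torch")

model = AutoModelForSequenceClassification.from_pretrained("path/to/checkpoint")  # hypothetical checkpoint
model.eval()

with torch.no_grad():
    for instance in dataset:
        outputs = model(
            input_ids=instance["input_ids"].unsqueeze(0),
            attention_mask=instance["attention_mask"].unsqueeze(0),
        )
        predicted_id = outputs.logits.argmax(dim=-1).item()
        actual_id = instance["labels"].item()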
piercelamb / init_metrics.py (Created December 20, 2022 17:57)
import evaluate

def init_metrics():
    return evaluate.load("f1"), evaluate.load("accuracy")
piercelamb / init_model_stats.py (Created December 20, 2022 17:56)
def init_model_stats(id2label):
    return {
        label: {
            'total': 0,
            'true_positives': 0,
            'false_positives': 0,
            'false_negatives': 0,
            'accuracy': 0.0,
            'f1': 0.0
        }
        # The end of the comprehension is cut off in the preview; it presumably iterates the label names.
        for label in id2label.values()
    }
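Taken together, a small usage sketch of how these counters might be exercised with the functions from the gists above; the toy id2label mapping and its label names are made up for illustration.

# Toy mapping from label ids to label names (illustrative only).
id2label = {'0': 'invoice', '1': 'receipt', '2': 'contract'}

stats = init_model_stats(id2label)

# Simulate two predictions: one correct, one wrong.
get_multi_class_stats(stats, '1', '1', id2label)
get_multi_class_stats(stats, '0', '2', id2label)

print(stats['receipt']['true_positives'])    # 1
print(stats['contract']['false_negatives'])  # 1
print(stats['invoice']['false_positives'])   # 1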