Nathan Cooper (ncoop57)

🤓
I'm a nerd.
View GitHub Profile
import boto3

# List every object under the codepile group1 prefix and collect its s3a:// path.
s3 = boto3.resource("s3")
my_bucket = s3.Bucket("s-eai-neox")
file_paths = []
for my_bucket_object in my_bucket.objects.filter(Prefix="data/codepile/group1/"):
    # print(my_bucket_object.key)
    file_paths.append(f"s3a://s-eai-neox/{my_bucket_object.key}")
print(len(file_paths))

from spark_session_builder import build_spark_session

# Work on a 100-file slice of the listing.
file_paths = file_paths[100:200]
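
The preview ends here. A minimal sketch of the step that presumably follows: build the session with the imported helper and read the selected shards. The master URL and sizing arguments are copied from the minhash gist below, and reading s3a:// paths assumes the cluster is configured with the Hadoop S3A connector; none of this is shown in the gist itself.

# Sketch only: session arguments borrowed from the minhash gist below.
spark = build_spark_session("spark://cpu64-dy-c6i-16xlarge-1:7077", 32, 128)

# spark.read.parquet accepts multiple paths, so the sliced listing can be read directly.
df = spark.read.parquet(*file_paths)
print(df.count())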
ncoop57 / minhash_stackexchange.py (last active January 26, 2023)
PySpark MinHash
import os
import time

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, MinHashLSH
from pyspark.sql.functions import col
from spark_session_builder import build_spark_session

# Connect to the standalone cluster and load a 1M-row sample of the StackExchange split.
spark = build_spark_session("spark://cpu64-dy-c6i-16xlarge-1:7077", 32, 128)
db = spark.read.parquet("/fsx/shared/pilev2_parquet/StackExchange_ver4_non_local_dedupped/dataset.parquet").limit(1_000_000)  # Stage 0 & 1
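
The gist preview stops after loading the data, but the imports above outline the rest of the MinHash deduplication flow. A minimal sketch of how those stages could be chained; the column names ("text", "id"), the n-gram size, the feature dimension, the number of hash tables, and the 0.15 distance threshold are assumptions, not values taken from the gist.

# Sketch only: stage parameters and column names are assumed, not from the gist.
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern=r"\W+")
ngrams = NGram(n=5, inputCol="tokens", outputCol="ngrams")
hashing_tf = HashingTF(inputCol="ngrams", outputCol="features", numFeatures=1 << 20)
minhash = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=10)

# MinHashLSH requires at least one non-zero feature per row, so empty documents
# should be filtered out beforehand.
pipeline = Pipeline(stages=[tokenizer, ngrams, hashing_tf, minhash])
model = pipeline.fit(db)
hashed = model.transform(db)

# Approximate self-join: row pairs whose MinHash signatures fall under the distance
# threshold are near-duplicate candidates. Assumes the dataset has an "id" column.
lsh_model = model.stages[-1]
pairs = lsh_model.approxSimilarityJoin(hashed, hashed, 0.15, distCol="jaccard_dist")
pairs = pairs.filter(col("datasetA.id") < col("datasetB.id"))
print(pairs.count())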
use arrow::{
    file::{writer::FileWriter, write_all, Writer},
    record_batch::RecordBatch,
    util::hash::XXHash64,
};
use std::fs::File;

// Read the Arrow file at `input_path`, hash its text column, and write the result to `output_path`.
// (FileReader is assumed to come from the same crate; its import is not shown in this preview.)
fn hash_text_column(input_path: &str, output_path: &str) {
    let mut input_reader = FileReader::try_new(input_path).unwrap();
    let input_schema = input_reader.schema().clone();