This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3

# Enumerate every object under the codepile group1 prefix in the
# s-eai-neox bucket and record it as an s3a:// URI so Spark can read it.
s3 = boto3.resource("s3")
my_bucket = s3.Bucket("s-eai-neox")

file_paths = []
for my_bucket_object in my_bucket.objects.filter(Prefix="data/codepile/group1/"):
    file_paths.append(f"s3a://s-eai-neox/{my_bucket_object.key}")
print(len(file_paths))

from spark_session_builder import build_spark_session

# Process only a 100-file slice of the listing for this run.
# NOTE(review): slice bounds [100:200] look like a manual sharding choice —
# confirm against how other shards are launched.
file_paths = file_paths[100:200]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time
import os
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, MinHashLSH
from pyspark.sql.functions import col
from spark_session_builder import build_spark_session

# Connect to the standalone Spark master with 32 cores / 128 GB
# (argument meaning assumed from call site — TODO confirm against
# spark_session_builder's signature).
spark = build_spark_session("spark://cpu64-dy-c6i-16xlarge-1:7077", 32, 128)

# Stage 0 & 1: load the non-locally-deduplicated StackExchange parquet,
# capped at 1M rows for this pass.
db = spark.read.parquet(
    "/fsx/shared/pilev2_parquet/StackExchange_ver4_non_local_dedupped/dataset.parquet"
).limit(1_000_000)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use arrow::{ | |
file::{writer::FileWriter, write_all, Writer}, | |
record_batch::RecordBatch, | |
util::hash::XXHash64, | |
}; | |
use std::fs::File; | |
fn hash_text_column(input_path: &str, output_path: &str) { | |
let mut input_reader = FileReader::try_new(input_path).unwrap(); | |
let input_schema = input_reader.schema().clone(); |