Nathan Cooper (ncoop57)

🤓
I'm a nerd.
View GitHub Profile
import boto3

# List every object under the codepile group1 prefix and collect its s3a:// path.
s3 = boto3.resource("s3")
my_bucket = s3.Bucket("s-eai-neox")
file_paths = []
for my_bucket_object in my_bucket.objects.filter(Prefix="data/codepile/group1/"):
    # print(my_bucket_object.key)
    file_paths.append(f"s3a://s-eai-neox/{my_bucket_object.key}")
print(len(file_paths))

from spark_session_builder import build_spark_session

# Work on a 100-file slice of the listing.
file_paths = file_paths[100:200]
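
The preview ends here. A minimal sketch of the step that presumably follows: build the session with the imported helper and read the selected shards. The master URL and sizing arguments are copied from the minhash gist below, and reading s3a:// paths assumes the cluster is configured with the Hadoop S3A connector; none of this is shown in the gist itself.

# Sketch only: session arguments borrowed from the minhash gist below.
spark = build_spark_session("spark://cpu64-dy-c6i-16xlarge-1:7077", 32, 128)

# spark.read.parquet accepts multiple paths, so the sliced listing can be read directly.
df = spark.read.parquet(*file_paths)
print(df.count())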
ncoop57 / minhash_stackexchange.py (last active January 26, 2023)
PySpark MinHash
import os
import time

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, MinHashLSH
from pyspark.sql.functions import col
from spark_session_builder import build_spark_session

# Connect to the standalone cluster and load a 1M-row sample of the StackExchange split.
spark = build_spark_session("spark://cpu64-dy-c6i-16xlarge-1:7077", 32, 128)
db = spark.read.parquet("/fsx/shared/pilev2_parquet/StackExchange_ver4_non_local_dedupped/dataset.parquet").limit(1_000_000)  # Stage 0 & 1
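
The gist preview stops after loading the data, but the imports above outline the rest of the MinHash deduplication flow. A minimal sketch of how those stages could be chained; the column names ("text", "id"), the n-gram size, the feature dimension, the number of hash tables, and the 0.15 distance threshold are assumptions, not values taken from the gist.

# Sketch only: stage parameters and column names are assumed, not from the gist.
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern=r"\W+")
ngrams = NGram(n=5, inputCol="tokens", outputCol="ngrams")
hashing_tf = HashingTF(inputCol="ngrams", outputCol="features", numFeatures=1 << 20)
minhash = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=10)

# MinHashLSH requires at least one non-zero feature per row, so empty documents
# should be filtered out beforehand.
pipeline = Pipeline(stages=[tokenizer, ngrams, hashing_tf, minhash])
model = pipeline.fit(db)
hashed = model.transform(db)

# Approximate self-join: row pairs whose MinHash signatures fall under the distance
# threshold are near-duplicate candidates. Assumes the dataset has an "id" column.
lsh_model = model.stages[-1]
pairs = lsh_model.approxSimilarityJoin(hashed, hashed, 0.15, distCol="jaccard_dist")
pairs = pairs.filter(col("datasetA.id") < col("datasetB.id"))
print(pairs.count())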
use arrow::{
    file::{writer::FileWriter, write_all, Writer},
    record_batch::RecordBatch,
    util::hash::XXHash64,
};
use std::fs::File;

// Read the Arrow file at `input_path`, hash its text column, and write the result to `output_path`.
// (FileReader is assumed to come from the same crate; its import is not shown in this preview.)
fn hash_text_column(input_path: &str, output_path: &str) {
    let mut input_reader = FileReader::try_new(input_path).unwrap();
    let input_schema = input_reader.schema().clone();