Skip to content

Instantly share code, notes, and snippets.

Last active August 10, 2021 19:46
Show Gist options
  • Save rom1504/0b846b2dc64c5e0604e1d532c09cbff6 to your computer and use it in GitHub Desktop.
Save rom1504/0b846b2dc64c5e0604e1d532c09cbff6 to your computer and use it in GitHub Desktop.
Compute some stats on cah collection
First get the csv files; the script expects them under /media/hd/cah/drive and /media/hd/cah/theeye/output/cah.
Then pip install pyspark
Then run this file. It also takes a few minutes
The main thing this script is doing is adding/removing/reordering csv columns and converting to fewer parquet files
The end result is easy to use in spark, pandas or anything else
from glob import glob
from multiprocessing import Pool
from collections import defaultdict
from pathlib import Path
def f(w):
    """Return the column names from the header line of a pipe-delimited csv.

    Args:
        w: Path of the csv file to inspect.

    Returns:
        The first line of the file, stripped of trailing whitespace and split
        on "|". An empty file yields [""].
    """
    # Only the header is needed, so a single line is read; the context
    # manager closes the handle instead of leaking one per inspected file.
    with open(w, "r") as header_file:
        return header_file.readline().rstrip().split("|")
def main():
    """Merge all cah csv files into parquet and print basic counts.

    Steps:
      1. List every csv file under the two input directories and read each
         file's header in a process pool.
      2. Group the files by header, because the csv schema changed over time.
      3. Read each group with Spark, keep the reference columns that exist,
         add the missing ones as empty strings, and put the columns in the
         reference order so the groups can be unioned by position.
      4. Cast the numeric columns, write 16 parquet files to "cah_dataframe",
         re-read them and print the row count.
      5. Drop duplicate (URL, TEXT) pairs, write them to
         "cah_dataframe_unique", re-read them and print the row count.

    Raises:
        FileNotFoundError: If no csv file is found in the input directories.
    """
    csv_roots = ['/media/hd/cah/drive', '/media/hd/cah/theeye/output/cah']

    # necessary because the schema changed
    print("Retrieving columns of all csv files")
    fs = [str(x) for root in csv_roots for x in Path(root).glob("**/*.csv")]
    if not fs:
        # Fail early with a clear message instead of an AttributeError on
        # None once the (empty) union is cast further down.
        raise FileNotFoundError("No csv files found under " + " or ".join(csv_roots))
    # The context manager releases the worker processes once the headers
    # have been collected.
    with Pool(128) as p:
        headers = p.map(f, fs)

    print("Grouping files by columns")
    d = defaultdict(list)
    for cols, path in zip(headers, fs):
        # The key is the comma-joined header; it is split back into column
        # names when the groups are read below.
        d[",".join(cols)].append(path)

    print("Starting spark session")
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import lit

    # You can open http://localhost:4040 to follow progress on the spark operations
    spark = (
        SparkSession.builder
        .config("spark.driver.memory", "16G")
        .master("local[16]")
        .appName('spark-stats')
        .getOrCreate()
    )

    ref_cols = ['SAMPLE_ID','URL','TEXT','HEIGHT','WIDTH','LICENSE','NSFW','similarity']
    total = None
    print("Reading all collections of csv, removing, adding and reordering columns as needed")
    for cols, paths in d.items():
        cols = cols.split(",")
        incols = [x for x in cols if x in ref_cols]
        print("incols", incols)
        w = spark.read.options(delimiter="|", header=True).csv(paths).select(*incols)
        addcols = [x for x in ref_cols if x not in cols]
        print("addcols", addcols)
        for c in addcols:
            # Columns absent from this schema version are filled with "".
            w = w.withColumn(c, lit(""))
        # union() matches columns by position, so every group must expose
        # the reference columns in the same order.
        w = w.select(*ref_cols)
        if total is None:
            total = w
        else:
            # Without this else the first group would be unioned with itself
            # and all of its rows would be counted twice.
            total = total.union(w)

    print("Casting columns to the right types")
    total = total.withColumn("SAMPLE_ID", total["SAMPLE_ID"].cast("bigint"))
    total = total.withColumn("WIDTH", total["WIDTH"].cast("int"))
    total = total.withColumn("HEIGHT", total["HEIGHT"].cast("int"))
    total = total.withColumn("similarity", total["similarity"].cast("double"))

    print("Repartitionning and writing to 16 parquet files to cah_dataframe")
    # NOTE(review): the default save mode is used, so this fails if the
    # output directory already exists — confirm whether overwriting is wanted.
    total.repartition(16).write.parquet("cah_dataframe")
    ok = spark.read.parquet("cah_dataframe")

    print("Rereading the parquet and computing some basic stats")
    print("Size of collection", ok.count())
    uniques = ok.drop_duplicates(["URL", "TEXT"])
    uniques.repartition(16).write.parquet("cah_dataframe_unique")
    ok_unique = spark.read.parquet("cah_dataframe_unique")
    print("Number of uniques", ok_unique.count())


if __name__ == "__main__":
    main()
Once you have computed the parquet files with unique items,
you can compute more stats with the following script:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
def main():
    """Print the schema, a sample and approximate deciles of the unique items.

    Reads the "cah_dataframe_unique" parquet files and prints the 10%..90%
    approximate quantiles of WIDTH, HEIGHT, similarity and the character
    length of TEXT, followed by the total row count.
    """
    spark = (
        SparkSession.builder
        .config("spark.driver.memory", "16G")
        .master("local[16]")
        .appName('spark-stats')
        .getOrCreate()
    )
    df = spark.read.parquet("cah_dataframe_unique")

    # NOTE(review): the sample output published with this script shows a
    # schema and an untruncated 20-row listing before the quantiles, so these
    # two calls are presumed to belong here — confirm.
    df.printSchema()
    df.show(truncate=False)

    # Probabilities 0.1, 0.2, ..., 0.9, shared by every column below.
    deciles = [0.1*x for x in range(1,10)]
    # The third approxQuantile argument is the accepted relative error; 0.1
    # is coarse, so the printed values are rough estimates.
    print("width quantiles", df.approxQuantile("WIDTH", deciles, 0.1))
    print("height quantiles", df.approxQuantile("HEIGHT", deciles, 0.1))
    print("similarity quantiles", df.approxQuantile("similarity", deciles, 0.1))
    df = df.withColumn("lentext", F.length("TEXT"))
    print("text length quantiles", df.approxQuantile("lentext", deciles, 0.1))
    print("Number of uniques", df.count())


if __name__ == "__main__":
    main()
root
 |-- SAMPLE_ID: long (nullable = true)
 |-- URL: string (nullable = true)
 |-- TEXT: string (nullable = true)
 |-- HEIGHT: integer (nullable = true)
 |-- WIDTH: integer (nullable = true)
 |-- LICENSE: string (nullable = true)
 |-- NSFW: string (nullable = true)
 |-- similarity: double (nullable = true)
|41826002453| |Hoop Earrings Beaded Hoop Earrings |200 |200 |? |UNLIKELY|0.3015734851360321 |
|11286064458| |Practical Guide to Secondary Social Studies, A |187 |187 |? |UNLIKELY|0.3707329034805298 |
|15923025895||Young glamorous blonde with shopping bag holding toy terrier dogs holding dog — Stock Photo #9991553|398 |600 |? |UNLIKELY|0.3059745728969574 |
|10787043769| |Wooden White Net and Rope Lighthouse 15 |400 |300 |? |UNLIKELY|0.3204681873321533 |
|30762015899| |3 bed Cottage for sale in Berkeley, Gloucestershire |135 |90 |? |UNLIKELY|0.31845608353614807|
|33202002260| |Hang Glide over Cowichan |120 |90 |? |UNLIKELY|0.32427337765693665|
|42981003590| |ANY Size ANY Colorway x High-Waisted Aztec Frayed Denim Shorts |200 |200 |? |UNLIKELY|0.3527474105358124 |
|17772022270| |yellow Wholesale-Dress bag - blue Stradivarius top - cream Mango skirt |300 |450 |? |UNLIKELY|0.3499804437160492 |
|27789015457| |Couple of amazing black dobermans - Foto de Stock |110 |110 |? |UNLIKELY|0.3847452998161316 |
|34655009450| |Army girl gets fucked by ... |190 |143 |? |UNLIKELY|0.3381859362125397 |
|10112032340| |Elegance Shawl / Scarf with Lacy Edge - leopard- |570 |715 |? |UNLIKELY|0.34083840250968933|
|23205009139| |Image of mastermind JAPAN x Carhartt 2012 Fall/Winter Capsule Collection |570 |854 |? |UNLIKELY|0.3161979019641876 |
|8161023068 | |Magnolia Home Fashions Oxford Stripe Charcoal |150 |150 |? |UNLIKELY|0.3259333670139313 |
|4838750426 | |"Keep Calm" - Blue Canvas |130 |130 | |UNLIKELY|0.33144283294677734|
|3469000780 | |Thomas Eggar |82 |82 | |UNLIKELY|0.35470208525657654|
|12408032134| |ornate formal black white damask custom tie |216 |216 |? |UNLIKELY|0.3525067865848541 |
|24984002927| |The old Iten Biscuit Co. is now a U-Haul center. THE OKLAHOMAN ARCHIVES |640 |511 |? |UNLIKELY|0.3311549127101898 |
|12525060020|$P_PROD$ |Lauren Ralph Lauren Plus Size Drawstring Cotton Cropped Pant |233 |338 |? |UNLIKELY|0.3262045085430145 |
|12231074283| |Newcastle United |698 |313 |? |UNLIKELY|0.3026922047138214 |
|42445000967| |Upgrade Your CatGenie with the Power Flush System for Free! |320 |150 |? |UNLIKELY|0.3295552730560303 |
only showing top 20 rows
width quantiles [0.0, 120.0, 151.0, 180.0, 215.0, 270.0, 273.0, 370.0, 39580.0]
height quantiles [0.0, 128.0, 160.0, 184.0, 216.0, 250.0, 300.0, 446.0, 18849.0]
similarity quantiles [0.0, 0.3069250285625458, 0.3145156800746918, 0.32135993242263794, 0.32168570160865784, 0.32813096046447754, 0.339599609375, 0.35535627603530884, 6016.0]
text length quantiles [1.0, 25.0, 33.0, 39.0, 45.0, 50.0, 56.0, 73.0, 61192.0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment