@rvencu
Forked from rom1504/a_cah_to_parquet_pyspark.md
Created July 16, 2021 20:05
cah_stats_spark.py
'''
Compute some stats on the cah collection.

First get the files with:

lynx -dump -hiddenlinks=listonly -nonumbers http://the-eye.eu/eleuther_staging/cah/ | grep cah | grep .csv > cah.csv
aria2c --dir=shards --auto-file-renaming=false --continue=true -i cah.csv -x 16 -s 16 -j 100

The download takes a few minutes to run.

Then `pip install pyspark` and run this file. It also takes a few minutes.
'''
if __name__ == '__main__':
    from pyspark.sql import SparkSession

    # You can open http://localhost:4040 to follow progress on the Spark operations
    spark = (
        SparkSession.builder
        .config("spark.driver.memory", "16G")
        .master("local[16]")
        .appName("spark-stats")
        .getOrCreate()
    )

    # Read the pipe-separated CSV shards, keep only the columns we need, and persist as Parquet
    cah = spark.read.options(header="True").csv("shards", sep="|")
    cah.select("URL", "TEXT").write.parquet("some-parquet")

    # Re-read from Parquet: the columnar format makes the counts below much faster
    cah = spark.read.parquet("some-parquet")
    print("Size of collection", cah.count())
    print("Number of uniques", cah.drop_duplicates().count())