# yfujieda/python-dask-example.py

## python-dask-example.py
# using dask to handle the big dataset
import dask.dataframe as dd

# use numpy to convert the dask dataframe object to an array object
import numpy as np

# Print the UIDs of the rows with the largest NUM_VALUE in the dataset.

# dataset file to load
file_name = 'sample_data_set.csv'

# number of rows (ranked by NUM_VALUE) whose UIDs are printed
row_count = 10

# read the csv file into a dask dataframe
# error_bad_lines=False is passed through as a CSV-reader option.
# NOTE(review): pandas 1.3 deprecated this keyword in favour of
# on_bad_lines='skip' and pandas 2.0 removed it -- confirm which pandas
# version this script is expected to run against before changing it.
df = dd.read_csv(file_name, error_bad_lines=False)

# using nlargest to return the first n rows ordered by columns in descending order.
# in this case, use 'NUM_VALUE' column as a column to order by
# reference: http://docs.dask.org/en/latest/dataframe-api.html?highlight=nlargest#dask.dataframe.DataFrame.nlargest
top_rows = df.nlargest(row_count, 'NUM_VALUE')

# get the UIDs of those rows and convert them to a numpy array object
uids = np.array(top_rows['UID'].values)

# output the UIDs, one per line
for uid in uids:
    print(uid)