# maartenbreddels/Makefile

## Makefile
# Makefile for converting the CSV files from http://cdn.gea.esac.esa.int/Gaia/gdr2/gaia_source/csv/
# to a single (vaex) hdf5 file
#  * https://docs.vaex.io
#  * https://github.com/maartenbreddels/vaex/
# It is multistage to work around opening 60 000 files at once.
# Strategy is
#  * stage1: convert every csv.gz file to csv and then to hdf5
#   * do this via xargs and calling make again, since gmake has trouble matching 60 000 rules
#  * stage2: Create part-<NUMBER>.txt files containing max FILES_PER_PART per file
#  * stage3: convert the list of hdf5 files to single hdf5 files (part-<NUMBER>.hdf5)
#  * stage4: convert the partial files, to a single file (SINGLE_FILE)
# Possible use
#   $ make stage1
#   $ make stage2
#   $ make stage3 -j8
#   $ make stage4
# Maximum number of hdf5 files listed in each part-<NUMBER>.txt written by stage2.
FILES_PER_PART = 10

# Directory that stage2 scans for the per-file hdf5 outputs of stage1.
# Defaults to the current directory; override with `make stage2 INPUT_DIR=...`.
INPUT_DIR ?= .

# Per-file inputs and the names derived from them.
# These stay lazily expanded (`=`) on purpose: no rule visible in this file
# references them, so the `find` over the (potentially huge) input directory
# is never run, which matters because stage1 starts many sub-makes.
ZIPPED = $(shell find . -maxdepth 1 -type f -name '*.csv.gz')
CSVS   = $(patsubst %.csv.gz,%.csv,$(ZIPPED))
HDF5S  = $(patsubst %.csv,%.csv.hdf5,$(CSVS))

# Part lists written by stage2 and the merged hdf5 file built from each one.
# Expanded once at parse time (`:=`) because PARTS appears in several rule
# headers and would otherwise re-run `find` for every reference.
PARTS_TXT := $(shell find . -maxdepth 1 -type f -name 'part*.txt')
PARTS := $(patsubst %.txt,%.hdf5,$(PARTS_TXT))

# Final output of stage4.
SINGLE_FILE = gaia-dr2-sort-by-source_id.hdf5

# Default goal: only points the user at the usage notes in the header comment.
# Declared phony so a file named `all` cannot shadow it.
.PHONY: all
all:
	@echo "Read the comments in this makefile"

# stage1: build <name>.csv.hdf5 for every *.csv.gz found below the current
# directory. The target names are handed to sub-makes in batches of 30 by
# xargs, so no single make invocation has to match all files at once.
# Change -P8 (number of parallel sub-makes) to ~number of cores on your
# system; every sub-make additionally runs up to 8 jobs (-j8).
# $(MAKE) is used instead of a literal `make` so the sub-makes run the same
# make binary as the parent.
.PHONY: stage1
stage1:
	find . -iname '*csv.gz' | sed 's/\.gz$$/.hdf5/' | xargs -n30 -P8 $(MAKE) -j8
# stage2: distribute the names of all *.hdf5 files below $(INPUT_DIR) over
# part-<NUMBER>.txt files in the current directory. The counter c is bumped on
# every record whose number is a multiple of FILES_PER_PART, so no list holds
# more than FILES_PER_PART names (the first list, part-0.txt, holds one fewer).
# Each finished list is closed before the next one is opened, so awk never
# keeps more than one output file open.
# `$$0` is the awk record; a single `$0` would be expanded (to nothing) by make.
.PHONY: stage2
stage2:
	find $(INPUT_DIR) -iname '*.hdf5' | awk -v c=0 -v n=$(FILES_PER_PART) 'NR % n == 0 { close(out); c++ } { out = "part-" c ".txt"; print $$0 > out }'

# stage3: build one merged hdf5 file for every part-<NUMBER>.txt list.
# stage4: build the final single file from all merged parts.
# Both are command names rather than files, hence phony.
.PHONY: stage3 stage4
stage3: $(PARTS)
stage4: $(SINGLE_FILE)


# Convert one uncompressed CSV file to hdf5 with vaex.
%.csv.hdf5: %.csv
	vaex convert file $< $@

# Decompress one input file. The output is written under a temporary name and
# renamed only after gunzip succeeded, so a failed decompression never leaves
# a truncated .csv behind that make would consider up to date. The temporary
# file is removed on failure.
%.csv: %.csv.gz
	gunzip -c $< > $@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }

# Recipe shared by the list-merging rules below. $< is the list file and $@
# the merged output; both are expanded when the recipe runs.
# Presumably vaex reads the names of the files to merge from the list passed
# as @<file> and writes them, sorted by source_id, to the target.
merge_listed = vaex convert --progress --sort=source_id file @$< $@

# Merge the files listed in part-<NUMBER>.txt (written by stage2).
part-%.hdf5: part-%.txt
	$(merge_listed)

# Same merge for group_<NAME>.txt lists; nothing visible in this file creates
# them, so they are presumably written by hand.
group_%.hdf5: group_%.txt
	$(merge_listed)

# Merge all part files into the final, source_id-sorted file.
# The list of inputs is collected with `find` into single.txt instead of being
# passed on the command line, which keeps the command short no matter how many
# parts exist. The search mirrors the one used for PARTS_TXT (regular files
# directly in the current directory), so stray part*.hdf5 files in
# subdirectories are not merged by accident.
$(SINGLE_FILE): $(PARTS)
	find . -maxdepth 1 -type f -name 'part*.hdf5' > single.txt
	vaex convert --progress --sort=source_id file @single.txt $@


# Intermediate-file handling.
# NOTE(review): the original comment here read "don't delete intermediate
# files", but .PRECIOUS only affects the targets listed as its prerequisites
# and none are listed, so make still removes the intermediate .csv files after
# they have been converted. Enable one of the lines below to keep them
# (confirm the extra disk usage is acceptable first).
#.PRECIOUS: %.csv
#.PRECIOUS:  %.csv.hdf5
.PRECIOUS:

# Delete a target whose recipe failed, so a half-written hdf5 file is never
# mistaken for an up-to-date one on the next run.
.DELETE_ON_ERROR: