Last active
April 8, 2021 10:29
-
-
Save emrahyildiz/0ade8255ff56fcee6b3e77f5a0c7744b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Merge small files in Hadoop.
#
# Walks every regular file under DESIREDPATH (recursively) and appends its
# contents to a "part-N" file in SAFEDIR. Before each input file is handled,
# the running byte count of the current part is compared with DESIREDSIZE; once
# it is exceeded, N is incremented and a new part file is started. A part can
# therefore overshoot DESIREDSIZE by up to the size of the last file appended.
#
# Configuration (edit the assignments below):
#   PARTDIR      local directory that may contain part.txt
#   HADOOPPATH   directory containing the `hdfs` launcher
#   DESIREDPATH  HDFS path whose files are merged
#   SAFEDIR      HDFS path that receives the merged part files
#   DESIREDSIZE  byte threshold after which a new part file is started
#
# Exit status: 0 when every append succeeded, 1 otherwise.

set -u -o pipefail

PARTDIR=/path/to/part
HADOOPPATH=/path/to/hadoop
DESIREDPATH=/path/to/hdfs/formerge # HDFS path for merge
SAFEDIR=/path/to/hdfs/safe         # HDFS path for big files
DESIREDSIZE=250000000

# Run the hdfs launcher found in HADOOPPATH with the given arguments.
hdfs_cmd() {
  "$HADOOPPATH/hdfs" "$@"
}

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

mkdir -p "$PARTDIR"

# The starting part number is the first word of the first line of part.txt.
# Note that keeping it in a file may NOT be the best practice. When the file
# is missing, empty, or does not hold a non-negative integer, start from 0
# instead of producing a file literally named "part-".
PART=
if [[ -r "$PARTDIR/part.txt" ]]; then
  # `|| true`: read returns non-zero on a final line without a newline even
  # though it still assigns the value.
  read -r PART _ < "$PARTDIR/part.txt" || true
fi
if [[ ! "$PART" =~ ^[0-9]+$ ]]; then
  PART=0
fi

SIZE=0   # bytes appended to the current part file so far
FAILED=0 # becomes 1 as soon as one append fails
SAFEFILE=part-$PART

hdfs_cmd dfs -mkdir -p "$SAFEDIR" || die "cannot create HDFS directory $SAFEDIR"
# touchz's status is deliberately not checked: the append below is still
# attempted whether or not this call succeeded.
hdfs_cmd dfs -touchz "$SAFEDIR/$SAFEFILE"

# Capture the listing first so that a failing `-ls` is reported instead of
# silently turning into "nothing to merge".
LISTING=$(hdfs_cmd dfs -ls -R "$DESIREDPATH") || die "cannot list $DESIREDPATH"

# Only lines whose permission string starts with "-" are regular files.
# Directory lines ("d...") must be skipped: their `du -s` total would be added
# to SIZE although `-cat` cannot copy them, double counting every byte.
# The path is taken as the last whitespace-separated field, so paths that
# contain whitespace are not supported.
# The list is read on fd 3 so the hdfs commands in the loop body cannot
# consume it through their stdin.
while IFS= read -r filename <&3; do
  echo "$SIZE"
  if (( SIZE > DESIREDSIZE )); then
    # 10# forces base 10 so a value such as "008" is not parsed as octal.
    PART=$(( 10#$PART + 1 ))
    echo "START FROM PART $PART WITH SIZE $SIZE"
    SAFEFILE=part-$PART
    hdfs_cmd dfs -touchz "$SAFEDIR/$SAFEFILE"
    SIZE=0
  fi

  # Sum the first column of `du -s`; awk prints 0 when du produced no output,
  # which makes the file be skipped below.
  FILESIZE=$(hdfs_cmd dfs -du -s "$filename" | awk '{s += $1} END {printf "%d\n", s}')
  if (( FILESIZE > 0 )); then
    echo "FILENAME: $filename SIZE: $FILESIZE"
    # pipefail makes the pipeline fail when either the read or the append
    # side fails; only count the bytes when both succeeded.
    if hdfs_cmd dfs -cat "$filename" \
      | hdfs_cmd dfs -appendToFile - "$SAFEDIR/$SAFEFILE"; then
      SIZE=$(( SIZE + FILESIZE ))
      echo "TOTAL SIZE: $SIZE"
    else
      printf 'ERROR: could not append %s to %s\n' \
        "$filename" "$SAFEDIR/$SAFEFILE" >&2
      FAILED=1
    fi
  fi
done 3< <(awk '/^-/ {print $NF}' <<<"$LISTING")

exit "$FAILED"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment