Last active
August 29, 2015 14:21
-
-
Save panchiz/3247c3549041e650a282 to your computer and use it in GitHub Desktop.
WET archives to HDFS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Download WET archives from the February 2015 Common Crawl archive.
#
# Usage: <this script> <number of wet archives>
#
# Downloads go to "$HOME/wet-archives" when that directory already exists;
# otherwise a fresh temporary directory is created and its path is printed
# to stderr so the files can be found afterwards.

# Exit on any failing command, on use of an unset variable, and on a failing
# pipeline stage.
set -euo pipefail

# ${1:-} keeps `set -u` from aborting before the usage message can be shown.
NUMBER_OF_WET=${1:-}
# Accept only a non-negative integer: anything else would make `head -n` fail
# later with a far less helpful message.
case "$NUMBER_OF_WET" in
  '' | *[!0-9]*)
    echo "Use: $0 <number of wet archives>" >&2
    exit 1
    ;;
esac

AWS_DATASET_URL="https://aws-publicdatasets.s3.amazonaws.com"
WET_FILE_URL="$AWS_DATASET_URL/common-crawl/crawl-data/CC-MAIN-2015-11/wet.paths.gz"
WET_FILE=${WET_FILE_URL##*/}
WET_ARCHIVE_DIR="$HOME/wet-archives"

# Use "$HOME/wet-archives" if it exists. Otherwise fall back to a temporary
# directory and tell the user where it is.
if [[ ! -d "$WET_ARCHIVE_DIR" ]]; then
  WET_ARCHIVE_DIR=$(mktemp -d)
  echo "Storing downloads in temporary directory: $WET_ARCHIVE_DIR" >&2
fi
cd "$WET_ARCHIVE_DIR"

#######################################
# Download a URL into the current directory.
# The data is written to "<target>.part" and renamed only after wget
# succeeds, so a failed or interrupted transfer is never mistaken for a
# complete file by the "already downloaded" checks below.
# Arguments: $1 - URL to fetch
#            $2 - final file name
# Returns:   0 on success, 1 if wget failed
#######################################
fetch() {
  local url=$1
  local target=$2
  local partial="$target.part"

  if ! wget -O "$partial" "$url"; then
    rm -f -- "$partial"
    return 1
  fi
  mv -- "$partial" "$target"
}

# Download the list of WET paths if it's not already downloaded.
if [[ -f "$WET_FILE" ]]; then
  echo "Great, we already have: $WET_FILE"
else
  fetch "$WET_FILE_URL" "$WET_FILE"
fi

# Each archive is around 250 MB.
#
# As the documentation[1] says:
#   By simply adding either s3://aws-publicdatasets/ or
#   https://aws-publicdatasets.s3.amazonaws.com/ to each line,
#   you end up with the S3 and HTTP paths respectively.
# [1] http://blog.commoncrawl.org/2015/03/february-2015-crawl-archive-available/
#
# The path list is read on file descriptor 3 so that commands inside the loop
# cannot consume it through stdin.
while IFS= read -r -u 3 wet_path; do
  [[ -n "$wet_path" ]] || continue
  archive=${wet_path##*/}
  if [[ -f "$archive" ]]; then
    echo "File exists: $archive"
  else
    fetch "$AWS_DATASET_URL/$wet_path" "$archive"
  fi
done 3< <(gzip -dc -- "$WET_FILE" | head -n "$NUMBER_OF_WET")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Decompress the downloaded WET archives and move them into the Hadoop
# file system. Archives whose decompressed file is already present in the
# HDFS target directory are skipped.

# Exit on any failing command, on use of an unset variable, and on a failing
# pipeline stage.
set -euo pipefail
# Let an unmatched glob expand to nothing, so the loop below simply does not
# run when there are no archives.
shopt -s nullglob

WET_ARCHIVE_DIR="$HOME/wet-archives"
HADOOP_WET_WET_ARCHIVE_DIR="/crawl"

cd "$WET_ARCHIVE_DIR"

# Make sure the target directory exists before moving anything into it;
# otherwise the first move could be taken as "create a file named like the
# directory" instead of "put the file inside it".
hdfs dfs -mkdir -p "$HADOOP_WET_WET_ARCHIVE_DIR"

# Glob directly instead of parsing `ls` output, so unusual file names are
# handled correctly.
for archive in *warc.wet.gz; do
  # Strip the trailing ".gz" to get the name of the decompressed file.
  file_name="${archive%.*}"
  if hdfs dfs -test -f "$HADOOP_WET_WET_ARCHIVE_DIR/$file_name"; then
    echo "Nothing to do with: $archive"
  else
    echo "Moving: $file_name"
    # Remove a partially written file if decompression fails, then abort.
    if ! gzip -dc -- "$archive" > "$file_name"; then
      rm -f -- "$file_name"
      echo "Failed to decompress: $archive" >&2
      exit 1
    fi
    # moveFromLocal deletes the local decompressed copy once it is in HDFS.
    hdfs dfs -moveFromLocal "$file_name" "$HADOOP_WET_WET_ARCHIVE_DIR/$file_name"
  fi
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment