panchiz/download_wet_files.sh

## download_wet_files.sh
#!/usr/bin/env bash
#
# Download WET archives from February 2015 Crawl Archive (CC-MAIN-2015-11).
#
# Usage: download_wet_files.sh <number of wet archives>
#
# The first <number of wet archives> entries of the crawl's "wet.paths.gz"
# listing are downloaded with wget. Files that are already present in the
# download directory are skipped.

# Exit immediately if a command exits with a non-zero status.
set -e

NUMBER_OF_WET=${1:-}
# Reject a missing or non-numeric count up front; otherwise `head` would fail
# inside the pipeline below and the script would finish without downloading
# anything and without reporting an error.
case "$NUMBER_OF_WET" in
  '' | *[!0-9]*)
    echo "Use: $0 <number of wet archives>" >&2
    exit 1
    ;;
esac

AWS_DATASET_URL="https://aws-publicdatasets.s3.amazonaws.com"
WET_FILE_URL="$AWS_DATASET_URL/common-crawl/crawl-data/CC-MAIN-2015-11/wet.paths.gz"
WET_FILE=${WET_FILE_URL##*/}
WET_ARCHIVE_DIR="$HOME/wet-archives"

# Download URL $1 into file $2 of the current directory.
# The data is written to "$2.part" first and renamed only after wget
# succeeded, so an interrupted transfer never leaves a truncated file under
# the final name (which a later run would mistake for a finished download).
fetch() {
  local url=$1
  local dest=$2
  # stdin is detached so wget can never consume the path list that the
  # download loop reads from its own stdin.
  wget -O "$dest.part" "$url" < /dev/null || return
  mv -- "$dest.part" "$dest"
}

# If "$HOME/wet-archives" exists, that directory is used to download and
# store the wet archives. Otherwise a temporary dir is created.
if [[ ! -d "$WET_ARCHIVE_DIR" ]]; then
  WET_ARCHIVE_DIR=$(mktemp -d)
  echo "Using temporary directory: $WET_ARCHIVE_DIR" >&2
fi

cd "$WET_ARCHIVE_DIR"

# Download the wet file if it's not already downloaded.
if [[ -f "$WET_FILE" ]]; then
  echo "Great, we already have: $WET_FILE"
else
  fetch "$WET_FILE_URL" "$WET_FILE"
fi

# Each file size is around 250 MB.
# As the documentation[1] says:
#   By simply adding either s3://aws-publicdatasets/ or
#   https://aws-publicdatasets.s3.amazonaws.com/ to each line,
#   you end up with the S3 and HTTP paths respectively.
# [1] http://blog.commoncrawl.org/2015/03/february-2015-crawl-archive-available/
#
# The listing is read line by line (instead of word-splitting a command
# substitution) so that each path is taken verbatim.
while IFS= read -r wet_path; do
  [[ -n "$wet_path" ]] || continue
  archive_name=${wet_path##*/}
  if [[ -f "$archive_name" ]]; then
    echo "File exists: $archive_name"
  else
    fetch "$AWS_DATASET_URL/$wet_path" "$archive_name"
  fi
done < <(gzip -dc "$WET_FILE" | head -n "$NUMBER_OF_WET")

## move_archives_to_hadoop.sh
#!/usr/bin/env bash
#
# Move WET files to Hadoop fs
#
# Every "*warc.wet.gz" archive in "$HOME/wet-archives" is decompressed next to
# the archive and the decompressed file is moved into the HDFS directory
# "/crawl". Archives whose decompressed file already exists in HDFS are
# skipped.

# Exit immediately if a command exits with a non-zero status.
set -e

WET_ARCHIVE_DIR="$HOME/wet-archives"
HADOOP_WET_WET_ARCHIVE_DIR="/crawl"

cd "$WET_ARCHIVE_DIR"

# Make sure the destination exists as a directory. Without it the first
# -moveFromLocal would store the file *as* "/crawl" instead of inside it.
hdfs dfs -mkdir -p "$HADOOP_WET_WET_ARCHIVE_DIR"

# Let the shell expand the glob instead of parsing `ls` output, which breaks
# on file names containing whitespace or glob characters.
for archive in *warc.wet.gz; do
  # With no matching archive the pattern stays literal; skip it.
  [[ -e "$archive" ]] || continue

  # Name of the decompressed file: the archive name without ".gz".
  file_name="${archive%.gz}"
  if hdfs dfs -test -f "$HADOOP_WET_WET_ARCHIVE_DIR/$file_name"; then
    echo "Nothing to do with: $archive"
  else
    echo "Moving: $file_name"
    gzip -dc -- "$archive" > "$file_name"
    hdfs dfs -moveFromLocal "$file_name" "$HADOOP_WET_WET_ARCHIVE_DIR"
  fi
done
	#!/usr/bin/env bash
	#
	# Download WET archives from February 2015 Crawl Archive (CC-MAIN-2015-11).
	#
	# Usage: download_wet_files.sh <number of wet archives>
	#
	# The first <number of wet archives> entries of the crawl's "wet.paths.gz"
	# listing are downloaded with wget. Files that are already present in the
	# download directory are skipped.

	# Exit immediately if a command exits with a non-zero status.
	set -e

	NUMBER_OF_WET=${1:-}
	# Reject a missing or non-numeric count up front; otherwise `head` would
	# fail inside the pipeline below and the script would finish without
	# downloading anything and without reporting an error.
	case "$NUMBER_OF_WET" in
	  '' | *[!0-9]*)
	    echo "Use: $0 <number of wet archives>" >&2
	    exit 1
	    ;;
	esac

	AWS_DATASET_URL="https://aws-publicdatasets.s3.amazonaws.com"
	WET_FILE_URL="$AWS_DATASET_URL/common-crawl/crawl-data/CC-MAIN-2015-11/wet.paths.gz"
	WET_FILE=${WET_FILE_URL##*/}
	WET_ARCHIVE_DIR="$HOME/wet-archives"

	# Download URL $1 into file $2 of the current directory.
	# The data is written to "$2.part" first and renamed only after wget
	# succeeded, so an interrupted transfer never leaves a truncated file
	# under the final name (which a later run would mistake for a finished
	# download).
	fetch() {
	  local url=$1
	  local dest=$2
	  # stdin is detached so wget can never consume the path list that the
	  # download loop reads from its own stdin.
	  wget -O "$dest.part" "$url" < /dev/null || return
	  mv -- "$dest.part" "$dest"
	}

	# If "$HOME/wet-archives" exists, that directory is used to download and
	# store the wet archives. Otherwise a temporary dir is created.
	if [[ ! -d "$WET_ARCHIVE_DIR" ]]; then
	  WET_ARCHIVE_DIR=$(mktemp -d)
	  echo "Using temporary directory: $WET_ARCHIVE_DIR" >&2
	fi

	cd "$WET_ARCHIVE_DIR"

	# Download the wet file if it's not already downloaded.
	if [[ -f "$WET_FILE" ]]; then
	  echo "Great, we already have: $WET_FILE"
	else
	  fetch "$WET_FILE_URL" "$WET_FILE"
	fi

	# Each file size is around 250 MB.
	# As the documentation[1] says:
	#   By simply adding either s3://aws-publicdatasets/ or
	#   https://aws-publicdatasets.s3.amazonaws.com/ to each line,
	#   you end up with the S3 and HTTP paths respectively.
	# [1] http://blog.commoncrawl.org/2015/03/february-2015-crawl-archive-available/
	#
	# gzip's output is piped into head with a real "|"; an escaped "\|" would
	# hand "|", "head" and "-n" to gzip as file names. The listing is read
	# line by line so that each path is taken verbatim.
	while IFS= read -r wet_path; do
	  [[ -n "$wet_path" ]] || continue
	  archive_name=${wet_path##*/}
	  if [[ -f "$archive_name" ]]; then
	    echo "File exists: $archive_name"
	  else
	    fetch "$AWS_DATASET_URL/$wet_path" "$archive_name"
	  fi
	done < <(gzip -dc "$WET_FILE" | head -n "$NUMBER_OF_WET")
	#!/usr/bin/env bash
	#
	# Move WET files to Hadoop fs
	#
	# Every "*warc.wet.gz" archive in "$HOME/wet-archives" is decompressed next
	# to the archive and the decompressed file is moved into the HDFS directory
	# "/crawl". Archives whose decompressed file already exists in HDFS are
	# skipped.

	# Exit immediately if a command exits with a non-zero status.
	set -e

	WET_ARCHIVE_DIR="$HOME/wet-archives"
	HADOOP_WET_WET_ARCHIVE_DIR="/crawl"

	cd "$WET_ARCHIVE_DIR"

	# Make sure the destination exists as a directory. Without it the first
	# -moveFromLocal would store the file *as* "/crawl" instead of inside it.
	hdfs dfs -mkdir -p "$HADOOP_WET_WET_ARCHIVE_DIR"

	# Let the shell expand the glob instead of parsing `ls` output, which
	# breaks on file names containing whitespace or glob characters.
	for archive in *warc.wet.gz; do
	  # With no matching archive the pattern stays literal; skip it.
	  [[ -e "$archive" ]] || continue

	  # Name of the decompressed file: the archive name without ".gz".
	  file_name="${archive%.gz}"
	  if hdfs dfs -test -f "$HADOOP_WET_WET_ARCHIVE_DIR/$file_name"; then
	    echo "Nothing to do with: $archive"
	  else
	    echo "Moving: $file_name"
	    gzip -dc -- "$archive" > "$file_name"
	    hdfs dfs -moveFromLocal "$file_name" "$HADOOP_WET_WET_ARCHIVE_DIR"
	  fi
	done