Last active
November 26, 2017 20:14
-
-
Save shilad/545ea8d93b653cbf92a59c8e34ceea45 to your computer and use it in GitHub Desktop.
Load sitelinks from wikidata json file into Hive.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Extracts sitelinks from the wikidata dump file and loads them into Hive.
#
# Author: Shilad Sen
#

# Name of this script, shown in the usage message.
ME="$0"

# Wikidata JSON dump to read (bz2-compressed by default).
SRC_FILE=/mnt/data/xmldatadumps/public/wikidatawiki/entities/latest-all.json.bz2

# Local TSV file the extracted sitelinks are written to before the Hive load.
DEST_TSV="${HOME}/sitelinks.tsv"

# Fully qualified Hive table that receives the data.
TABLE=shilad.sitelinks

# Hive partition name; defaults to the current year-month.
SNAPSHOT="$(date '+%Y-%m')"
# Print an error message to stderr and abort the script with status 1.
# All arguments are joined with spaces to form the message.
# (Fix: the message was previously expanded unquoted, so it was subject
# to word-splitting and glob expansion.)
die() {
  printf '%s\n' "$*" >&2
  exit 1
}
# Print the usage message to stderr; does NOT exit.
# Callers choose the exit status: "-h | --help" should exit 0, while an
# unknown flag exits 1.  (Fix: usage previously called die, which made
# "-h" exit with status 1 and left the caller's "exit" unreachable.)
usage() {
  echo "
Usage: $ME
--src wikidata-json-path
--tsv sitelinks-tsv-location
--table hive-tablename
--snapshot name of snapshot
" >&2
}
# Parse command-line flags.  Every flag (other than -h/--help, which exits
# immediately) takes a value; each loop iteration consumes flag + value.
while [ "$1" != "" ]; do
  PARAM="$1"
  VALUE="$2"
  case "$PARAM" in
    -h | --help)
      usage
      exit
      ;;
    --src)
      # BUG FIX: this used to assign SRC_URL, which nothing reads; the
      # existence check and extraction below use SRC_FILE, so the --src
      # flag was silently ignored.
      SRC_FILE="$VALUE"
      ;;
    --tsv)
      DEST_TSV="$VALUE"
      ;;
    --table)
      TABLE="$VALUE"
      ;;
    --snapshot)
      SNAPSHOT="$VALUE"
      ;;
    *)
      echo "ERROR: unknown parameter \"$PARAM\"" >&2
      usage
      exit 1
      ;;
  esac
  # Two separate shifts (not "shift 2") so a trailing flag with a missing
  # value still terminates the loop instead of spinning forever.
  shift
  shift
done
#################################################
# Make sure the JSON file exists
#################################################
[ -f "${SRC_FILE}" ] || die "File ${SRC_FILE} doesnt exist"

#################################################
# Extract sitelinks from JSON file
#################################################
# The heredoc delimiter is quoted so bash performs no expansion inside the
# embedded Python program; the shell variables are passed as argv instead.
python3 - "${SRC_FILE}" "${DEST_TSV}" << 'EOF' || die "Extracting Sitelinks from ${SRC_FILE} failed"
import bz2
import sys
import json
import traceback

input_path = sys.argv[1]
output_path = sys.argv[2]

# The dump may be bz2-compressed or plain JSON.
if input_path.lower().endswith('bz2'):
    input = bz2.open(input_path, 'rt', encoding='UTF-8')
else:
    input = open(input_path, encoding='UTF-8')

with open(output_path, 'w', encoding='UTF-8') as output:
    for i, line in enumerate(input):
        if i % 10000 == 0:
            sys.stderr.write('processing line %d\n' % i)
        try:
            # The dump is one JSON entity per line inside a surrounding
            # array: strip the trailing comma and skip the bracket lines.
            line = line.strip()
            if line.endswith(','): line = line[:-1]
            if line.startswith('{') and line.endswith('}'):
                js = json.loads(line)
                id = js['id']
                for siteinfo in js.get('sitelinks', {}).values():
                    output.write(id + '\t' + siteinfo['site'] + '\t' + siteinfo['title'] + '\n')
        except Exception:
            # BUG FIX: was line[:-40], which drops the LAST 40 characters
            # instead of showing a 40-char prefix of the offending line.
            # Diagnostics now go to stderr, keeping stdout clean.
            sys.stderr.write('failure in line %d (%s...)\n' % (i + 1, line[:40]))
            traceback.print_exc()
input.close()
EOF
#################################################
# Load the data into Hive
#################################################
# Build the HQL up front, then run it in a single hive invocation so the
# CREATE and the LOAD either both execute or the script dies.
HQL="
CREATE TABLE IF NOT EXISTS ${TABLE} (
entity String,
wiki_db String,
title String
)
PARTITIONED BY (snapshot String)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
LOAD DATA LOCAL INPATH '${DEST_TSV}'
OVERWRITE INTO TABLE ${TABLE}
PARTITION (snapshot = '${SNAPSHOT}');
"
hive -e "$HQL" || die "Hive table import"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment