Last active
November 26, 2017 20:14
-
-
Save shilad/545ea8d93b653cbf92a59c8e34ceea45 to your computer and use it in GitHub Desktop.
Load sitelinks from wikidata json file into Hive.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Extracts sitelinks from the wikidata dump file and loads them into Hive.
#
# Author: Shilad Sen
#

# Name of this script, shown in the usage message.
ME="$0"

# Wikidata JSON dump to read (bz2-compressed by default).
SRC_FILE=/mnt/data/xmldatadumps/public/wikidatawiki/entities/latest-all.json.bz2

# Local TSV file the extracted sitelinks are written to before the Hive load.
DEST_TSV="${HOME}/sitelinks.tsv"

# Fully qualified Hive table that receives the data.
TABLE=shilad.sitelinks

# Hive partition name; defaults to the current year-month.
SNAPSHOT="$(date '+%Y-%m')"
# Print an error message to stderr and abort the script with status 1.
# All arguments are joined with spaces to form the message.
# (Fix: the message was previously expanded unquoted, so it was subject
# to word-splitting and glob expansion.)
die() {
  printf '%s\n' "$*" >&2
  exit 1
}
# Print the usage message to stderr; does NOT exit.
# Callers choose the exit status: "-h | --help" should exit 0, while an
# unknown flag exits 1.  (Fix: usage previously called die, which made
# "-h" exit with status 1 and left the caller's "exit" unreachable.)
usage() {
  echo "
Usage: $ME
--src wikidata-json-path
--tsv sitelinks-tsv-location
--table hive-tablename
--snapshot name of snapshot
" >&2
}
# Parse command-line flags.  Every flag (other than -h/--help, which exits
# immediately) takes a value; each loop iteration consumes flag + value.
while [ "$1" != "" ]; do
  PARAM="$1"
  VALUE="$2"
  case "$PARAM" in
    -h | --help)
      usage
      exit
      ;;
    --src)
      # BUG FIX: this used to assign SRC_URL, which nothing reads; the
      # existence check and extraction below use SRC_FILE, so the --src
      # flag was silently ignored.
      SRC_FILE="$VALUE"
      ;;
    --tsv)
      DEST_TSV="$VALUE"
      ;;
    --table)
      TABLE="$VALUE"
      ;;
    --snapshot)
      SNAPSHOT="$VALUE"
      ;;
    *)
      echo "ERROR: unknown parameter \"$PARAM\"" >&2
      usage
      exit 1
      ;;
  esac
  # Two separate shifts (not "shift 2") so a trailing flag with a missing
  # value still terminates the loop instead of spinning forever.
  shift
  shift
done
#################################################
# Make sure the JSON file exists
#################################################
[ -f "${SRC_FILE}" ] || die "File ${SRC_FILE} doesnt exist"

#################################################
# Extract sitelinks from JSON file
#################################################
# The heredoc delimiter is quoted so bash performs no expansion inside the
# embedded Python program; the shell variables are passed as argv instead.
python3 - "${SRC_FILE}" "${DEST_TSV}" << 'EOF' || die "Extracting Sitelinks from ${SRC_FILE} failed"
import bz2
import sys
import json
import traceback

input_path = sys.argv[1]
output_path = sys.argv[2]

# The dump may be bz2-compressed or plain JSON.
if input_path.lower().endswith('bz2'):
    input = bz2.open(input_path, 'rt', encoding='UTF-8')
else:
    input = open(input_path, encoding='UTF-8')

with open(output_path, 'w', encoding='UTF-8') as output:
    for i, line in enumerate(input):
        if i % 10000 == 0:
            sys.stderr.write('processing line %d\n' % i)
        try:
            # The dump is one JSON entity per line inside a surrounding
            # array: strip the trailing comma and skip the bracket lines.
            line = line.strip()
            if line.endswith(','): line = line[:-1]
            if line.startswith('{') and line.endswith('}'):
                js = json.loads(line)
                id = js['id']
                for siteinfo in js.get('sitelinks', {}).values():
                    output.write(id + '\t' + siteinfo['site'] + '\t' + siteinfo['title'] + '\n')
        except Exception:
            # BUG FIX: was line[:-40], which drops the LAST 40 characters
            # instead of showing a 40-char prefix of the offending line.
            # Diagnostics now go to stderr, keeping stdout clean.
            sys.stderr.write('failure in line %d (%s...)\n' % (i + 1, line[:40]))
            traceback.print_exc()
input.close()
EOF
#################################################
# Load the data into Hive
#################################################
# Build the HQL up front, then run it in a single hive invocation so the
# CREATE and the LOAD either both execute or the script dies.
HQL="
CREATE TABLE IF NOT EXISTS ${TABLE} (
entity String,
wiki_db String,
title String
)
PARTITIONED BY (snapshot String)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
LOAD DATA LOCAL INPATH '${DEST_TSV}'
OVERWRITE INTO TABLE ${TABLE}
PARTITION (snapshot = '${SNAPSHOT}');
"
hive -e "$HQL" || die "Hive table import"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment