"""
Usage:
    Nielsen2017Linking_camera.py

Notes
-----
This script demonstrates the use of Wikidata together with
ImageNet-based deep learning classifiers. It relates to the manuscript
"Linking ImageNet WordNet Synsets with Wikidata" from 2018.

Keras is used together with OpenCV and a pre-trained deep learning
model. The script requires the installation of at least these
components and the third-party `keras_squeezenet` package, as well as
a webcam. Internet access is required for the Danish labels, as
Wikidata is queried each time the model detects a new object. The
pre-trained model is downloaded the first time the script is run and
stored locally under `~/.keras/`.

There are several parameters in the code that might need to be
adjusted. The model can be set to, e.g., MobileNet, ResNet50 or a few
other pre-trained models. The webcam that has been used had a high
resolution with a height of 1080 pixels. Depending on the webcam or
screen resolution, the crop size and the `step` parameter might need
to be changed. The language of the labels on the screen can be
changed, e.g., from 'da' to 'de' for German. Labels that cannot be
resolved are written to the terminal.

The script has run successfully under Ubuntu 17.10 with Python 2 and
Python 3, tensorflow-gpu==1.4.0, Keras==2.1.4 and CUDA 8.0.

Citation
--------
Finn Aarup Nielsen, Linking ImageNet WordNet Synsets with Wikidata,
Wiki Workshop 2018.

Copyright
---------
Technical University of Denmark
Finn Aarup Nielsen

License
-------
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0

Funding
-------
Innovation Foundation Denmark through the DABAI project
"""
from keras.applications import mobilenet
from keras.applications import densenet
from keras.applications import inception_resnet_v2
import keras_squeezenet as squeezenet
from keras.preprocessing import image
from keras.applications import resnet50
import numpy as np
import cv2
try:
    from functools32 import lru_cache
except ImportError:
    from functools import lru_cache
import requests
from six import u
from time import time
from unidecode import unidecode
QUERY = """
SELECT ?item ?prefix ?synset WHERE {
?item wdt:P2888 ?uri .
BIND (SUBSTR(STR(?uri), 1, 38) AS ?prefix)
BIND (SUBSTR(STR(?uri), 39) AS ?synset)
FILTER (?prefix = "http://wordnet-rdf.princeton.edu/wn30/")
}
"""
def synset_to_uri(synset):
    return "http://wordnet-rdf.princeton.edu/wn30/{}-n".format(synset[1:])
SYNSET_SPARQL = """
SELECT ?item ?itemLabel WHERE {{
?item wdt:P2888 <http://wordnet-rdf.princeton.edu/wn30/{}-n>
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{}". }}
}}
"""
@lru_cache(maxsize=1000)
def synset_to_label(synset, language='da'):
    query = SYNSET_SPARQL.format(synset[1:], language)
    url = 'https://query.wikidata.org/sparql'
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params)
    data = response.json()
    labels = [item['itemLabel']['value']
              for item in data['results']['bindings']]
    if len(labels) > 0:
        return labels[0]
    else:
        return "???"
def unicode_to_ascii(text):
    encoded = ''
    for character in text:
        if character == u('\xe5'):
            encoded += 'aa'
        elif character == u('\xe6'):
            encoded += 'ae'
        elif character == u('\xf8'):
            encoded += 'oe'
        elif character == u('\xf6'):
            encoded += 'oe'
        elif character == u('\xe4'):
            encoded += 'ae'
        elif character == u('\xfc'):
            encoded += 'u'
        else:
            encoded += character
    return unidecode(encoded)
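# For example, unicode_to_ascii(u('bl\xe6ksprutte')) gives
# 'blaeksprutte'; the special-cased Danish and German characters are
# transliterated so they survive OpenCV's ASCII-only Hershey fonts.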
model_module = mobilenet
preprocess_input = model_module.preprocess_input
decode_predictions = model_module.decode_predictions
model_name = model_module.__name__.split('.')[-1]
if model_name == 'resnet50':
    model = model_module.ResNet50()
    model_image_size = 224
elif model_name == 'squeezenet':
    model = model_module.SqueezeNet()
    model_image_size = 227
elif model_name == 'mobilenet':
    model = model_module.MobileNet()
    model_image_size = 224
elif model_name == 'densenet':
    model = model_module.DenseNet121()
    model_image_size = 224
elif model_name == 'inceptionresnetv2':
    model = model_module.InceptionResNetV2()
    model_image_size = 299
else:
    assert False
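# To try another backbone, change `model_module` above, e.g.
# `model_module = resnet50` or `model_module = squeezenet`. Note that
# `model_name` is derived from the module name, so for modules whose
# name differs from the strings checked above (e.g. `keras_squeezenet`
# or `inception_resnet_v2`) the string comparisons may need adjusting.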
step = 3
font = cv2.FONT_HERSHEY_PLAIN
text_position = (10, 500)
font_scale = 1
font_color = (255, 255, 255)
line_type = 1
synset_to_label.cache_clear()
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
# Camera view and screen size may not fit. Fullscreen is disabled for
# now.
# cv2.namedWindow("frame", cv2.WND_PROP_FULLSCREEN)
# cv2.setWindowProperty("frame", cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)
capturer = cv2.VideoCapture(0)
capturer.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
capturer.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
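# In the loop below the frame is centre-cropped and subsampled: with
# `step` = 3 and `model_image_size` = 224 (MobileNet) the crop covers
# 224 * 3 = 672 pixels of the 1080-pixel-high frame, and every third
# pixel is fed to the network.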
previous_time = 0
while True:
    # Capture frame-by-frame
    ret, frame = capturer.read()
    # Preprocess image
    x_offset = (frame.shape[0] - model_image_size * step) // 2
    y_offset = (frame.shape[1] - model_image_size * step) // 2
    x_cropped = frame[x_offset:x_offset + model_image_size * step:step,
                      y_offset:y_offset + model_image_size * step:step, :]
    x = np.expand_dims(x_cropped, axis=0).astype('float32')
    x = preprocess_input(x)
    # Forward in neural network
    predictions = model.predict(x)
    # Convert predictions
    decoded = decode_predictions(predictions)
    # Attempt to label
    label = synset_to_label(decoded[0][0][0], language='da')
    # Size of font depends on probability
    size = int(decoded[0][0][2] * 4)
    # If the label is not found some information is printed on the terminal
    if label == '???':
        message = "{} - http://image-net.org/explore.php?wnid={} - {}"
        print(message.format(synset_to_uri(decoded[0][0][0]),
                             decoded[0][0][0],
                             decoded[0][0][1]))
        label = '(' + decoded[0][0][1] + ')'
    if label.startswith('Q'):
        print("https://www.wikidata.org/wiki/" + label)
        label = '(' + decoded[0][0][1] + ')'
    # Add the label to the image
    _ = cv2.putText(frame, unicode_to_ascii(label),
                    text_position, font, font_scale + size, (0, 0, 0), 3)
    _ = cv2.putText(frame, unicode_to_ascii(label),
                    text_position, font, font_scale + size,
                    font_color, line_type)
    # Show the image on the screen
    _ = cv2.imshow('frame', frame)
    # Break if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    # Enable for simple benchmarking
    if False:
        now_time = time()
        print(now_time - previous_time)
        previous_time = now_time
capturer.release()
cv2.destroyAllWindows()