haotian-liu/1_eval.py

## 1_eval.py
# Evaluation code for GQA.
# Computes a suite of metrics such as accuracy, consistency, plausibility and scores per question type and length.
# Visit https://gqadataset.org/ for all information about the dataset, including examples, visualizations, paper and slides.
#
#
# Metrics:
# - Accuracy: Standard accuracy, computed over the balanced version of the dataset, which is more robust against
#             cheating by making educated guesses. For each question-answer pair (q,a), we give 1 point if the
#             predicted answer p matches a and 0 otherwise, and average over all questions in the dataset.
#
# - Consistency: A metric for the level of model's consistency across different questions. For each question-answer
#                pair (q,a), we define a set Eq={q1, q2, ..., qn} of entailed questions, the answers to which can
#                be unambiguously inferred given (q,a).
#                Denote Q the set of all questions the model answered correctly. For each question q in Q, we
#                measure the model's accuracy over the entailed questions Eq to get the score sq and finally
#                average these results across all questions in Q.
#
# - Validity: Measures whether the model gives a "valid" answer - one that can theoretically be an answer
#             to the question (e.g. a color to a color question, yes/no to a binary question etc.).
#             We provide a set of valid answers to each questions over the final answer vocabulary, in
#             the choices file, and use it to compute average validity across the dataset.
#
# - Plausibility: Measures whether the model answers are plausible, e.g. one that make sense in the real world,
#                 e.g. not answering "purple" to a question about apple color (unless it's really purple).
#                 We provide a set of all plausible answers to each questions, computed by looking at all
#                 attributes and relations hold for various objects throughout the whole dataset scene graphs,
#                 and use it to compute average model plausibility across the data.
#
# - Grounding: Only for attention models. Measures whether the model looks at the relevant regions in the
#              image when answering a question. Each question in the dataset is annotated with the visual regions
#              they refer to, which are then used to compute the level to which the model has a correct visual attention,
#              which will allow to identify whether it really answers based on the image of by language-based guesses.
#              Supports both spatial features and object-based features.
#
# - Distribution: Measures the overall match between the true answer distribution for different questions,
#                 vs the overall distribution predicted by the model through its answers for all the data.
#                 We use chi-square statistic to measure the degree of similarity between the distributions,
#                 giving indication to the level of overall world-knowledge of the model
#
# - Accuracy per type: accuracy per question structural types (logic, compare, choose), and semantic type
#                      (questions about attributes, relations, categories, objects or the whole scene).
#
# - Accuracy for length: accuracy as a function of the question length, in terms of (1) words number, and semantic
#                        complexity - number of reasoning steps.
#
# We may support additional metrics (e.g. coverage) in the future.
#
#
# Files format:
# - predictions file format: JSON array: [{"questionId": str, "prediction": str}]
# - attentions file format: JSON array:
#   Spatial attention: [{"questionId": str, "attention": [mapSize x mapSize: float] }].
#   Object-based attention:[{"questionId": str, "attention": [[x0, y0, x1, y1, float] x #regions] }]. 0 < x,y < 1.
# - questions and choices files are provided as part of the dataset.
#   see https://gqadataset.org/download.html for information about their format.
#
#
# If you have any questions or comments, please feel free to send an email,
# at dorarad@cs.stanford.edu. We hope you'll enjoy using the GQA dataset! :)
#
#

from collections import defaultdict
from tqdm import tqdm
import argparse
import os.path
import glob
import json
import math

##### Arguments
##########################################################################################

parser = argparse.ArgumentParser()
parser.add_argument('--tier',           default = "val",                     type = str,    help = "Tier, e.g. train, val")
parser.add_argument('--scenes',         default="{tier}_sceneGraphs.json",   type = str,    help = "Scene graphs file name format.")
parser.add_argument('--questions',      default="{tier}_questions.json", type = str,    help = "Questions file name format.")
parser.add_argument('--choices',        default="{tier}_choices.json",       type = str,    help = "Choices file name format.")
parser.add_argument('--predictions',    default="{tier}_predictions.json",   type = str,    help = "Answers file name format.")
parser.add_argument('--attentions',     default="{tier}_attentions.json",    type = str,    help = "Attentions file name format.")
parser.add_argument('--consistency',    action="store_true",        help = "True to compute consistency score (Need to provide answers to questions in val_all_questions.json).")
parser.add_argument('--grounding',      action="store_true",        help = "True to compute grounding score (If model uses attention).")
parser.add_argument('--objectFeatures', action="store_true",        help = "True for object-based attention (False for spatial).")
parser.add_argument('--mapSize',    default = 7,    type = int, help = "Optional, only to get attention score. Images features map size, mapSize * mapSize")
args = parser.parse_args()

print("Please make sure to use our provided visual features as gqadataset.org for better comparability. We provide both spatial and object-based features trained on GQA train set.")
print("In particular please avoid using features from https://github.com/peteanderson80/bottom-up-attention since they were trained on images contained in the GQA validation set and thus may give false scores improvement.\n")

if not args.consistency:
    print("Please consider using --consistency to compute consistency scores for entailed questions.")
    print("If you do so, please provide answers to all questions in val_all_questions.json.\n")

if not args.grounding:
    print("Please consider using --grounding to compute attention scores.")
    print("If you do so, please provide attention maps through --attentions.\n")


##### Files Loading
##########################################################################################

def loadFile(name):
    # load standard json file
    if os.path.isfile(name):
        with open(name) as file:
            data = json.load(file)
    # load file chunks if too big
    elif os.path.isdir(name.split(".")[0]):
        data = {}
        chunks = glob.glob('{dir}/{dir}_*.{ext}'.format(dir = name.split(".")[0], ext = name.split(".")[1]))
        for chunk in chunks:
            with open(chunk) as file:
                data.update(json.load(file))
    else:
        raise Exception("Can't find {}".format(name))
    return data

# Load scene graphs
# print("Loading scene graphs...")
# scenes = loadFile(args.scenes.format(tier = args.tier))

# Load questions
print("Loading questions...")
questions = loadFile(args.questions.format(tier = args.tier))

# # Load choices
# print("Loading choices...")
# choices = loadFile(args.choices.format(tier = args.tier))

# Load predictions and turn them into a dictionary
print("Loading predictions...")
predictions = loadFile(args.predictions.format(tier = args.tier))
predictions = {p["questionId"]: p["prediction"] for p in predictions}

# Make sure all question have predictions
for qid in questions:
    if (qid not in predictions) and (args.consistency or questions[qid]["isBalanced"]):
        print("no prediction for question {}. Please add prediction for all questions.".format(qid))
        raise Exception("missing predictions")

# Load attentions and turn them into a dictionary
attentions = None
if args.grounding:
    with open(args.attentions.format(tier = args.tier)) as attentionsFile:
        attentions = json.load(attentionsFile)
        attentions = {a["questionId"]: a["attention"] for a in attentions}

##### Scores data structures initialization
##########################################################################################

# book to float
def toScore(b):
    return float(1 if b else 0)

# Compute average of a list
def avg(l):
    if len(l) == 0:
        return 0
    return float(sum(l)) / len(l)

def wavg(l, w):
    if sum(w) == 0:
        return None
    return float(sum(l[i] * w[i] for i in range(len(l)))) / sum(w)

# Initialize data structure to track all metrics: e.g. accuracy, validity and plausibility, as well as
# accuracy per question type, length and number of reasoning steps.
scores = {
    "accuracy": [], # list of accuracies per question (1 if correct else 0). Will be averaged ultimately.
    "binary": [], # list of accuracies per a binary question (1 if correct else 0). Will be averaged ultimately.
    "open": [], # list of accuracies per an open question (1 if correct else 0). Will be averaged ultimately.
    "validity": [], # list of validity per question (1 if valid else 0).
    "plausibility": [], # list of plausibility per question (1 if plausible else 0).
    "consistency": [], # list of consistency scores for entailed questions.
    "accuracyPerStructuralType": defaultdict(list), # list of question accuracies for each structural type (e.g. compare, logic questions).
    "accuracyPerSemanticType": defaultdict(list), # list of question accuracies for each semantic type (e.g. questions about an object, an attribute, a relation).
    "accuracyPerLength": defaultdict(list), # list of question accuracies per question's word number.
    "accuracyPerSteps": defaultdict(list), # list of question accuracies per question's reasoning length (steps number).
    "grounding": [] # list of grounding scores for each question.
}

# Initialize golden and predicted histograms per each question group. Used to compute the distribution metric.
dist = {
    "gold": defaultdict(lambda: defaultdict(int)),
    "predicted": defaultdict(lambda: defaultdict(int))
}


##### Question lengths - words numbers and reasoning steps number
##########################################################################################

# Compute question length (words number)
def getWordsNum(question):
    return len(question["question"].split())

# Compute number of reasoning steps (excluding the final "querying" step which doesn't increase effective reasoning length)
def getStepsNum(question):
    return len([c for c in question["semantic"] if not (any([o in "{}: {}".format(c["operation"], c["argument"])
        for o in ["exist", "query: name", "choose name"]]))])


##### Functions for question annotations
##########################################################################################

# Utility function for converting question annotations string keys to slices
def toSlice(strSlice):
    sliceLims = (int(n) for n in strSlice.split(':'))
    return apply(slice, sliceLims)

# Utility function for converting question annotations string keys to indexes list:
# "1" => [0]
# "1:3" => [1, 2]
# "4:9:2" => [4, 6, 8]
def intsFromSlice(strSlice):
    slice_obj = get_slice_obj(slicearg)
    return(range(slice_obj.start or 0, slice_obj.stop or -1, slice_obj.step or 1))

##### Functions for validity and plausibility
##########################################################################################

def belongs(element, group, question):
    # normalization ()
    if "Common" in question["types"]["detailed"]:
        group = ["color", "material", "shape"]

    return element in group

##### Functions for consistency scores (for entailed questions ("inferred"))
##########################################################################################

def updateConsistency(questionId, question, questions):
    inferredQuestions = [eid for eid in question["entailed"] if eid != questionId]

    if correct and len(inferredQuestions) > 0:

        cosnsitencyScores = []
        for eid in inferredQuestions:
            gold = questions[eid]["answer"]
            predicted = predictions[eid]
            score = toScore(predicted == gold)
            cosnsitencyScores.append(score)

        scores["consistency"].append(avg(cosnsitencyScores))

##### Functions for grounding score (optional, only for attention models)
##########################################################################################

# Utility functions for working with bounding boxes.
# c = (x0, y0, x1, y1), r = (r0, r1)

def yrange(c):
    return (c[1], c[3])

def xrange(c):
    return (c[0], c[2])

def length(r):
    if r is None:
        return 0
    return float(r[1] - r[0])

def size(c):
    return length(xrange(c)) * length(yrange(c))

def intersection(r1, r2):
    ir = (max(r1[0], r2[0]), min(r1[1], r2[1]))
    if ir[1] > ir[0]:
        return ir
    return None

def intersectionSize(c1, c2):
    return length(intersection(xrange(c1), xrange(c2))) * length(intersection(yrange(c1), yrange(c2)))

def intersectionRate(c1, c2):
    return float(intersectionSize(c1, c2)) / size(c1)

# Get spatial cell
def getCell(i, j):
    edge = float(1) / args.mapSize
    return (edge * i, edge * j, edge * (i + 1), edge * (j + 1))

# Get bounding box of objectId in sceneGraph
def getRegion(sceneGraph, objectId):
    obj = sceneGraph["objects"][objectId]
    x0 = float(obj["x"]) / sceneGraph["width"]
    y0 = float(obj["y"]) / sceneGraph["height"]
    x1 = float(obj["x"] + obj["w"]) / sceneGraph["width"]
    y1 = float(obj["y"] + obj["h"]) / sceneGraph["height"]
    return (x0, y0, x1, y1)

# Compute grounding score. Computer amount of attention (probability) given to each of the regions
# the question and answers refer to.
def computeGroundingScore(question, sceneGraph, attentionMap):
    ## prepare gold regions
    regions = []
    # add question regions
    regions += [getRegion(sceneGraph, pointer) for pointer in question["annotations"]["question"].values()]
    # add answer regions
    regions += [getRegion(sceneGraph, pointer) for pointer in question["annotations"]["fullAnswer"].values()]
    # add all the image if the question refers to the whole scene
    if any(("scene" in c) for c in question["semantic"]):
        regions.append((0, 0, 1, 1))

    # prepare attention map
    if args.objectFeatures:
        cells = [((x0, y0, x1, y1), attention) for x0, y0, x1, y1, attention in cells]
    else:
        cells = [(getCell(i, j), attentionMap[i][j]) for i in range(args.mapSize) for j in range(args.mapSize)]

    # compare attention map to gold regions
    scores = []
    for region in regions:
        for cell, attention in cells:
            scores.append(attention * intersectionRate(cell, region))
    return sum(scores)

##### Functions for distribution score
##########################################################################################

# Compute chi square statistic of gold distribution vs predicted distribution,
# averaged over all question groups
def chiSquare(goldDist, predictedDist):
    sumScore, sumOverall = 0, 0

    for group in goldDist:
        score, overall = 0, 0

        for ans in goldDist[group]:
            e = goldDist[group][ans]
            o = predictedDist[group].get(ans, 0)
            score += ((float(o - e) ** 2) / e)
            overall += goldDist[group][ans]

        sumScore += score * overall
        sumOverall += overall

    avgScore = float(sumScore) / sumOverall

    return avgScore


##### Main score computation
##########################################################################################

# Loop over the questions and compute mterics
for qid, question in tqdm(questions.items()):
    gold = question["answer"]
    predicted = predictions[qid]

    correct = (predicted == gold)
    score = toScore(correct)

    wordsNum = getWordsNum(question)
    stepsNum = getStepsNum(question)

    # Compute scores over the balanced dataset (more robust against cheating by making educated guesses)
    if question["isBalanced"]:
        # Update accuracy
        scores["accuracy"].append(score)
        scores["accuracyPerLength"][wordsNum].append(score)
        scores["accuracyPerSteps"][stepsNum].append(score)
        scores["accuracyPerStructuralType"][question["types"]["structural"]].append(score)
        scores["accuracyPerSemanticType"][question["types"]["semantic"]].append(score)
        answerType = "open" if question["types"]["structural"] == "query" else "binary"
        scores[answerType].append(score)

        # # Update validity score
        # valid = belongs(predicted, choices[qid]["valid"], question)
        # scores["validity"].append(toScore(valid))

        # # Update plausibility score
        # plausible = belongs(predicted, choices[qid]["plausible"], question)
        # scores["plausibility"].append(toScore(plausible))

        # Optionally compute grounding (attention) score
        # if attentions is not None:
        #     groundingScore = computeGroundingScore(question, scenes[question["imageId"]], attentions[qid])
        #     if groundingScore is not None:
        #         scores["grounding"].append(groundingScore)

        # Update histograms for gold and predicted answers
        globalGroup = question["groups"]["global"]
        if globalGroup is not None:
            dist["gold"][globalGroup][gold] += 1
            dist["predicted"][globalGroup][predicted] += 1

        # Compute consistency (for entailed questions)
        # updateConsistency(qid, question, questions)

# Compute distribution score
scores["distribution"] = chiSquare(dist["gold"], dist["predicted"]) / 100

# Average scores over all questions (in the balanced dataset) and print scores

metrics = [
    "binary",
    "open",
    "accuracy",
    "consistency",
    "validity",
    "plausibility",
    "grounding",
    "distribution"
]

detailedMetrics = [
    ("accuracyPerStructuralType", "Accuracy / structural type"),
    ("accuracyPerSemanticType", "Accuracy / semantic type"),
    ("accuracyPerSteps", "Accuracy / steps number"),
    ("accuracyPerLength", "Accuracy / words number")
]

subMetrics = {
    "attr": "attribute",
    "cat": "category",
    "global": "scene",
    "obj": "object",
    "rel": "relation"
}
# average
for k in metrics:
    if isinstance(scores[k], list):
        scores[k] = avg(scores[k]) * 100

for k, _ in detailedMetrics:
    for t in scores[k]:
        scores[k][t] = avg(scores[k][t]) * 100, len(scores[k][t])

# print
print("")
for m in metrics:
    # skip grounding and consistency scores if not requested
    if m == "grounding" and not args.grounding:
        continue
    if m == "consistency" and not args.consistency:
        continue

    # print score
    print("{title}: {score:.2f}{suffix}".format(title = m.capitalize(), score = scores[m],
        suffix = " (lower is better)" if m == "distribution" else "%"))

for m, mPrintName in detailedMetrics:
    print("")
    # print metric title
    print("{}:".format(mPrintName))

    for t in sorted(list(scores[m].keys())):
        # set sub-metric title
        tName = t
        if isinstance(scores[k], list):
            tName = subMetrics.get(t, t).capitalize()

        # print score
        print("  {title}: {score:.2f}{suffix} ({amount} questions)".format(title = tName,
            score = scores[m][t][0], suffix = "%", amount = scores[m][t][1]))

## 2_diff
diff official/eval.py eval.py
77c77
< parser.add_argument('--questions',      default="{tier}_all_questions.json", type = str,    help = "Questions file name format.")
---
> parser.add_argument('--questions',      default="{tier}_questions.json", type = str,    help = "Questions file name format.")
119,120c119,120
< print("Loading scene graphs...")
< scenes = loadFile(args.scenes.format(tier = args.tier))
---
> # print("Loading scene graphs...")
> # scenes = loadFile(args.scenes.format(tier = args.tier))
126,128c126,128
< # Load choices
< print("Loading choices...")
< choices = loadFile(args.choices.format(tier = args.tier))
---
> # # Load choices
> # print("Loading choices...")
> # choices = loadFile(args.choices.format(tier = args.tier))
367,373c367,373
<         # Update validity score
<         valid = belongs(predicted, choices[qid]["valid"], question)
<         scores["validity"].append(toScore(valid))
<
<         # Update plausibility score
<         plausible = belongs(predicted, choices[qid]["plausible"], question)
<         scores["plausibility"].append(toScore(plausible))
---
>         # # Update validity score
>         # valid = belongs(predicted, choices[qid]["valid"], question)
>         # scores["validity"].append(toScore(valid))
>
>         # # Update plausibility score
>         # plausible = belongs(predicted, choices[qid]["plausible"], question)
>         # scores["plausibility"].append(toScore(plausible))
376,379c376,379
<         if attentions is not None:
<             groundingScore = computeGroundingScore(question, scenes[question["imageId"]], attentions[qid])
<             if groundingScore is not None:
<                 scores["grounding"].append(groundingScore)
---
>         # if attentions is not None:
>         #     groundingScore = computeGroundingScore(question, scenes[question["imageId"]], attentions[qid])
>         #     if groundingScore is not None:
>         #         scores["grounding"].append(groundingScore)
388c388
<         updateConsistency(qid, question, questions)
---
>         # updateConsistency(qid, question, questions)
	# Evaluation code for GQA.
	# Computes a suite of metrics such as accuracy, consistency, plausibility and scores per question type and length.
	# Visit https://gqadataset.org/ for all information about the dataset, including examples, visualizations, paper and slides.
	#
	#
	# Metrics:
	# - Accuracy: Standard accuracy, computed over the balanced version of the dataset, which is more robust against
	# cheating by making educated guesses. For each question-answer pair (q,a), we give 1 point if the
	# predicted answer p matches a and 0 otherwise, and average over all questions in the dataset.
	#
	# - Consistency: A metric for the level of model's consistency across different questions. For each question-answer
	# pair (q,a), we define a set Eq={q1, q2, ..., qn} of entailed questions, the answers to which can
	# be unambiguously inferred given (q,a).
	# Denote Q the set of all questions the model answered correctly. For each question q in Q, we
	# measure the model's accuracy over the entailed questions Eq to get the score sq and finally
	# average these results across all questions in Q.
	#
	# - Validity: Measures whether the model gives a "valid" answer - one that can theoretically be an answer
	# to the question (e.g. a color to a color question, yes/no to a binary question etc.).
	# We provide a set of valid answers to each questions over the final answer vocabulary, in
	# the choices file, and use it to compute average validity across the dataset.
	#
	# - Plausibility: Measures whether the model answers are plausible, e.g. one that make sense in the real world,
	# e.g. not answering "purple" to a question about apple color (unless it's really purple).
	# We provide a set of all plausible answers to each questions, computed by looking at all
	# attributes and relations hold for various objects throughout the whole dataset scene graphs,
	# and use it to compute average model plausibility across the data.
	#
	# - Grounding: Only for attention models. Measures whether the model looks at the relevant regions in the
	# image when answering a question. Each question in the dataset is annotated with the visual regions
	# they refer to, which are then used to compute the level to which the model has a correct visual attention,
	# which will allow to identify whether it really answers based on the image of by language-based guesses.
	# Supports both spatial features and object-based features.
	#
	# - Distribution: Measures the overall match between the true answer distribution for different questions,
	# vs the overall distribution predicted by the model through its answers for all the data.
	# We use chi-square statistic to measure the degree of similarity between the distributions,
	# giving indication to the level of overall world-knowledge of the model
	#
	# - Accuracy per type: accuracy per question structural types (logic, compare, choose), and semantic type
	# (questions about attributes, relations, categories, objects or the whole scene).
	#
	# - Accuracy for length: accuracy as a function of the question length, in terms of (1) words number, and semantic
	# complexity - number of reasoning steps.
	#
	# We may support additional metrics (e.g. coverage) in the future.
	#
	#
	# Files format:
	# - predictions file format: JSON array: [{"questionId": str, "prediction": str}]
	# - attentions file format: JSON array:
	# Spatial attention: [{"questionId": str, "attention": [mapSize x mapSize: float] }].
	# Object-based attention:[{"questionId": str, "attention": [[x0, y0, x1, y1, float] x #regions] }]. 0 < x,y < 1.
	# - questions and choices files are provided as part of the dataset.
	# see https://gqadataset.org/download.html for information about their format.
	#
	#
	# If you have any questions or comments, please feel free to send an email,
	# at dorarad@cs.stanford.edu. We hope you'll enjoy using the GQA dataset! :)
	#
	#

	from collections import defaultdict
	from tqdm import tqdm
	import argparse
	import os.path
	import glob
	import json
	import math

	##### Arguments
	##########################################################################################

	parser = argparse.ArgumentParser()
	parser.add_argument('--tier', default = "val", type = str, help = "Tier, e.g. train, val")
	parser.add_argument('--scenes', default="{tier}_sceneGraphs.json", type = str, help = "Scene graphs file name format.")
	parser.add_argument('--questions', default="{tier}_questions.json", type = str, help = "Questions file name format.")
	parser.add_argument('--choices', default="{tier}_choices.json", type = str, help = "Choices file name format.")
	parser.add_argument('--predictions', default="{tier}_predictions.json", type = str, help = "Answers file name format.")
	parser.add_argument('--attentions', default="{tier}_attentions.json", type = str, help = "Attentions file name format.")
	parser.add_argument('--consistency', action="store_true", help = "True to compute consistency score (Need to provide answers to questions in val_all_questions.json).")
	parser.add_argument('--grounding', action="store_true", help = "True to compute grounding score (If model uses attention).")
	parser.add_argument('--objectFeatures', action="store_true", help = "True for object-based attention (False for spatial).")
	parser.add_argument('--mapSize', default = 7, type = int, help = "Optional, only to get attention score. Images features map size, mapSize * mapSize")
	args = parser.parse_args()

	print("Please make sure to use our provided visual features as gqadataset.org for better comparability. We provide both spatial and object-based features trained on GQA train set.")
	print("In particular please avoid using features from https://github.com/peteanderson80/bottom-up-attention since they were trained on images contained in the GQA validation set and thus may give false scores improvement.\n")

	if not args.consistency:
	print("Please consider using --consistency to compute consistency scores for entailed questions.")
	print("If you do so, please provide answers to all questions in val_all_questions.json.\n")

	if not args.grounding:
	print("Please consider using --grounding to compute attention scores.")
	print("If you do so, please provide attention maps through --attentions.\n")


	##### Files Loading
	##########################################################################################

	def loadFile(name):
	# load standard json file
	if os.path.isfile(name):
	with open(name) as file:
	data = json.load(file)
	# load file chunks if too big
	elif os.path.isdir(name.split(".")[0]):
	data = {}
	chunks = glob.glob('{dir}/{dir}_*.{ext}'.format(dir = name.split(".")[0], ext = name.split(".")[1]))
	for chunk in chunks:
	with open(chunk) as file:
	data.update(json.load(file))
	else:
	raise Exception("Can't find {}".format(name))
	return data

	# Load scene graphs
	# print("Loading scene graphs...")
	# scenes = loadFile(args.scenes.format(tier = args.tier))

	# Load questions
	print("Loading questions...")
	questions = loadFile(args.questions.format(tier = args.tier))

	# # Load choices
	# print("Loading choices...")
	# choices = loadFile(args.choices.format(tier = args.tier))

	# Load predictions and turn them into a dictionary
	print("Loading predictions...")
	predictions = loadFile(args.predictions.format(tier = args.tier))
	predictions = {p["questionId"]: p["prediction"] for p in predictions}

	# Make sure all question have predictions
	for qid in questions:
	if (qid not in predictions) and (args.consistency or questions[qid]["isBalanced"]):
	print("no prediction for question {}. Please add prediction for all questions.".format(qid))
	raise Exception("missing predictions")

	# Load attentions and turn them into a dictionary
	attentions = None
	if args.grounding:
	with open(args.attentions.format(tier = args.tier)) as attentionsFile:
	attentions = json.load(attentionsFile)
	attentions = {a["questionId"]: a["attention"] for a in attentions}

	##### Scores data structures initialization
	##########################################################################################

	# book to float
	def toScore(b):
	return float(1 if b else 0)

	# Compute average of a list
	def avg(l):
	if len(l) == 0:
	return 0
	return float(sum(l)) / len(l)

	def wavg(l, w):
	if sum(w) == 0:
	return None
	return float(sum(l[i] * w[i] for i in range(len(l)))) / sum(w)

	# Initialize data structure to track all metrics: e.g. accuracy, validity and plausibility, as well as
	# accuracy per question type, length and number of reasoning steps.
	scores = {
	"accuracy": [], # list of accuracies per question (1 if correct else 0). Will be averaged ultimately.
	"binary": [], # list of accuracies per a binary question (1 if correct else 0). Will be averaged ultimately.
	"open": [], # list of accuracies per an open question (1 if correct else 0). Will be averaged ultimately.
	"validity": [], # list of validity per question (1 if valid else 0).
	"plausibility": [], # list of plausibility per question (1 if plausible else 0).
	"consistency": [], # list of consistency scores for entailed questions.
	"accuracyPerStructuralType": defaultdict(list), # list of question accuracies for each structural type (e.g. compare, logic questions).
	"accuracyPerSemanticType": defaultdict(list), # list of question accuracies for each semantic type (e.g. questions about an object, an attribute, a relation).
	"accuracyPerLength": defaultdict(list), # list of question accuracies per question's word number.
	"accuracyPerSteps": defaultdict(list), # list of question accuracies per question's reasoning length (steps number).
	"grounding": [] # list of grounding scores for each question.
	}

	# Initialize golden and predicted histograms per each question group. Used to compute the distribution metric.
	dist = {
	"gold": defaultdict(lambda: defaultdict(int)),
	"predicted": defaultdict(lambda: defaultdict(int))
	}


	##### Question lengths - words numbers and reasoning steps number
	##########################################################################################

	# Compute question length (words number)
	def getWordsNum(question):
	return len(question["question"].split())

	# Compute number of reasoning steps (excluding the final "querying" step which doesn't increase effective reasoning length)
	def getStepsNum(question):
	return len([c for c in question["semantic"] if not (any([o in "{}: {}".format(c["operation"], c["argument"])
	for o in ["exist", "query: name", "choose name"]]))])


	##### Functions for question annotations
	##########################################################################################

	# Utility function for converting question annotations string keys to slices
	def toSlice(strSlice):
	sliceLims = (int(n) for n in strSlice.split(':'))
	return apply(slice, sliceLims)

	# Utility function for converting question annotations string keys to indexes list:
	# "1" => [0]
	# "1:3" => [1, 2]
	# "4:9:2" => [4, 6, 8]
	def intsFromSlice(strSlice):
	slice_obj = get_slice_obj(slicearg)
	return(range(slice_obj.start or 0, slice_obj.stop or -1, slice_obj.step or 1))

	##### Functions for validity and plausibility
	##########################################################################################

	def belongs(element, group, question):
	# normalization ()
	if "Common" in question["types"]["detailed"]:
	group = ["color", "material", "shape"]

	return element in group

	##### Functions for consistency scores (for entailed questions ("inferred"))
	##########################################################################################

	def updateConsistency(questionId, question, questions):
	inferredQuestions = [eid for eid in question["entailed"] if eid != questionId]

	if correct and len(inferredQuestions) > 0:

	cosnsitencyScores = []
	for eid in inferredQuestions:
	gold = questions[eid]["answer"]
	predicted = predictions[eid]
	score = toScore(predicted == gold)
	cosnsitencyScores.append(score)

	scores["consistency"].append(avg(cosnsitencyScores))

	##### Functions for grounding score (optional, only for attention models)
	##########################################################################################

	# Utility functions for working with bounding boxes.
	# c = (x0, y0, x1, y1), r = (r0, r1)

	def yrange(c):
	return (c[1], c[3])

	def xrange(c):
	return (c[0], c[2])

	def length(r):
	if r is None:
	return 0
	return float(r[1] - r[0])

	def size(c):
	return length(xrange(c)) * length(yrange(c))

	def intersection(r1, r2):
	ir = (max(r1[0], r2[0]), min(r1[1], r2[1]))
	if ir[1] > ir[0]:
	return ir
	return None

	def intersectionSize(c1, c2):
	return length(intersection(xrange(c1), xrange(c2))) * length(intersection(yrange(c1), yrange(c2)))

	def intersectionRate(c1, c2):
	return float(intersectionSize(c1, c2)) / size(c1)

	# Get spatial cell
	def getCell(i, j):
	edge = float(1) / args.mapSize
	return (edge * i, edge * j, edge * (i + 1), edge * (j + 1))

	# Get bounding box of objectId in sceneGraph
	def getRegion(sceneGraph, objectId):
	obj = sceneGraph["objects"][objectId]
	x0 = float(obj["x"]) / sceneGraph["width"]
	y0 = float(obj["y"]) / sceneGraph["height"]
	x1 = float(obj["x"] + obj["w"]) / sceneGraph["width"]
	y1 = float(obj["y"] + obj["h"]) / sceneGraph["height"]
	return (x0, y0, x1, y1)

	# Compute grounding score. Computer amount of attention (probability) given to each of the regions
	# the question and answers refer to.
	def computeGroundingScore(question, sceneGraph, attentionMap):
	## prepare gold regions
	regions = []
	# add question regions
	regions += [getRegion(sceneGraph, pointer) for pointer in question["annotations"]["question"].values()]
	# add answer regions
	regions += [getRegion(sceneGraph, pointer) for pointer in question["annotations"]["fullAnswer"].values()]
	# add all the image if the question refers to the whole scene
	if any(("scene" in c) for c in question["semantic"]):
	regions.append((0, 0, 1, 1))

	# prepare attention map
	if args.objectFeatures:
	cells = [((x0, y0, x1, y1), attention) for x0, y0, x1, y1, attention in cells]
	else:
	cells = [(getCell(i, j), attentionMap[i][j]) for i in range(args.mapSize) for j in range(args.mapSize)]

	# compare attention map to gold regions
	scores = []
	for region in regions:
	for cell, attention in cells:
	scores.append(attention * intersectionRate(cell, region))
	return sum(scores)

	##### Functions for distribution score
	##########################################################################################

	# Compute chi square statistic of gold distribution vs predicted distribution,
	# averaged over all question groups
	def chiSquare(goldDist, predictedDist):
	sumScore, sumOverall = 0, 0

	for group in goldDist:
	score, overall = 0, 0

	for ans in goldDist[group]:
	e = goldDist[group][ans]
	o = predictedDist[group].get(ans, 0)
	score += ((float(o - e) ** 2) / e)
	overall += goldDist[group][ans]

	sumScore += score * overall
	sumOverall += overall

	avgScore = float(sumScore) / sumOverall

	return avgScore


	##### Main score computation
	##########################################################################################

	# Loop over the questions and compute mterics
	for qid, question in tqdm(questions.items()):
	gold = question["answer"]
	predicted = predictions[qid]

	correct = (predicted == gold)
	score = toScore(correct)

	wordsNum = getWordsNum(question)
	stepsNum = getStepsNum(question)

	# Compute scores over the balanced dataset (more robust against cheating by making educated guesses)
	if question["isBalanced"]:
	# Update accuracy
	scores["accuracy"].append(score)
	scores["accuracyPerLength"][wordsNum].append(score)
	scores["accuracyPerSteps"][stepsNum].append(score)
	scores["accuracyPerStructuralType"][question["types"]["structural"]].append(score)
	scores["accuracyPerSemanticType"][question["types"]["semantic"]].append(score)
	answerType = "open" if question["types"]["structural"] == "query" else "binary"
	scores[answerType].append(score)

	# # Update validity score
	# valid = belongs(predicted, choices[qid]["valid"], question)
	# scores["validity"].append(toScore(valid))

	# # Update plausibility score
	# plausible = belongs(predicted, choices[qid]["plausible"], question)
	# scores["plausibility"].append(toScore(plausible))

	# Optionally compute grounding (attention) score
	# if attentions is not None:
	# groundingScore = computeGroundingScore(question, scenes[question["imageId"]], attentions[qid])
	# if groundingScore is not None:
	# scores["grounding"].append(groundingScore)

	# Update histograms for gold and predicted answers
	globalGroup = question["groups"]["global"]
	if globalGroup is not None:
	dist["gold"][globalGroup][gold] += 1
	dist["predicted"][globalGroup][predicted] += 1

	# Compute consistency (for entailed questions)
	# updateConsistency(qid, question, questions)

	# Compute distribution score
	scores["distribution"] = chiSquare(dist["gold"], dist["predicted"]) / 100

	# Average scores over all questions (in the balanced dataset) and print scores

	metrics = [
	"binary",
	"open",
	"accuracy",
	"consistency",
	"validity",
	"plausibility",
	"grounding",
	"distribution"
	]

	detailedMetrics = [
	("accuracyPerStructuralType", "Accuracy / structural type"),
	("accuracyPerSemanticType", "Accuracy / semantic type"),
	("accuracyPerSteps", "Accuracy / steps number"),
	("accuracyPerLength", "Accuracy / words number")
	]

	subMetrics = {
	"attr": "attribute",
	"cat": "category",
	"global": "scene",
	"obj": "object",
	"rel": "relation"
	}
	# average
	for k in metrics:
	if isinstance(scores[k], list):
	scores[k] = avg(scores[k]) * 100

	for k, _ in detailedMetrics:
	for t in scores[k]:
	scores[k][t] = avg(scores[k][t]) * 100, len(scores[k][t])

	# print
	print("")
	for m in metrics:
	# skip grounding and consistency scores if not requested
	if m == "grounding" and not args.grounding:
	continue
	if m == "consistency" and not args.consistency:
	continue

	# print score
	print("{title}: {score:.2f}{suffix}".format(title = m.capitalize(), score = scores[m],
	suffix = " (lower is better)" if m == "distribution" else "%"))

	for m, mPrintName in detailedMetrics:
	print("")
	# print metric title
	print("{}:".format(mPrintName))

	for t in sorted(list(scores[m].keys())):
	# set sub-metric title
	tName = t
	if isinstance(scores[k], list):
	tName = subMetrics.get(t, t).capitalize()

	# print score
	print(" {title}: {score:.2f}{suffix} ({amount} questions)".format(title = tName,
	score = scores[m][t][0], suffix = "%", amount = scores[m][t][1]))
	diff official/eval.py eval.py
	77c77
	< parser.add_argument('--questions', default="{tier}_all_questions.json", type = str, help = "Questions file name format.")
	---
	> parser.add_argument('--questions', default="{tier}_questions.json", type = str, help = "Questions file name format.")
	119,120c119,120
	< print("Loading scene graphs...")
	< scenes = loadFile(args.scenes.format(tier = args.tier))
	---
	> # print("Loading scene graphs...")
	> # scenes = loadFile(args.scenes.format(tier = args.tier))
	126,128c126,128
	< # Load choices
	< print("Loading choices...")
	< choices = loadFile(args.choices.format(tier = args.tier))
	---
	> # # Load choices
	> # print("Loading choices...")
	> # choices = loadFile(args.choices.format(tier = args.tier))
	367,373c367,373
	< # Update validity score
	< valid = belongs(predicted, choices[qid]["valid"], question)
	< scores["validity"].append(toScore(valid))
	<
	< # Update plausibility score
	< plausible = belongs(predicted, choices[qid]["plausible"], question)
	< scores["plausibility"].append(toScore(plausible))
	---
	> # # Update validity score
	> # valid = belongs(predicted, choices[qid]["valid"], question)
	> # scores["validity"].append(toScore(valid))
	>
	> # # Update plausibility score
	> # plausible = belongs(predicted, choices[qid]["plausible"], question)
	> # scores["plausibility"].append(toScore(plausible))
	376,379c376,379
	< if attentions is not None:
	< groundingScore = computeGroundingScore(question, scenes[question["imageId"]], attentions[qid])
	< if groundingScore is not None:
	< scores["grounding"].append(groundingScore)
	---
	> # if attentions is not None:
	> # groundingScore = computeGroundingScore(question, scenes[question["imageId"]], attentions[qid])
	> # if groundingScore is not None:
	> # scores["grounding"].append(groundingScore)
	388c388
	< updateConsistency(qid, question, questions)
	---
	> # updateConsistency(qid, question, questions)