212 lines
6.2 KiB
Python
212 lines
6.2 KiB
Python
|
"""
|
||
|
Author: Mateusz Malinowski
|
||
|
Email: mmalinow@mpi-inf.mpg.de
|
||
|
|
||
|
The script assumes there are two files
|
||
|
- first file with ground truth answers
|
||
|
- second file with predicted answers
|
||
|
both answers are line-aligned
|
||
|
|
||
|
The script also assumes that answer items are comma separated.
|
||
|
For instance, chair,table,window
|
||
|
|
||
|
It is also a set measure, so not exactly the same as accuracy
|
||
|
even if dirac measure is used since {book,book}=={book}, also {book,chair}={chair,book}
|
||
|
|
||
|
Logs:
|
||
|
05.09.2015 - white spaces surrounding words are stripped away so that {book, chair}={book,chair}
|
||
|
"""
|
||
|
|
||
|
import sys
|
||
|
|
||
|
#import enchant
|
||
|
|
||
|
from numpy import prod
|
||
|
from nltk.corpus import wordnet as wn
|
||
|
from tqdm import tqdm
|
||
|
|
||
|
def file2list(filepath):
    """
    Read a text file and return its non-empty lines.

    Each line is stripped of surrounding whitespace (including the trailing
    newline); lines that are empty after stripping are left out.
    """
    kept = []
    with open(filepath, 'r') as handle:
        for raw_line in handle.readlines():
            stripped = raw_line.strip()
            if len(stripped) > 0:
                kept.append(stripped)
    return kept
|
||
|
|
||
|
|
||
|
def list2file(filepath, mylist):
    """
    Write the strings of mylist to filepath, separated by newlines.

    No newline is written after the last item. The text is assembled before
    the file is opened for writing.
    """
    content = '\n'.join(mylist)
    with open(filepath, 'w') as out:
        out.write(content)
|
||
|
|
||
|
|
||
|
def items2list(x):
    """
    Split an answer string into its individual items.

    x - string of comma-separated answer items

    Whitespace around every item is removed, so 'book, chair' and
    'book,chair' give the same list.
    """
    pieces = x.split(',')
    return list(map(str.strip, pieces))
|
||
|
|
||
|
|
||
|
def fuzzy_set_membership_measure(x, A, m):
    r"""
    Fuzzy set membership measure.

    x: element
    A: collection of elements (list, tuple, set, ...)
    m: point-wise element-to-element measure m(a,b) ~ similarity(a,b)

    This function implements a fuzzy set membership measure:
        m(x \in A) = max_{a \in A} m(x,a)

    Returns 0 when A is empty, otherwise the largest m(x,a) over a in A.
    """
    # default=0 handles every kind of empty collection; comparing against the
    # literal [] missed empty tuples/sets and let max() raise ValueError.
    return max((m(x, a) for a in A), default=0)
|
||
|
|
||
|
|
||
|
def score_it(A, T, m):
    r"""
    Fuzzy accuracy score between two answers.

    A: list of A items
    T: list of T items
    m: set membership measure;
       m(a, S) gives a membership quality of a into the collection S

    This function implements a fuzzy accuracy score:
        score(A,T) = min{prod_{a \in A} m(a \in T), prod_{t \in T} m(t \in A)}
    where A and T are set representations of the answers
    and m is a measure.

    Returns 1 when both answers are empty and 0 when exactly one of them is
    empty; otherwise the smaller of the two products.
    """
    if not A and not T:
        return 1

    # Emptiness is tested by truthiness rather than `== []`, so an empty
    # tuple is also treated as empty; otherwise the product over nothing
    # (which is 1.0) would count an empty answer as a perfect match.
    score_left = prod([m(a, T) for a in A]) if A else 0
    score_right = prod([m(t, A) for t in T]) if T else 0
    return min(score_left, score_right)
|
||
|
|
||
|
|
||
|
# implementations of different measure functions
|
||
|
def dirac_measure(a, b):
    """
    Exact-match measure: 1.0 when a equals b and 0.0 otherwise.

    An empty list on either side never counts as a match, so comparing two
    empty lists also gives 0.0.
    """
    either_empty = a == [] or b == []
    return 0.0 if either_empty else float(a == b)
|
||
|
|
||
|
|
||
|
def wup_measure(a, b, similarity_threshold=0.925):
    r"""
    Returns Wu-Palmer similarity score.

    More specifically, it computes:
        max_{x \in interp(a)} max_{y \in interp(b)} wup(x,y)
    where interp is an 'interpretation field': here, the WordNet noun
    synsets of the word.

    a, b: answer words to compare
    similarity_threshold: when the best synset-pair score is below this
        value, the result is multiplied by 0.1

    Returns 1.0 when a == b, 0 when either argument is an empty list or has
    no noun synsets, and otherwise the best Wu-Palmer similarity over all
    synset pairs (down-weighted if it is below similarity_threshold).
    """
    def get_semantic_field(word):
        # Interpretation field of a word: its noun synsets, with unit weight.
        weight = 1.0
        semantic_field = wn.synsets(word, pos=wn.NOUN)
        return (semantic_field, weight)

    def get_stem_word(word):
        r"""
        Sometimes answer has form word\d+:wordid.

        NOTE: no stemming or down-weighting is implemented here; the word
        is returned unchanged together with a weight of 1.0.
        """
        weight = 1.0
        return (word, weight)

    (a, global_weight_a) = get_stem_word(a)
    (b, global_weight_b) = get_stem_word(b)
    global_weight = min(global_weight_a, global_weight_b)

    if a == b:
        # they are the same
        return 1.0 * global_weight

    if a == [] or b == []:
        return 0

    interp_a, weight_a = get_semantic_field(a)
    interp_b, weight_b = get_semantic_field(b)

    if not interp_a or not interp_b:
        return 0

    # we take the most optimistic interpretation
    global_max = 0.0
    for x in interp_a:
        for y in interp_b:
            local_score = x.wup_similarity(y)
            # wup_similarity is documented to return None when no connecting
            # path exists; comparing None with a float raises TypeError.
            if local_score is not None and local_score > global_max:
                global_max = local_score

    # we need to use the semantic fields and therefore we downweight
    # unless the score is high which indicates both are synonyms
    interp_weight = 0.1 if global_max < similarity_threshold else 1.0

    final_score = global_max * weight_a * weight_b * interp_weight * global_weight
    return final_score
|
||
|
###
|
||
|
|
||
|
|
||
|
def get_scores(input_gt, input_pred, threshold_0=0.0, threshold_1=0.9):
    """
    Compute accuracy and two WUPS scores for line-aligned answers.

    input_gt: iterable of ground-truth answer strings (comma-separated items)
    input_pred: iterable of predicted answer strings, aligned with input_gt
    threshold_0: similarity threshold passed to wup_measure for the first
        WUPS score
    threshold_1: similarity threshold passed to wup_measure for the second
        WUPS score

    Returns the tuple (accuracy, WUPS@threshold_0, WUPS@threshold_1); each
    value is the mean per-answer score multiplied by 100.

    Answers are paired with zip, so a longer input is silently truncated to
    the length of the shorter one. Raises ZeroDivisionError when there are
    no answer pairs.
    """
    # Parse and materialise the pairs once: they are scored three times,
    # which would exhaust a one-shot iterator after the first pass, and a
    # list lets tqdm display a total.
    answer_pairs = [(items2list(ta), items2list(pa))
                    for (ta, pa) in zip(input_gt, input_pred)]

    final_score_acc = _mean_set_score(
        answer_pairs, dirac_measure, 'Computing Acc')
    final_score_wups_0 = _mean_set_score(
        answer_pairs,
        lambda x, y: wup_measure(x, y, threshold_0),
        'Computing Wups_{0}'.format(threshold_0))
    final_score_wups_1 = _mean_set_score(
        answer_pairs,
        lambda x, y: wup_measure(x, y, threshold_1),
        'Computing Wups_{0}'.format(threshold_1))

    return final_score_acc, final_score_wups_0, final_score_wups_1


def _mean_set_score(answer_pairs, element_measure, description):
    """
    Mean score_it value over answer_pairs, as a percentage.

    answer_pairs: list of (ground-truth items, predicted items) pairs
    element_measure: element-to-element measure m(a,b)
    description: label shown on the tqdm progress bar
    """
    def set_membership(x, A):
        return fuzzy_set_membership_measure(x, A, element_measure)

    pbar = tqdm(answer_pairs)
    pbar.set_description(description)
    scores = [score_it(truth, pred, set_membership) for (truth, pred) in pbar]
    return float(sum(scores)) / float(len(scores)) * 100.0
|
||
|
|
||
|
def get_acc(gts, preds):
    """
    Percentage of positions at which gts and preds hold equal values.

    gts and preds must have the same length (checked with an assert).
    """
    assert len(gts) == len(preds)
    matches = sum(1 for truth, guess in zip(gts, preds) if truth == guess)
    return 100.0 * float(matches / len(gts))
|