"""
Author: Mateusz Malinowski
Email: mmalinow@mpi-inf.mpg.de

The script assumes there are two files
- first file with ground truth answers
- second file with predicted answers
both answers are line-aligned

The script also assumes that answer items are comma separated.
For instance, chair,table,window

It is also a set measure, so not exactly the same as accuracy
even if dirac measure is used since {book,book}=={book}, also {book,chair}={chair,book}

Logs:
    05.09.2015 - white spaces surrounding words are stripped away so that {book, chair}={book,chair}
"""

import sys

#import enchant
from numpy import prod
from nltk.corpus import wordnet as wn
from tqdm import tqdm


def file2list(filepath):
    """
    Read a text file and return its non-empty lines.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped.
    """
    with open(filepath, 'r') as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if len(line) > 0]


def list2file(filepath, mylist):
    """
    Write the strings of mylist to filepath, separated by newlines.

    No trailing newline is written after the last item.
    """
    with open(filepath, 'w') as f:
        # a single write of the joined text; writelines() on a string would
        # iterate it character by character
        f.write('\n'.join(mylist))


def items2list(x):
    """
    x - string of comma-separated answer items

    Returns the list of items with surrounding whitespace stripped.
    Note that an empty string yields [''] and not [].
    """
    return [item.strip() for item in x.split(',')]


def fuzzy_set_membership_measure(x, A, m):
    r"""
    Set membership measure.
    x: element
    A: set of elements (given as a list)
    m: point-wise element-to-element measure m(a,b) ~ similarity(a,b)

    This function implements a fuzzy set membership measure:
        m(x \in A) = max_{a \in A} m(x,a)
    and returns 0 when A is the empty list.
    """
    if A == []:
        return 0
    return max(m(x, a) for a in A)


def score_it(A, T, m):
    r"""
    A: list of A items
    T: list of T items
    m: set membership measure
        m(a \in A) gives a membership quality of a into A

    This function implements a fuzzy accuracy score:
        score(A,T) = min{prod_{a \in A} m(a \in T), prod_{t \in T} m(t \in A)}
        where A and T are set representations of the answers
        and m is a measure

    Returns 1 when both lists are empty and 0 when exactly one of them is.
    """
    if A == [] and T == []:
        return 1

    score_left = 0 if A == [] else prod([m(a, T) for a in A])
    score_right = 0 if T == [] else prod([m(t, A) for t in T])
    return min(score_left, score_right)


# implementations of different measure functions
def dirac_measure(a, b):
    """
    Returns 1 iff a=b and 0 otherwise.
    An empty list on either side always gives 0.
    """
    if a == [] or b == []:
        return 0.0
    return float(a == b)


def wup_measure(a, b, similarity_threshold=0.925):
    r"""
    Returns Wu-Palmer similarity score.
    More specifically, it computes:
        max_{x \in interp(a)} max_{y \in interp(b)} wup(x,y)
        where interp is a 'interpretation field'

    a, b: the two words to compare
    similarity_threshold: best-pair similarities below this value are
        down-weighted by a factor of 0.1

    Returns 1.0 for identical words, 0 when either word is an empty list or
    has no noun synsets, and the (possibly down-weighted) best similarity
    otherwise.
    """
    def get_semantic_field(a):
        # all noun synsets of the word, with a neutral weight
        weight = 1.0
        semantic_field = wn.synsets(a, pos=wn.NOUN)
        return (semantic_field, weight)

    def get_stem_word(a):
        r"""
        Sometimes answer has form word\d+:wordid.
        If so we return word and downweight
        """
        # currently a pass-through: the word is returned unchanged
        weight = 1.0
        return (a, weight)

    (a, global_weight_a) = get_stem_word(a)
    (b, global_weight_b) = get_stem_word(b)
    global_weight = min(global_weight_a, global_weight_b)

    if a == b:
        # they are the same
        return 1.0 * global_weight

    if a == [] or b == []:
        return 0

    interp_a, weight_a = get_semantic_field(a)
    interp_b, weight_b = get_semantic_field(b)

    if interp_a == [] or interp_b == []:
        return 0

    # we take the most optimistic interpretation
    global_max = 0.0
    for x in interp_a:
        for y in interp_b:
            local_score = x.wup_similarity(y)
            # NOTE(review): some NLTK releases are documented to return None
            # when no common ancestor exists; skip such pairs instead of
            # comparing None with a float -- confirm against the NLTK in use
            if local_score is not None and local_score > global_max:
                global_max = local_score

    # we need to use the semantic fields and therefore we downweight
    # unless the score is high which indicates both are synonyms
    if global_max < similarity_threshold:
        interp_weight = 0.1
    else:
        interp_weight = 1.0

    final_score = global_max * weight_a * weight_b * interp_weight * global_weight
    return final_score
###


def _mean_percent_score(pairs, set_membership, description):
    """
    Average score_it over (ground truth, prediction) string pairs, in percent.

    pairs: list of (ground truth, prediction) comma-separated answer strings
    set_membership: set membership measure handed to score_it
    description: label shown on the progress bar

    Returns 0.0 for an empty list of pairs.
    """
    if not pairs:
        return 0.0

    pbar = tqdm(pairs)
    pbar.set_description(description)
    scores = [
        score_it(items2list(ta), items2list(pa), set_membership)
        for (ta, pa) in pbar
    ]
    return float(sum(scores)) / float(len(scores)) * 100.0


def get_scores(input_gt, input_pred, threshold_0=0.0, threshold_1=0.9):
    """
    Compute accuracy and two WUPS scores for line-aligned answers.

    input_gt: iterable of ground truth answer strings (comma-separated items)
    input_pred: iterable of predicted answer strings (comma-separated items)
    threshold_0, threshold_1: similarity thresholds of the two WUPS scores

    Returns the tuple (accuracy, WUPS@threshold_0, WUPS@threshold_1), each in
    percent. Answers are paired with zip, so extra entries of the longer
    input are ignored. Empty input yields 0.0 for all three scores.
    """
    # materialise once: the pairs are traversed three times, which would
    # exhaust one-shot iterables, and a list gives the progress bar a total
    pairs = list(zip(input_gt, input_pred))

    def wups_at(threshold):
        return lambda x, y: wup_measure(x, y, threshold)

    def as_set_membership(element_measure):
        return lambda x, A: fuzzy_set_membership_measure(x, A, element_measure)

    final_score_acc = _mean_percent_score(
        pairs, as_set_membership(dirac_measure), 'Computing Acc')
    final_score_wups_0 = _mean_percent_score(
        pairs, as_set_membership(wups_at(threshold_0)),
        'Computing Wups_{0}'.format(threshold_0))
    final_score_wups_1 = _mean_percent_score(
        pairs, as_set_membership(wups_at(threshold_1)),
        'Computing Wups_{0}'.format(threshold_1))

    return final_score_acc, final_score_wups_0, final_score_wups_1


def get_acc(gts, preds):
    """
    Exact-match accuracy, in percent, between two equally long sequences.

    gts: sequence of ground truth answers
    preds: sequence of predicted answers

    Raises AssertionError when the lengths differ.
    Returns 0.0 for empty input.
    """
    # explicit raise instead of an assert statement so the check is kept
    # when Python runs with -O; the exception type is unchanged
    if len(gts) != len(preds):
        raise AssertionError(
            'gts and preds differ in length: {0} != {1}'.format(
                len(gts), len(preds)))

    if len(gts) == 0:
        return 0.0

    sum_correct = sum(1 for gt, pred in zip(gts, preds) if gt == pred)
    # divide in floating point so the ratio is not truncated by integer division
    return 100.0 * (float(sum_correct) / len(gts))