-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyze.py
executable file
·170 lines (129 loc) · 5.48 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python
######################################################################
# The following is an analysis tool for the output of the results of
# a consistency analysis on a corpus. There are a few requirements
# that must be met in order to ensure accurate results. There must be
# a newline between the nuclei variation occurrences, there must be a
# tab character before the annotation ('y', 'n', or '?'), and the file
# must end in a newline to ensure the last annotation is counted.
#
# Examples of properly formatted annotations can be found in the
# repo's README.
######################################################################
from __future__ import division
from collections import defaultdict
from recordclass import recordclass
import re
import sys
from lib.annotation import Annotation
# Mutable tally of annotation outcomes for one key (a dependency type or
# a lemma pair).  A plain __slots__ class replaces the third-party
# ``recordclass`` base while keeping the same interface: construct with
# three positional counts and mutate .correct/.incorrect/.unmarked in
# place (the L194 import of recordclass is now unused by this block).
class VariationCountInternal(object):
    __slots__ = ('correct', 'incorrect', 'unmarked')

    def __init__(self, correct, incorrect, unmarked):
        self.correct = correct      # occurrences annotated correct ('y')
        self.incorrect = incorrect  # occurrences annotated incorrect ('n')
        self.unmarked = unmarked    # occurrences with no annotation

class VariationCount(VariationCountInternal):
    def one_marked(self):
        """Return True if at least one occurrence was annotated either way."""
        return self.correct > 0 or self.incorrect > 0

    def annotated_count(self):
        """Return the total number of annotated ('y' or 'n') occurrences."""
        return self.correct + self.incorrect

    def percent_incorrect(self):
        """Return the percentage of annotated occurrences marked incorrect.

        Returns 0.0 when nothing is annotated instead of raising
        ZeroDivisionError (callers currently guard with one_marked(),
        so guarded behavior is unchanged).
        """
        total = self.annotated_count()
        if total == 0:
            return 0.0
        return self.incorrect / total * 100
# Command-line flags; each may be given in long or short form anywhere
# on the command line after the (required) input filename.
DEP_FLAG_ARG = ('--dep', '-d')
LEMMA_FLAG_ARG = ('--lemma', '-l')
FREQ_FLAG_ARG = ('--frequency', '-f')
ALL_OCC_ARG = ('--all', '-a')
if len(sys.argv) < 2:
    raise TypeError('Not enough arguments provided')
# any() replaces the original reduce(lambda acc, arg: acc or arg in ...,
# sys.argv, False): it short-circuits, reads as plain English, and does
# not rely on the Python-2-only builtin reduce.
dep_flag = any(arg in DEP_FLAG_ARG for arg in sys.argv)
lemma_flag = any(arg in LEMMA_FLAG_ARG for arg in sys.argv)
freq_flag = any(arg in FREQ_FLAG_ARG for arg in sys.argv)
all_occ_flag = any(arg in ALL_OCC_ARG for arg in sys.argv)
# Running totals for the summary report printed at the end of the script.
total_count = 0  # NOTE(review): never read below -- appears unused
annotated_count = 0  # NOTE(review): never read below -- appears unused
filename = sys.argv[1]
# Parse the annotated consistency-analysis file (format described in the
# header comment; examples in the repo README).
ann = Annotation()
ann.from_filename(filename)
inconsistent_tokens = 0  # occurrences counted as incorrect ('n')
total_tokens = 0  # all occurrences counted (annotated only, or all with --all)
counted_lemma_incorrect = False  # per-pair guard: pair already counted inconsistent
counted_lemma = False  # per-pair guard: pair already counted as annotated
inconsistent_lemmas = 0  # lemma pairs with at least one incorrect occurrence
annotated_lemmas = 0  # lemma pairs with at least one counted occurrence
# Tallies keyed by (direction, relation) dependency and by lemma pair.
by_dep = defaultdict(lambda: VariationCount(0, 0, 0))
by_lemma = defaultdict(lambda: VariationCount(0, 0, 0))
# freqs[k] = number of lemma pairs with exactly k counted occurrences.
freqs = defaultdict(int)
# Tally every variation occurrence.  With --all every occurrence is
# counted as if it were annotated; otherwise only occurrences carrying
# a 'y'/'n'/'?' annotation are tallied and the rest are recorded as
# unmarked.  The two original copy-pasted branches differed only in
# that check (and in unmarked tracking), so they are folded into one.
for lemma_pair, occurrences in ann.annotations.items():
    counted_lemma_incorrect = False
    counted_lemma = False
    num_annotated = 0
    for occ in occurrences:
        if all_occ_flag or occ.is_annotated():
            num_annotated += 1
            # Count each lemma pair as annotated at most once.
            if not counted_lemma:
                counted_lemma = True
                annotated_lemmas += 1
            total_tokens += 1
            if occ.correct_in_corpus():
                by_dep[occ.dep].correct += 1
                by_lemma[lemma_pair].correct += 1
            else:
                inconsistent_tokens += 1
                # Count each lemma pair as inconsistent at most once.
                if not counted_lemma_incorrect:
                    counted_lemma_incorrect = True
                    inconsistent_lemmas += 1
                by_dep[occ.dep].incorrect += 1
                by_lemma[lemma_pair].incorrect += 1
        else:
            # Unannotated occurrence; only reachable without --all.
            by_dep[occ.dep].unmarked += 1
            by_lemma[lemma_pair].unmarked += 1
    freqs[num_annotated] += 1
if dep_flag:
print 'Data analysis by dependency type'
print 'Format is as follows:'
print 'DIR, REL\t# incorrrect\t# total annotated\tpercent incorrect'
print
for dep, count in by_dep.items():
if count.one_marked():
print '{}, {}\t{}\t{}\t{}%'.format(dep[0], dep[1], count.incorrect, count.annotated_count(), count.percent_incorrect())
print
print
if lemma_flag:
print 'Data analysis by lemma'
print 'Format is as follows:'
print 'LEMMA1, LEMMA2\t# incorrrect\t# total annotated\tpercent incorrect'
print
for lemmas, count in by_lemma.items():
if count.one_marked():
lemma1, lemma2 = lemmas
print '{}, {}\t{}\t{}\t{}%'.format(lemma1, lemma2, count.incorrect, count.annotated_count(), count.percent_incorrect())
print
print
if freq_flag:
print 'Data analysis by freq'
print 'Format is as follows:'
print 'NUMBER\tFREQUENCY'
for num, freq in freqs.items():
print '{}\t{}'.format(num, freq)
print 'Number of inconsistencies: {}'.format(ann.size)
print 'Number of which were nil: {}'.format(ann.nils)
print 'Number of which were context: {}'.format(ann.contexts)
if total_tokens > 0:
print 'Percent of all occurences that were correct'
print '{} / {} = {}%'.format(total_tokens - inconsistent_tokens, total_tokens, (total_tokens - inconsistent_tokens) / total_tokens * 100)
print 'Percent of all occurences that were incorrect'
print '{} / {} = {}%'.format(inconsistent_tokens, total_tokens, inconsistent_tokens / total_tokens * 100)
print 'Percent of all lemma pairs with at least one incorrect occurrence'
print '{} / {} = {}%'.format(inconsistent_lemmas, annotated_lemmas, inconsistent_lemmas / annotated_lemmas * 100)