bns.py
# coding=utf-8
from collections import Counter

from scipy.sparse import csr_matrix
from scipy.stats import norm

from nlp_utils import ngrams


class BNS:
    """Bi-Normal Separation (BNS) is a popular method for scoring how important a word is to the
    category it belongs to. It can efficiently identify important keywords in a document, giving
    them high positive weights, while assigning negative weights to words that are unimportant
    for a document's category.

    Below is a description of the variables used to calculate the Bi-Normal Separation score of a
    word for each category (or class).

    Feature Descriptions:
    =====================
    pos = number of positive training cases, typically the minority,
    neg = number of negative training cases,
    tp = number of positive training cases containing the word,
    fp = number of negative training cases containing the word,
    fn = pos - tp,
    tn = neg - fp,
    true positive rate (tpr) = P(word | positive class) = tp / pos,
    false positive rate (fpr) = P(word | negative class) = fp / neg,
    Bi-Normal Separation (BNS) = F^-1(tpr) - F^-1(fpr),
    where F^-1 is the inverse Normal cumulative distribution function.
    """

    def __init__(self, ngram_range=None):
        self.categories = []
        self.bound_min_score = 0.0005
        self.bound_max_score = 1 - 0.0005
        self.bns_scores = {}
        self.vectors = {}
        self.sentences_category_map = {}
        # Use unigrams and bigrams by default when no ngram_range is supplied.
        self.ngram_range = [1, 2] if ngram_range is None else ngram_range

    def bound(self, value):
        """
        Clip a rate (e.g. tpr or fpr) into [`bound_min_score`, `bound_max_score`] so that the
        inverse normal CDF stays finite.
        Args:
            value (float): rate to be bounded
        Returns:
            (float): bounded value within the min/max limits
        """
        return max(self.bound_min_score, min(self.bound_max_score, value))

    @staticmethod
    def calculate_bns_score(tpr, fpr):
        """
        Calculate the bns score for the given `tpr` and `fpr` values
        Args:
            tpr (float): true positive rate
            fpr (float): false positive rate
        Returns:
            (float): bns score
        """
        return norm.ppf(tpr) - norm.ppf(fpr)
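    # Worked example (illustrative): with tpr = 0.8 and fpr = 0.1, norm.ppf(0.8) is about 0.8416
    # and norm.ppf(0.1) is about -1.2816, so calculate_bns_score(0.8, 0.1) is roughly 2.12.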

    def get_bns_score(self, word, category):
        """
        Return the bns score of `word` for the given `category`
        Args:
            word (str): word whose bns score is to be determined
            category (str): category or class for which the word's bns score is looked up
        Returns:
            score (float): bns score, or None if the word/category pair is unknown
        """
        score = None
        if word in self.bns_scores:
            if category in self.bns_scores[word]:
                score = self.bns_scores[word][category]
        return score

    @staticmethod
    def get_word_list(documents):
        """
        Extract the set of unique words from a list of documents
        Args:
            documents (list): list of documents
        Returns:
            words (set): set of unique words in the documents
        """
        words = []
        for doc in documents:
            words.extend(doc.split())
        return set(words)

    @staticmethod
    def get_word_count_in_category(documents, categories):
        """
        Create a dict containing, for every word, its count in each category of the documents.
        Examples:
            documents - ['book cab', 'book me a taxi', 'book flight to mumbai']
            categories - ['book_cab', 'book_cab', 'book_flight']
            word_dict => {'book': {'book_cab': 2, 'book_flight': 1}, 'cab': {'book_cab': 1},
                          'me': {'book_cab': 1}, 'a': {'book_cab': 1}, 'flight': {'book_flight': 1},
                          'to': {'book_flight': 1}, 'mumbai': {'book_flight': 1}}
        Args:
            documents (list): list of documents
            categories (list): list of categories, one per doc in documents
        Returns:
            word_dict (dict): dict of words and their counts in the respective categories
        """
        word_dict = {}
        for sent, cat in zip(documents, categories):
            words = sent.split()
            for word in words:
                if word not in word_dict:
                    word_dict[word] = {cat: 1}
                elif cat not in word_dict[word]:
                    word_dict[word][cat] = 1
                else:
                    word_dict[word][cat] += 1
        return word_dict

    def create_bns_score(self, documents, categories, word_category_count_dict):
        """
        Create a dict of words and their respective bns scores for each category
        Args:
            documents (list): list of documents
            categories (list): list of categories, one per doc in documents
            word_category_count_dict (dict): dict containing each word and its count per category
        Returns:
            None
        """
        self.categories = list(set(categories))
        # `categories` holds one label per document, so its length is the total number of documents.
        total_documents = len(categories)
        word_list = self.get_word_list(documents)
        for sent, cat in zip(documents, categories):
            if cat not in self.sentences_category_map:
                self.sentences_category_map[cat] = [sent]
            else:
                self.sentences_category_map[cat].append(sent)
        for index, word in enumerate(word_list):
            for category in self.categories:
                positive_sent = len(self.sentences_category_map[category])
                negative_sent = total_documents - positive_sent
                word_dict = word_category_count_dict[word]
                total_word_occurrence = sum(word_dict.values())
                tp = word_dict.get(category, 0)
                fp = total_word_occurrence - tp
                tpr = self.bound(tp / float(positive_sent))
                fpr = self.bound(fp / float(negative_sent))
                bns_score = self.calculate_bns_score(tpr, fpr)
                if word not in self.bns_scores:
                    # `index` records the word's column position in the bns vectors.
                    self.bns_scores[word] = {'index': index, category: bns_score}
                elif category not in self.bns_scores[word]:
                    self.bns_scores[word][category] = bns_score

    def fit(self, training_documents, categories):
        """
        Fit the documents and categories to create bns vectors for the training documents
        Args:
            training_documents (list): list of documents
            categories (list): list of categories, one per doc in training_documents
        Returns:
            None
        """
        word_category_count_dict = self.get_word_count_in_category(training_documents, categories)
        self.create_bns_score(training_documents, categories, word_category_count_dict)
        for category in self.sentences_category_map:
            scores, indexes, counter = [], [], []
            for count, sentence in enumerate(self.sentences_category_map[category]):
                tokens = []
                for n in self.ngram_range:
                    tokens.extend(ngrams(sentence, n))
                tokens_dict = dict(Counter(tokens))
                for token, token_count in tokens_dict.items():
                    token_meta_data = self.bns_scores.get(token, None)
                    if token_meta_data and category in token_meta_data:
                        scores.append(token_count * token_meta_data[category])
                        indexes.append(token_meta_data['index'])
                        counter.append(count)
            # One row per sentence in this category, one column per word in the vocabulary.
            self.vectors[category] = csr_matrix(
                (scores, (counter, indexes)),
                shape=(len(self.sentences_category_map[category]), len(self.bns_scores)))

    def transform(self, test_documents):
        """
        Return bns vectors for the test documents
        Args:
            test_documents (list): list of documents to convert to bns vectors
        Returns:
            test_vector (dict): dict mapping each category to a sparse matrix of bns vectors,
                one row per doc in `test_documents`
        """
        test_vector = {}
        for category in self.categories:
            scores, indexes, counter = [], [], []
            for count, sentence in enumerate(test_documents):
                tokens = []
                for n in self.ngram_range:
                    tokens.extend(ngrams(sentence, n))
                tokens_dict = dict(Counter(tokens))
                for token, token_count in tokens_dict.items():
                    token_meta_data = self.bns_scores.get(token, None)
                    if token_meta_data and category in token_meta_data:
                        scores.append(token_count * token_meta_data[category])
                        indexes.append(token_meta_data['index'])
                        counter.append(count)
            # One row per test document, one column per word in the vocabulary.
            test_vector[category] = csr_matrix(
                (scores, (counter, indexes)),
                shape=(len(test_documents), len(self.bns_scores)))
        return test_vector
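

# Minimal usage sketch (illustrative, not part of the original module). It assumes that
# `nlp_utils.ngrams(sentence, n)` returns an iterable of n-gram token strings for `sentence`;
# the toy corpus, labels, and test document below are made up.
if __name__ == '__main__':
    docs = ['book a cab', 'book me a taxi', 'book a flight to mumbai']
    labels = ['book_cab', 'book_cab', 'book_flight']

    bns = BNS()
    bns.fit(docs, labels)

    # Per-word, per-category BNS score learned from the toy corpus.
    print(bns.get_bns_score('cab', 'book_cab'))

    # Sparse BNS vectors for unseen documents, keyed by category.
    vectors = bns.transform(['book a cab to the airport'])
    print(vectors['book_cab'].shape)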