import re
import codecs  # NOTE(review): unused in this module — candidate for removal

from nltk.text import TokenSearcher as NLTKTokenSearcher

from iepy.preprocess.ner.base import BaseNERRunner


class RegExpNERRunner(BaseNERRunner):
    """NER runner that matches a token-level regular expression over a
    document's tokens and emits one entity occurrence per match.

    ``regexp`` uses the angle-bracket token syntax understood by
    :class:`TokenSearcher` below, e.g. ``'<Hepatitis> <[A-C]>'``.
    """

    def __init__(self, label, regexp, override=False):
        super(RegExpNERRunner, self).__init__(override=override)
        self.label = label      # entity kind assigned to every match
        self.regexp = regexp    # token-level pattern, preprocessed at search time

    def run_ner(self, doc):
        """Return the list of entity occurrences found in ``doc.tokens``."""
        entities = []
        searcher = TokenSearcher(doc.tokens)
        for match in searcher.finditer(self.regexp):
            entity_oc = self.process_match(match)
            # process_match may be overridden to return several occurrences
            # for a single match.
            if isinstance(entity_oc, list):
                entities.extend(entity_oc)
            else:
                entities.append(entity_oc)
        return entities

    def process_match(self, match):
        """Build one entity occurrence from a TokenSearcher match.

        Subclasses may override this and return a list of occurrences.
        """
        name = ' '.join(match.group())
        kind = self.label
        offset, offset_end = match.span()
        return self.build_occurrence(name, kind, name, offset, offset_end)


class TokenSearcher(NLTKTokenSearcher):
    """
    From nltk.text.TokenSearcher documentation:

    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><open>'``.  The regular expression
    passed to the search method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.

    This version improves on NLTK's by exposing ``finditer`` yielding
    match objects with token-level ``group()``/``span()``, and by
    supporting named groups written as ``(?P<<name>>...)``.
    """

    def __init__(self, tokens):
        # Escape angle brackets occurring *inside* tokens so they cannot be
        # confused with the token-boundary markers added below.
        _raw = '><'.join(
            w.replace('<', '\\<').replace('>', '\\>') for w in tokens)
        # Prepend '><' instead of '<' so that every token, including the
        # first one, is preceded by a '><' boundary -- this makes the token
        # counting in finditer() uniform.
        self._raw = '><' + _raw + '>'

    def finditer(self, regexp):
        """Yield a MatchObject for every match of the token-level regexp."""
        regexp = preprocess_regexp(regexp)
        # A plain loop over re.finditer replaces the original
        # while/try/next() construction, whose bare ``except:`` silently
        # swallowed *any* error (not just StopIteration).
        for m in re.finditer(regexp, self._raw):
            start, end = m.span()
            # Every '><' is a token boundary, so counting boundaries up to a
            # character position converts it to a token index.  Counting
            # from the beginning for each match is quadratic in the worst
            # case, but fine for sentence-sized inputs.
            token_start = self._raw[:start].count('><')
            token_end = self._raw[:end].count('><')
            yield MatchObject(m, token_start, token_end)


class MatchObject:
    """Token-level view of an ``re`` match over the '><'-joined string.

    Mirrors the ``re`` match API: ``group()`` returns lists of tokens and
    ``span()`` returns token indexes (end exclusive).
    """

    def __init__(self, m, token_start, token_end):
        self.m = m                               # the underlying re match
        self.all = m.group()                     # full matched substring
        self.all_start, self.all_end = m.span()  # char offsets in the raw string
        self.token_start = token_start           # index of first matched token
        self.token_end = token_end               # one past the last matched token

    def group(self, *args):
        """Return the (sub)group as a list of tokens, or None if unmatched."""
        result = self.m.group(*args)
        if result:
            # Strip the outermost '<' and '>' and split at the inner
            # token boundaries.
            return result[1:-1].split('><')
        else:
            return None

    def span(self, *args):
        """Return the (start, end) token indexes of the (sub)group."""
        start, end = self.m.span(*args)
        # Each '<' before a position within the match is one token start.
        span_start = self.all[:start - self.all_start].count('<')
        span_end = self.all[:end - self.all_start].count('<')
        return (self.token_start + span_start, self.token_start + span_end)


def preprocess_regexp(regexp):
    """Translate a token-level regexp into a character-level one that works
    over the '><'-joined string built by TokenSearcher.

    Single angle brackets delimit tokens; doubled brackets (``<<``, ``>>``)
    stand for literal ones, which is what allows writing named groups as
    ``(?P<<name>>...)``.
    """
    # Whitespace in the pattern is only cosmetic.
    regexp = re.sub(r'\s', '', regexp)
    # Replace < and > only if not doubled (<< or >>), wrapping the token
    # body in groups so alternations cannot leak across token boundaries.
    # NOTE(review): these two substitutions were reconstructed from a
    # corrupted source; semantics follow nltk.text.TokenSearcher — confirm.
    regexp = re.sub(r'(?<!<)<(?!<)', '(?:<(?:', regexp)
    regexp = re.sub(r'(?<!>)>(?!>)', ')>)', regexp)
    # Now, replace << and >> with literal < and > respectively.
    regexp = re.sub(r'<<', '<', regexp)
    regexp = re.sub(r'>>', '>', regexp)
    # Replace . (if not preceded by \) with [^>] so it cannot run past a
    # token boundary.
    regexp = re.sub(r'(?<!\\)\.', '[^>]', regexp)

    return regexp
from unittest import TestCase

from iepy.preprocess.ner.regexp import RegExpNERRunner, TokenSearcher

from .factories import SentencedIEDocFactory
from .manager_case import ManagerTestCase
from .test_ner import NERTestMixin


class TestRegExpNERRunner(ManagerTestCase, NERTestMixin):
    """End-to-end checks of RegExpNERRunner over factory-built documents."""

    def test(self):
        doc = SentencedIEDocFactory(
            text="Chase notes she's negative for HIV and Hepatitis C")
        # NOTE(review): the token-level patterns in this file were corrupted
        # in transit; they are reconstructed from the expected entity spans
        # asserted below — confirm against the original patch.
        ner_runner = RegExpNERRunner('disease', '<HIV>|<Hepatitis> <[A-C]>')
        ner_runner(doc)
        # (the tokenizer splits she's in two parts)
        entities_triples = [(6, 7, 'DISEASE'), (8, 10, 'DISEASE')]
        self.check_ner_result(doc, entities_triples)

        doc = SentencedIEDocFactory(
            text="Cuddy points out that the CT scan showed the patient "
                 "has a metal pin in her arm and can't undergo an MRI")
        ner_runner = RegExpNERRunner('MEDICAL_TEST', '<[A-Z]+> <scan>|<MRI>')
        ner_runner(doc)
        # (the tokenizer splits can't in two parts)
        entities_triples = [(5, 7, 'MEDICAL_TEST'), (22, 23, 'MEDICAL_TEST')]
        self.check_ner_result(doc, entities_triples)


class TestTokenSearcher(TestCase):
    """Direct unit tests for TokenSearcher.finditer over pre-split tokens."""

    def test(self):
        sent = "Chase notes she 's negative for HIV and Hepatitis C"
        regexp = '<HIV>|<Hepatitis> <[A-C]>'
        searcher = TokenSearcher(sent.split())
        matches = [(m.group(), m.span()) for m in searcher.finditer(regexp)]
        self.assertEqual(
            matches, [(['HIV'], (6, 7)), (['Hepatitis', 'C'], (8, 10))])

    def test_named_group(self):
        sent = ("Cuddy points out that the CT scan showed the patient has "
                "a metal pin in her arm and can 't undergo an MRI")
        # Named groups use doubled angle brackets: (?P<<name>> ...).
        regexp = '(<the>|<an>) (?P<<name>> <[A-Z]+> <scan>|<MRI>)'
        searcher = TokenSearcher(sent.split())
        matches = [(m.group('name'), m.span('name'))
                   for m in searcher.finditer(regexp)]
        self.assertEqual(
            matches, [(['CT', 'scan'], (5, 7)), (['MRI'], (22, 23))])
--- iepy/preprocess/ner/regexp.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/iepy/preprocess/ner/regexp.py b/iepy/preprocess/ner/regexp.py index 8e7d4bc..0aaf692 100644 --- a/iepy/preprocess/ner/regexp.py +++ b/iepy/preprocess/ner/regexp.py @@ -39,24 +39,18 @@ class TokenSearcher(NLTKTokenSearcher): def __init__(self, tokens): # replace < and > inside tokens with \< and \> _raw = '><'.join(w.replace('<', '\<').replace('>', '\>') for w in tokens) - # preprend >< instead of < for easier token counting + # preprend >< instead of < for easier token counting self._raw = '><' + _raw + '>' - # super(TokenSearcher, self).__init__(tokens) def finditer(self, regexp): regexp = preprocess_regexp(regexp) i = re.finditer(regexp, self._raw) - # last_start, last_end = 0, 0 - # token_start, token_end = 0, 0 while True: try: m = next(i) start, end = m.span() # FIXME: do not count from the beggining - # token_start = token_start + self._raw[last_start:start].count('><') - # token_end = token_end + self._raw[last_end:end].count('><') - # last_start, last_end = start, end token_start = self._raw[:start].count('><') token_end = self._raw[:end].count('><') yield MatchObject(m, token_start, token_end) From 34ca21936b9db8373d55903923b6b45cfcdc84a0 Mon Sep 17 00:00:00 2001 From: "Franco M. Luque" Date: Mon, 12 Dec 2016 17:30:44 -0300 Subject: [PATCH 5/6] Some minimal documentation. --- iepy/preprocess/ner/regexp.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/iepy/preprocess/ner/regexp.py b/iepy/preprocess/ner/regexp.py index 0aaf692..8343566 100644 --- a/iepy/preprocess/ner/regexp.py +++ b/iepy/preprocess/ner/regexp.py @@ -35,6 +35,17 @@ def process_match(self, match): class TokenSearcher(NLTKTokenSearcher): + """ + From nltk.text.TokenSearcher documentation: + + A class that makes it easier to use regular expressions to search + over tokenized strings. 
The tokenized string is converted to a + string where tokens are marked with angle brackets -- e.g., + ``''``. The regular expression + passed to the ``findall()`` method is modified to treat angle + brackets as non-capturing parentheses, in addition to matching the + token boundaries; and to have ``'.'`` not match the angle brackets. + """ def __init__(self, tokens): # replace < and > inside tokens with \< and \> From c817913e906dc1daefb33d8adc29940a756819fc Mon Sep 17 00:00:00 2001 From: "Franco M. Luque" Date: Tue, 20 Dec 2016 17:02:43 -0300 Subject: [PATCH 6/6] Remove FIXME comments for things that actually are possible enhancements (as discussed in the pull request). --- iepy/preprocess/ner/regexp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/iepy/preprocess/ner/regexp.py b/iepy/preprocess/ner/regexp.py index 8343566..c29b44a 100644 --- a/iepy/preprocess/ner/regexp.py +++ b/iepy/preprocess/ner/regexp.py @@ -61,7 +61,6 @@ def finditer(self, regexp): try: m = next(i) start, end = m.span() - # FIXME: do not count from the beggining token_start = self._raw[:start].count('><') token_end = self._raw[:end].count('><') yield MatchObject(m, token_start, token_end) @@ -97,7 +96,6 @@ def preprocess_regexp(regexp): # preprocess the regular expression regexp = re.sub(r'\s', '', regexp) # replace < and > only if not double (<< or >>): - # FIXME: avoid matching \< and \>. regexp = re.sub(r'(?)>(?!>)', ')>)', regexp) # now, replace << >> with < > resp.