Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Regular expression NER #118

Open
wants to merge 6 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions iepy/preprocess/ner/regexp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import re
import codecs

from nltk.text import TokenSearcher as NLTKTokenSearcher
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This import is not entirely unused — NLTKTokenSearcher is used as the base class of TokenSearcher below — but none of its methods are actually invoked, so the inheritance may be unnecessary.


from iepy.preprocess.ner.base import BaseNERRunner


class RegExpNERRunner(BaseNERRunner):
    """NER runner that tags token sequences matching a token-level regexp.

    Every match of ``regexp`` (angle-bracket token syntax, see
    ``TokenSearcher``) is reported as an entity occurrence of kind
    ``label``.
    """

    def __init__(self, label, regexp, override=False):
        """
        :param label: entity kind assigned to every match.
        :param regexp: token-level regular expression, e.g.
            ``'<Hepatitis><[A-C]>'``.
        :param override: forwarded to ``BaseNERRunner``.
        """
        super(RegExpNERRunner, self).__init__(override=override)
        self.label = label
        self.regexp = regexp

    def run_ner(self, doc):
        """Return the list of entity occurrences found in ``doc``."""
        entities = []
        tokens = doc.tokens
        searcher = TokenSearcher(tokens)
        for match in searcher.finditer(self.regexp):
            entity_oc = self.process_match(match)
            # process_match may return one occurrence or a list of them
            # (subclasses can override it). Use isinstance instead of
            # type() == list so list subclasses are handled too.
            if isinstance(entity_oc, list):
                entities.extend(entity_oc)
            else:
                entities.append(entity_oc)
        return entities

    def process_match(self, match):
        """Build an entity occurrence from a TokenSearcher match.

        Subclasses may override this to return several occurrences per
        match (run_ner accepts either a single occurrence or a list).
        """
        name = ' '.join(match.group())
        kind = self.label
        offset, offset_end = match.span()
        entity_oc = self.build_occurrence(name, kind, name, offset, offset_end)

        return entity_oc


class TokenSearcher(NLTKTokenSearcher):
    """
    From nltk.text.TokenSearcher documentation:

    A class that makes it easier to use regular expressions to search
    over tokenized strings. The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``. The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    """

    def __init__(self, tokens):
        # Escape < and > occurring *inside* tokens so they cannot be
        # confused with the token delimiters.
        # NOTE(review): a token already containing a literal backslash
        # right before '<' or '>' would still be ambiguous -- TODO confirm.
        _raw = '><'.join(
            w.replace('<', '\\<').replace('>', '\\>') for w in tokens
        )
        # Prepend '><' instead of '<' so every token boundary is a full
        # '><' pair, which makes token counting a simple substring count.
        self._raw = '><' + _raw + '>'

    def finditer(self, regexp):
        """Yield a MatchObject for every match of the token-level regexp.

        Iterates re.finditer directly instead of the previous
        while/next loop wrapped in a bare ``except:``, which silently
        swallowed *any* error raised inside the loop body.
        """
        regexp = preprocess_regexp(regexp)
        for m in re.finditer(regexp, self._raw):
            start, end = m.span()
            # Each '><' pair in the prefix marks one complete preceding
            # token, so counting them converts char offsets to token
            # offsets.
            token_start = self._raw[:start].count('><')
            token_end = self._raw[:end].count('><')
            yield MatchObject(m, token_start, token_end)


class MatchObject:
    """Wrap an ``re`` match over the angle-bracket raw string, exposing
    ``group``/``span`` in token units rather than character units."""

    def __init__(self, m, token_start, token_end):
        # Keep the underlying match plus the precomputed token bounds.
        self.m = m
        self.all = m.group()
        self.all_start, self.all_end = m.span()
        self.token_start = token_start
        self.token_end = token_end

    def group(self, *args):
        """Return the matched (sub)group as a list of tokens, or None
        when the group did not participate in the match."""
        text = self.m.group(*args)
        if not text:
            return None
        # Drop the outer angle brackets, then split on token boundaries.
        return text[1:-1].split('><')

    def span(self, *args):
        """Return the (sub)group span measured in token positions."""
        char_start, char_end = self.m.span(*args)
        # Every '<' before a position opens one token, so counting them
        # gives the offset in tokens relative to the whole match.
        rel_start = self.all[:char_start - self.all_start].count('<')
        rel_end = self.all[:char_end - self.all_start].count('<')
        return (self.token_start + rel_start, self.token_start + rel_end)


def preprocess_regexp(regexp):
    """Translate a token-level regexp into one over the '><'-delimited
    raw string used by TokenSearcher.

    Single angle brackets become non-capturing groups that also match
    the token delimiters; doubled brackets ('<<', '>>') stand for the
    literal characters; an unescaped '.' is narrowed so it can never
    cross a token boundary.
    """
    # Whitespace in the token-level pattern is only layout: drop it all.
    regexp = re.sub(r'\s', '', regexp)
    # A lone '<' / '>' marks a token boundary -> wrap in groups that
    # also consume the delimiter characters.
    regexp = re.sub(r'(?<!<)<(?!<)', '(?:<(?:', regexp)
    regexp = re.sub(r'(?<!>)>(?!>)', ')>)', regexp)
    # Doubled brackets were escapes for the literal characters.
    regexp = regexp.replace('<<', '<').replace('>>', '>')
    # An unescaped '.' must not match the bracket delimiters.
    return re.sub(r'(?<!\\)\.', '[^>]', regexp)
44 changes: 44 additions & 0 deletions tests/test_regexp_ner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from unittest import TestCase

from iepy.preprocess.ner.regexp import RegExpNERRunner, TokenSearcher

from .factories import SentencedIEDocFactory
from .manager_case import ManagerTestCase
from .test_ner import NERTestMixin


class TestRegExpNERRunner(ManagerTestCase, NERTestMixin):
    """Exercise RegExpNERRunner end-to-end on factory-built documents."""

    def test(self):
        # The tokenizer splits "she's" into two tokens, shifting offsets.
        doc = SentencedIEDocFactory(
            text="Chase notes she's negative for HIV and Hepatitis C")
        runner = RegExpNERRunner('disease', '<HIV>|<Hepatitis><[A-C]>')
        runner(doc)
        expected = [(6, 7, 'DISEASE'), (8, 10, 'DISEASE')]
        self.check_ner_result(doc, expected)

        # The tokenizer splits "can't" into two tokens as well.
        doc = SentencedIEDocFactory(
            text="Cuddy points out that the CT scan showed the patient has a metal pin in her arm and can't undergo an MRI")
        runner = RegExpNERRunner('MEDICAL_TEST', '<[A-Z]+><scan>|<MRI>')
        runner(doc)
        expected = [(5, 7, 'MEDICAL_TEST'), (22, 23, 'MEDICAL_TEST')]
        self.check_ner_result(doc, expected)


class TestTokenSearcher(TestCase):
    """Unit tests for TokenSearcher over pre-tokenized sentences."""

    def test(self):
        tokens = "Chase notes she 's negative for HIV and Hepatitis C".split()
        searcher = TokenSearcher(tokens)
        found = [
            (m.group(), m.span())
            for m in searcher.finditer('<HIV>|<Hepatitis><[A-C]>')
        ]
        self.assertEqual(
            found, [(['HIV'], (6, 7)), (['Hepatitis', 'C'], (8, 10))])

    def test_named_group(self):
        tokens = "Cuddy points out that the CT scan showed the patient has a metal pin in her arm and can 't undergo an MRI".split()
        searcher = TokenSearcher(tokens)
        pattern = '(<an>|<the>) (?P<<name>> <[A-Z]+><scan>|<MRI>)'
        found = [
            (m.group('name'), m.span('name'))
            for m in searcher.finditer(pattern)
        ]
        self.assertEqual(
            found, [(['CT', 'scan'], (5, 7)), (['MRI'], (22, 23))])