diff --git a/README.md b/README.md index 35335f66..62ae45d7 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ Python library for digesting Persian text. >>> from hazm import POSTagger >>> tagger = POSTagger() >>> tagger.tag(word_tokenize('ما بسیار کتاب می‌خوانیم')) -[('ما', 'PR'), ('بسیار', 'ADV'), ('کتاب', 'N'), ('می‌خوانیم', 'V')] +[('ما', 'PRO'), ('بسیار', 'ADV'), ('کتاب', 'N'), ('می‌خوانیم', 'V')] >>> from hazm import DependencyParser >>> parser = DependencyParser(tagger=POSTagger()) diff --git a/data.py b/data.py index 19c9ac2d..d86210f4 100644 --- a/data.py +++ b/data.py @@ -105,7 +105,7 @@ def train_maltparser(train_file='resources/train.conll', validation_file='resour parsed = parser.tagged_parse_sents(tagged) test_data, test_results = test_file +'.data', test_file +'.results' - print('\n'.join([sentence.to_conll(10) for sentence in test.trees()]).strip(), file=codecs.open(test_data, 'w', 'utf8')) + print('\n'.join([sentence.to_conll(10).replace('/', '') for sentence in test.trees()]).strip(), file=codecs.open(test_data, 'w', 'utf8')) print('\n'.join([sentence.to_conll(10) for sentence in parsed]).strip(), file=codecs.open(test_results, 'w', 'utf8')) subprocess.Popen(['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results]).wait() diff --git a/hazm/POSTagger.py b/hazm/POSTagger.py index 2e589113..39acd02d 100644 --- a/hazm/POSTagger.py +++ b/hazm/POSTagger.py @@ -18,7 +18,7 @@ def __init__(self, *args, **kwargs): def tag_sents(self, sentences): """ >>> tagger.tag(['من', 'به', 'مدرسه', 'رفته بودم', '.']) - [('من', 'PR'), ('به', 'PREP'), ('مدرسه', 'N'), ('رفته بودم', 'V'), ('.', 'PUNC')] + [('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته بودم', 'V'), ('.', 'PUNC')] """ refined = map(lambda s: [w.replace(' ', '_') for w in s], sentences)