Skip to content

Commit

Permalink
use SequencePOSTagger as default POSTagger
Browse files · Browse the repository at this point in the history
  • Loading branch information
nournia committed Mar 20, 2015
1 parent 3188f22 commit 5505991
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 48 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Python library for digesting Persian text.
'رفت#رو'

>>> from hazm import POSTagger
>>> tagger = POSTagger(path_to_jar='resources/stanford-postagger.jar', path_to_model='resources/persian.tagger')
>>> tagger = POSTagger(model='resources/postagger.model')
>>> tagger.tag(word_tokenize('ما بسیار کتاب می‌خوانیم'))
[('ما', 'PRO'), ('بسیار', 'ADV'), ('کتاب', 'N'), ('می‌خوانیم', 'V')]

Expand Down
56 changes: 24 additions & 32 deletions data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
from collections import Counter
from itertools import islice
from nltk.tag import untag
from nltk.parse import DependencyEvaluator
from sklearn.cross_validation import train_test_split
from hazm import *
from hazm.Chunker import tree2brackets
from hazm.PeykareReader import coarse_pos_e as peykare_coarse_pos_e
from hazm.DadeganReader import coarse_pos_e as dadegan_coarse_pos_e


def create_words_file(dic_file='resources/persian.dic', output='hazm/data/words.dat'):
Expand Down Expand Up @@ -65,7 +68,7 @@ def evaluate_chunker(treebank_root='corpora/treebank'):
print(file=output)


def train_postagger(peykare_root='corpora/peykare', model_file='resources/postagger.model', test_size=.1, sents_limit=None, pos_map=PeykareReader.coarse_pos_e):
def train_postagger(peykare_root='corpora/peykare', model_file='resources/postagger.model', test_size=.1, sents_limit=None, pos_map=peykare_coarse_pos_e):

tagger = SequencePOSTagger(type='crf', algo='rprop', compact=True, patterns=[
'*',
Expand Down Expand Up @@ -107,9 +110,9 @@ def train_postagger(peykare_root='corpora/peykare', model_file='resources/postag
print(tagger.evaluate(test_sents))


def train_chunker(train_file='corpora/train.conll', validation_file='corpora/validation.conll', test_file='corpora/test.conll', model_file='resources/chunker.model', pos_map=DadeganReader.coarse_pos_e):
def train_chunker(train_file='corpora/train.conll', dev_file='corpora/dev.conll', test_file='corpora/test.conll', model_file='resources/chunker.model', pos_map=dadegan_coarse_pos_e):

tagger = SequencePOSTagger(model='resources/postagger.model')
tagger = POSTagger(model='resources/postagger.model')
chunker = Chunker(type='crf', algo='l-bfgs', compact=True, patterns=[
'*',

Expand All @@ -126,17 +129,14 @@ def train_chunker(train_file='corpora/train.conll', validation_file='corpora/val
'*:trr=%x[2,1]',
])

train, validation, test = DadeganReader(train_file, pos_map=pos_map), DadeganReader(validation_file, pos_map=pos_map), DadeganReader(test_file, pos_map=pos_map)

def retag_trees(trees, sents):
tagged_sents = tagger.tag_sents([untag(sent) for sent in sents])
for tree, sentence in zip(trees, tagged_sents):
for tree, sentence in zip(trees, tagger.tag_sents(map(untag, sents))):
for (n, word) in zip(tree.treepositions('leaves'), sentence):
tree[n] = word

train, test = DadeganReader(train_file, pos_map=pos_map), DadeganReader(test_file, pos_map=pos_map)
train_trees = list(train.chunked_trees())
train_sents = list(train.sents())
retag_trees(train_trees, train_sents)
retag_trees(train_trees, train.sents())
chunker.train(train_trees)
chunker.save_model(model_file)

Expand All @@ -145,49 +145,41 @@ def retag_trees(trees, sents):
print(chunker.evaluate(test_trees))


def train_maltparser(train_file='corpora/train.conll', validation_file='corpora/validation.conll', test_file='corpora/test.conll', model_file='langModel.mco', path_to_jar='resources/malt.jar', options_file='resources/malt-options.xml', features_file='resources/malt-features.xml', memory_min='-Xms7g', memory_max='-Xmx8g'):
def train_maltparser(train_file='corpora/train.conll', dev_file='corpora/dev.conll', test_file='corpora/test.conll', model_file='langModel.mco', path_to_jar='resources/malt.jar', options_file='resources/malt-options.xml', features_file='resources/malt-features.xml', memory_min='-Xms7g', memory_max='-Xmx8g'):

lemmatizer, tagger = Lemmatizer(), POSTagger()
train, validation, test = DadeganReader(train_file), DadeganReader(validation_file), DadeganReader(test_file)
train_sents = list(train.sents()) + list(validation.sents())
train_trees = list(train.trees()) + list(validation.trees())
lemmatizer, tagger = Lemmatizer(), POSTagger(model='resources/postagger.model')

train, test = DadeganReader(train_file), DadeganReader(test_file)
train_data = train_file +'.data'
with codecs.open(train_data, 'w', 'utf8') as output:
for tree, sentence in zip(train_trees, tagger.tag_sents(train_sents)):
for i, (node, word) in enumerate(zip(tree.nodelist[1:], sentence), start=1):
node['tag'] = word[1]
node['lemma'] = lemmatizer.lemmatize(node['word'].replace('_', ' '), node['tag'])
print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'), node['tag'], node['tag'], '_', node['head'], node['rel'], '_', '_', sep='\t', file=output)
for tree, sentence in zip(train.trees(), tagger.tag_sents(map(untag, train.sents()))):
for i, (node, word) in enumerate(zip(list(tree.nodes.values())[1:], sentence), start=1):
node['mtag'] = word[1]
node['lemma'] = lemmatizer.lemmatize(node['word'], node['mtag'])
print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'), node['mtag'], node['mtag'], '_', node['head'], node['rel'], '_', '_', sep='\t', file=output)
print(file=output)

subprocess.Popen(['java', memory_min, memory_max, '-jar', path_to_jar, '-w', 'resources', '-c', model_file, '-i', train_data, '-f', options_file, '-F', features_file, '-m', 'learn']).wait()

# evaluation
print('\nEvaluating trained model on test data:')
parser = DependencyParser(tagger=tagger, model_file=model_file)

tagged = tagger.tag_sents(test.sents())
parsed = parser.tagged_parse_sents(tagged)
parser = DependencyParser(tagger=tagger, lemmatizer=lemmatizer, model_file=model_file)
parsed_trees = parser.parse_sents(map(untag, test.sents()))

test_data, test_results = test_file +'.data', test_file +'.results'
print('\n'.join([sentence.to_conll(10).replace('/', '') for sentence in test.trees()]).strip(), file=codecs.open(test_data, 'w', 'utf8'))
print('\n'.join([sentence.to_conll(10) for sentence in parsed]).strip(), file=codecs.open(test_results, 'w', 'utf8'))

print('\n'.join([tree.to_conll(10) for tree in test.trees()]).strip(), file=codecs.open(test_data, 'w', 'utf8'))
print('\n'.join([tree.to_conll(10) for tree in parsed_trees]).strip(), file=codecs.open(test_results, 'w', 'utf8'))
subprocess.Popen(['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results]).wait()


def train_stanford_postagger(peykare_root='corpora/peykare', path_to_model='resources/persian.tagger', path_to_jar='resources/stanford-postagger.jar', properties_file='resources/stanford-postagger.props', memory_min='-Xms1g', memory_max='-Xmx6g', test_size=.1):
peykare = PeykareReader(peykare_root)
def train_stanford_postagger(peykare_root='corpora/peykare', path_to_model='resources/persian.tagger', path_to_jar='resources/stanford-postagger.jar', properties_file='resources/stanford-postagger.props', memory_min='-Xms1g', memory_max='-Xmx6g', test_size=.1, pos_map=peykare_coarse_pos_e):
peykare = PeykareReader(peykare_root, pos_map=pos_map)
train_file = 'resources/tagger_train_data.txt'
train, test = train_test_split(list(peykare.sents()), test_size=test_size, random_state=0)
print('Peykare loaded.')

output = codecs.open(train_file, 'w', 'utf8')
for sentence in train:
print(*(map(lambda w: '/'.join(w).replace(' ', '_'), sentence)), file=output)
subprocess.Popen(['java', memory_min, memory_max, '-classpath', path_to_jar, 'edu.stanford.nlp.tagger.maxent.MaxentTagger', '-prop', properties_file, '-model', path_to_model, '-trainFile', train_file, '-tagSeparator', '/', '-search', 'owlqn2']).wait()

tagger = POSTagger()
print('Tagger Accuracy on Test Split:')
tagger = StanfordPOSTagger(path_to_jar=path_to_jar, path_to_model=path_to_model)
print(tagger.evaluate(test))
12 changes: 5 additions & 7 deletions hazm/Chunker.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# coding: utf8

from __future__ import unicode_literals
from nltk.chunk import ChunkParserI, RegexpParser, ChunkScore, tree2conlltags, conlltags2tree
from nltk.tag import untag
from nltk.chunk import ChunkParserI, RegexpParser, tree2conlltags, conlltags2tree
from .SequenceTagger import IOBTagger


Expand All @@ -26,10 +25,9 @@ def tree2brackets(tree):

class Chunker(IOBTagger, ChunkParserI):
"""
>>> # from hazm import POSTagger
>>> # chunker = Chunker(tagger=POSTagger(), model='resources/chunker.model')
>>> # tree2brackets(chunker.parse([('نامه', 'Ne'), ('۱۰', 'NUM'), ('فوریه', 'Ne'), ('شما', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')]))
'[نامه ۱۰ فوریه شما NP] [را POSTP] [دریافت داشتم VP] .'
>>> chunker = Chunker(model='resources/chunker.model')
>>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('ایشان', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')]))
'[نامه ایشان NP] [را POSTP] [دریافت داشتم VP] .'
"""

def train(self, trees):
Expand All @@ -49,7 +47,7 @@ def evaluate(self, gold):
class RuleBasedChunker(RegexpParser):
"""
>>> chunker = RuleBasedChunker()
>>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('۱۰', 'NUM'), ('فوریه', 'Ne'), ('شما', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')]))
>>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('۱۰', 'NUMe'), ('فوریه', 'Ne'), ('شما', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')]))
'[نامه ۱۰ فوریه شما NP] [را POSTP] [دریافت داشتم VP] .'
"""

Expand Down
5 changes: 2 additions & 3 deletions hazm/DependencyParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
class DependencyParser(MaltParser):
"""
>>> from hazm import POSTagger, Lemmatizer
>>> tagger = POSTagger(path_to_jar='resources/stanford-postagger.jar', path_to_model='resources/persian.tagger')
>>> parser = DependencyParser(tagger=tagger, lemmatizer=Lemmatizer())
>>> parser = DependencyParser(tagger=POSTagger(model='resources/postagger.model'), lemmatizer=Lemmatizer())
>>> parser.parse(['من', 'به', 'مدرسه', 'رفته بودم', '.']).tree().pprint()
(رفته_بودم من (به مدرسه) .)
"""
Expand Down Expand Up @@ -42,7 +41,7 @@ def tagged_parse_sents(self, sentences, verbose=False):
if self._execute(cmd, verbose) != 0:
raise Exception("MaltParser parsing failed: %s" % (' '.join(cmd)))

return (DependencyGraph(item) for item in codecs.open(output_file.name, encoding='utf8').read().split('\n\n'))
return (DependencyGraph(item) for item in codecs.open(output_file.name, encoding='utf8').read().split('\n\n') if item.strip())

finally:
input_file.close()
Expand Down
12 changes: 8 additions & 4 deletions hazm/POSTagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@
from .SequenceTagger import SequenceTagger


class SequencePOSTagger(SequenceTagger):
pass
class POSTagger(SequenceTagger):
    """Part-of-speech tagger backed by :class:`SequenceTagger`.

    The class body adds no behaviour of its own: construction, model
    loading, tagging and evaluation are all inherited from the base
    class.  Pass the path of the pretrained model via ``model``.

    >>> tagger = POSTagger(model='resources/postagger.model')
    >>> tagger.tag(['من', 'به', 'مدرسه', 'رفته_بودم', '.'])
    [('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]
    """


class POSTagger(stanford.POSTagger):
class StanfordPOSTagger(stanford.POSTagger):
"""
>>> tagger = POSTagger(path_to_jar='resources/stanford-postagger.jar', path_to_model='resources/persian.tagger')
>>> tagger = StanfordPOSTagger(path_to_jar='resources/stanford-postagger.jar', path_to_model='resources/persian.tagger')
>>> tagger.tag(['من', 'به', 'مدرسه', 'رفته_بودم', '.'])
[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]
"""
Expand Down
2 changes: 1 addition & 1 deletion hazm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .Stemmer import Stemmer
from .Lemmatizer import Lemmatizer
from .SequenceTagger import SequenceTagger, IOBTagger
from .POSTagger import SequencePOSTagger, POSTagger
from .POSTagger import POSTagger, StanfordPOSTagger
from .Chunker import Chunker, RuleBasedChunker
from .DependencyParser import DependencyParser

Expand Down

0 comments on commit 5505991

Please sign in to comment.