Commit dd73381

clean up tests; fix phonetic()
reynoldsnlp committed Nov 20, 2023
1 parent c19e829 commit dd73381
Showing 21 changed files with 31 additions and 97 deletions.
13 changes: 8 additions & 5 deletions README.md
@@ -74,16 +74,16 @@ import udar
doc1 = udar.Document('Мы удивились простоте системы.')
print(doc1)
# Мы мы+Pron+Pers+Pl1+Nom 0.000000
#
# удивились удивиться+V+Perf+IV+Pst+MFN+Pl 5.078125
#
# простоте простота+N+Fem+Inan+Sg+Dat 4.210938
# простоте простота+N+Fem+Inan+Sg+Loc 4.210938
#
# системы система+N+Fem+Inan+Pl+Acc 5.429688
# системы система+N+Fem+Inan+Pl+Nom 5.429688
# системы система+N+Fem+Inan+Sg+Gen 5.429688
#
# . .+CLB 0.000000
```

@@ -117,6 +117,7 @@ phonetic transcription.
| hfst\_str | `str` | Analysis stream in the XFST/HFST format |
| from\_hfst | `Document` | Create `Document` from XFST/HFST format stream |
| to\_dict | `list` | Convert to a complex list object |
| to\_html | `str` | Convert to HTML with markup in `data-` attributes |
| to\_json | `str` | Convert to a JSON string |

#### Examples
@@ -163,6 +164,7 @@ print(phonetic_doc1)
| hfst\_str | `str` | Analysis stream in the XFST/HFST format |
| from\_hfst | `Sentence` | Create `Sentence` from XFST/HFST format stream |
| to\_dict | `list` | Convert to a complex list object |
| to\_html | `str` | Convert to HTML with markup in `data-` attributes |
| to\_json | `str` | Convert to a JSON string |

### `Token` object
@@ -189,6 +191,7 @@ print(phonetic_doc1)
| cg3\_str | `str` | Analysis stream in the [VISL-CG3 format](https://visl.sdu.dk/cg3/single/#stream-vislcg) |
| hfst\_str | `str` | Analysis stream in the XFST/HFST format |
| to\_dict | `dict` | Convert to a `dict` object |
| to\_html | `str` | Convert to HTML with markup in `data-` attributes |
| to\_json | `str` | Convert to a JSON string |
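
This commit adds a `to_html` row to the `Document`, `Sentence`, and `Token` method tables above. A minimal sketch of how the new method might be called; only the method name and `str` return type come from the tables, the markup details are an assumption:

```python
import udar  # assumes the post-commit version with to_html()

doc = udar.Document('Мы удивились простоте системы.')
html = doc.to_html()  # str: HTML with analyses stored in data- attributes
print(html[:80])      # peek at the opening markup
```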

### `Reading` object
@@ -340,7 +343,7 @@ print(stressed_generator('слово+N+Neu+Inan+Pl+Nom'))

You can easily check if a morphosyntactic tag is in a `Token`, `Reading`,
or `Subreading` using `in`:

```python
token2 = udar.Token('слова', analyze=True)
print(token2)
4 changes: 2 additions & 2 deletions dev/qa.sh
@@ -50,8 +50,8 @@ mypy src/udar


echo "Running pytest..."
python3.7 -m pytest --cov=udar --cov-append --cov-report term-missing --doctest-modules
python3 -m pytest --cov=udar --cov-append --cov-report term-missing --doctest-modules


rm .coverage # can conflict with tox
rm -f .coverage # can conflict with tox
echo "If everything passes, run tox."
4 changes: 2 additions & 2 deletions hfst_vislcg3_versions.txt
@@ -1,3 +1,3 @@
Versions with which tests passed for this commit:
hfst-tokenize 0.1 (hfst 3.15.5)
VISL CG-3 Disambiguator version 1.3.4.13891
hfst-tokenize 0.1 (hfst 3.16.0)
VISL CG-3 Disambiguator version 1.4.7.13897
5 changes: 2 additions & 3 deletions pyproject.toml
@@ -1,8 +1,7 @@
[build-system]
requires = [
"setuptools>=45",
"setuptools_scm[toml]>=6.2",
"wheel"
"setuptools>=61",
"setuptools_scm"
]

build-backend = "setuptools.build_meta"
13 changes: 0 additions & 13 deletions src/udar/document.py
@@ -13,8 +13,6 @@
import unicodedata
from warnings import warn

# import nltk (this happens covertly by unpickling nltk_punkt_russian.pkl)

from .fsts import get_analyzer
from .misc import get_stanza_sent_tokenizer
from .sentence import Sentence
@@ -27,16 +25,6 @@

src = '''Мы все говорили кое о чем с тобой, но по-моему, все это ни к чему, как он сказал. Он стоял в парке и. Ленина.''' # noqa: E501

# Obsolete??
# def get_sent_tokenizer():
# global nltk_sent_tokenizer
# try:
# return nltk_sent_tokenizer
# except NameError:
# with open(f'{RSRC_DIR}/nltk_punkt_russian.pkl', 'rb') as f:
# nltk_sent_tokenizer = pickle.load(f)
# return nltk_sent_tokenizer


def _str2Sentences(input_str, **kwargs) -> List[Sentence]:
stanza_sent = get_stanza_sent_tokenizer()
@@ -117,7 +105,6 @@ def __getitem__(self, i: Union[int, slice]) -> Union[Token, List[Token]]:
warn('Indexing on large Document objects can be slow. '
'It is more efficient to index Tokens within Document.sentences',
stacklevel=3)
# TODO optimize?
return list(self)[i]

def __iter__(self) -> Iterator[Token]:
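
The `__getitem__` warning above survives the cleanup, so the efficient access pattern is worth illustrating. A hedged sketch (it assumes `Sentence` objects are themselves indexable, which the warning implies):

```python
import udar

doc = udar.Document('Мы удивились простоте системы.')
tok = doc.sentences[0][0]  # index within a sentence: no full token list built
slow = doc[0]              # also works, but materializes list(doc) first
```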
2 changes: 1 addition & 1 deletion src/udar/features/feature.py
@@ -45,7 +45,7 @@ def _get_orig_kwargs(func) -> Dict[str, Any]:
sig = inspect.signature(func)
return {name: param.default
for name, param in sig.parameters.items()
if param.default != inspect._empty} # type: ignore
if param.default != inspect._empty}

def set_default_kwargs(self, default_kwargs=None):
"""Set kwargs to be used in __call__() by default.
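
`_get_orig_kwargs` above harvests a function's declared keyword defaults via `inspect.signature`; the commit merely drops a stale `# type: ignore`. A self-contained sketch of the same pattern, using the public `inspect.Parameter.empty` (equivalent to the private `inspect._empty` in the source):

```python
import inspect
from typing import Any, Dict

def get_default_kwargs(func) -> Dict[str, Any]:
    """Map each parameter that has a default to its default value."""
    sig = inspect.signature(func)
    return {name: param.default
            for name, param in sig.parameters.items()
            if param.default is not inspect.Parameter.empty}

def demo(text, lower=True, max_len=100):  # hypothetical function
    ...

print(get_default_kwargs(demo))  # {'lower': True, 'max_len': 100}
```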
2 changes: 1 addition & 1 deletion src/udar/fsts.py
@@ -53,7 +53,7 @@ def __init__(self,
fst_dir: str = FST_DIR):
self.path2fst = f'{fst_dir}/{fname}'
if not Path(self.path2fst).exists():
print('First time use.', end=' ', file=sys.stderr)
print('First time use. Decompressing FSTs...', end=' ', file=sys.stderr)
decompress_fsts(fst_dir=fst_dir)
fst_stream = hfst.HfstInputStream(self.path2fst)
self.fst = fst_stream.read()
7 changes: 4 additions & 3 deletions src/udar/misc.py
@@ -2,9 +2,9 @@

from collections import namedtuple
from enum import Enum
from importlib.resources import files
import os
from pathlib import Path
from pkg_resources import resource_filename
import re
import sys
from typing import Dict
@@ -23,7 +23,8 @@

FST_DIR = os.getenv('UDAR_RESOURCES_DIR',
os.path.join(str(Path.home()), 'udar_resources'))
RSRC_DIR = resource_filename('udar', 'resources')
RSRC_DIR = files('udar') / 'resources'
print(RSRC_DIR)

ACUTE = '\u0301' # acute combining accent: x́
GRAVE = '\u0300' # grave combining accent: x̀
@@ -104,7 +105,7 @@ def compute_metrics(results: Dict[Result, int]):
for old, new in result_names.items():
out_dict[new] = results.get(old, 0)
Metrics = namedtuple('Metrics', sorted(out_dict)) # type: ignore
return Metrics(**out_dict) # type: ignore
return Metrics(**out_dict)


def destress(token: str) -> str:
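
This hunk swaps the deprecated `pkg_resources.resource_filename` for the standard-library `importlib.resources.files` (available since Python 3.9). A minimal sketch of the difference; the resource filename is hypothetical:

```python
from importlib.resources import files

# New style: files() returns a Traversable that joins like a Path.
RSRC_DIR = files('udar') / 'resources'
example = RSRC_DIR / 'sent1.txt'  # hypothetical resource file

# Old style, removed in this commit:
# from pkg_resources import resource_filename
# RSRC_DIR = resource_filename('udar', 'resources')
```

Note that a `Traversable` is not guaranteed to be a real filesystem path; code that needs one can wrap access in `importlib.resources.as_file`.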
Binary file removed src/udar/resources/g2p.hfstol.gz
4 changes: 2 additions & 2 deletions src/udar/resources/src/make_pkls.py
@@ -1,12 +1,12 @@
from collections import defaultdict
from importlib.resources import files
import pickle
from pkg_resources import resource_filename
import re
from statistics import mean
from sys import stderr


RSRC_DIR = resource_filename('udar', 'resources')
RSRC_DIR = files('udar') / 'resources'


def tixonov():
2 changes: 1 addition & 1 deletion src/udar/sentence.py
@@ -106,7 +106,7 @@ def get_tokenizer(use_pexpect=True) -> Tokenizer:
raise ModuleNotFoundError('Neither hfst or nltk are installed. '
'One of them must be installed for '
'tokenization.') from e
except LookupError as e:
except LookupError:
# punkt not found
try:
# attempt to download punkt
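
The narrowed `except LookupError:` (the bound `e` was unused) sits in a fallback chain: hfst tokenizer, then nltk's punkt, then a punkt download. A generic sketch of that pattern, under the assumption that nltk's Russian punkt model is the target resource (this is not udar's exact code):

```python
import nltk

def load_russian_punkt():
    """Load punkt, downloading it on first use."""
    try:
        return nltk.data.load('tokenizers/punkt/russian.pickle')
    except LookupError:  # punkt not found locally
        nltk.download('punkt')
        return nltk.data.load('tokenizers/punkt/russian.pickle')
```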
2 changes: 1 addition & 1 deletion src/udar/tag.py
@@ -172,7 +172,7 @@ def info(self):
('Prb', 'CONFIDENCE', '+Prb(lematic): затруднительно - предположительно - нет'), # noqa: E501
('Fac', 'CONFIDENCE', 'Facultative'),
('PObj', 'SYNTACTIC', 'Object of preposition (epenthetic н: него нее них)'), # noqa: E501
('Epenth', 'PHON', 'epenthesis on prepositions (о~об~обо or в~во)'), # noqa: E501
('Epenth', 'PHON', 'epenthesis on prepositions (о~об~обо or в~во)'),
('Leng', 'PHON', 'Lengthened доброй~доброю (marks less-canonical wordform that has more syllables)'), # noqa: E501
('Elid', 'PHON', 'Elided (Иванович~Иваныч, новее~новей, чтобы~чтоб, или~иль, коли~коль)'), # noqa: E501
('Use/NG', 'USE', 'Do not generate (used for apertium, etc.)'),
38 changes: 4 additions & 34 deletions src/udar/tok.py
@@ -14,7 +14,6 @@
from typing import Union

from .fsts import get_analyzer
from .fsts import get_g2p
from .fsts import get_generator
from .misc import combine_stress
from .misc import destress
@@ -624,54 +623,25 @@ def phonetic(self, *, selection: str = 'safe', guess: bool = False,
# :py:class:`Sentence` has undergone CG3 disambiguation.
# """
# TODO check if `all` selection is compatible with g2p.hfstol
# TODO make this function suck less
g2p = get_g2p()
if lemma:
self.removed_readings.extend([r for r in self.readings
if lemma not in r.lemmas])
self.readings = [r for r in self.readings if lemma in r.lemmas]
transcriptions = self.phonetic_transcriptions()
if not transcriptions:
if guess:
stress_pred = self.guess_syllable()
elif _experiment:
stress_pred = destress(self.text)
else:
stress_pred = self.text
return f'__0__{self.text}__0__'
elif len(transcriptions) == 1:
return transcriptions.pop()
else: # if there are more than one possible transcription
if selection == 'safe':
if _experiment:
stress_pred = destress(self.text)
else:
stress_pred = self.text
return f'__{self.text}__'
elif selection == 'rand':
return choice(list(transcriptions))
elif selection == 'freq':
raise NotImplementedError("The 'freq' selection method is not "
'implemented yet.')
elif selection == 'all':
stresses = self.stresses()
acutes = [(w.replace(GRAVE, '').index(ACUTE), ACUTE)
for w in stresses if ACUTE in w]
graves = [(w.replace(ACUTE, '').index(GRAVE), GRAVE)
for w in stresses if GRAVE in w]
yos = [(w.replace(GRAVE, '').replace(ACUTE, '').index('ё'), 'ё') # noqa: E501
for w in stresses if 'ё' in w]
positions = acutes + graves + yos
word = list(destress(transcriptions.pop()))
for i, char in sorted(positions, key=lambda x: (-x[0], x[1]),
reverse=True):
if char in (ACUTE, GRAVE):
word.insert(i, char)
else: # 'ё'
word[i] = char
stress_pred = ''.join(word)
raise NotImplementedError("The 'all' selection method is not "
'implemented yet.')
else:
raise NotImplementedError(f"The '{selection}' selection "
'method does not exist.')
return g2p.lookup(stress_pred)[0][0]
# TODO Are Y and S still implemented in g2p.twolc?
# if out_token.endswith("я") or out_token.endswith("Я"):
# out_token += "Y"
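
The rewritten `phonetic()` resolves ambiguity through `phonetic_transcriptions()` and an explicit `selection` strategy rather than the removed g2p lookup. A hedged sketch of the post-commit behavior, inferred from the diff (the example word and outputs are assumptions):

```python
import udar

tok = udar.Token('слова', analyze=True)  # stress-ambiguous: сло́ва vs. слова́

# 'safe': with multiple candidate transcriptions, fall back to the raw
# text wrapped in double underscores.
print(tok.phonetic(selection='safe'))    # e.g. '__слова__'

# 'rand': pick one candidate transcription at random.
print(tok.phonetic(selection='rand'))

# 'freq' and 'all' now raise NotImplementedError.
```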
2 changes: 1 addition & 1 deletion test/test_README.py
@@ -34,7 +34,7 @@ def test_block_extraction():

def test_blocks():
for block in blocks:
for code, expected_out in re.findall(r'((?:[^#].*\n)+)((?:# .*\n)*)',
for code, expected_out in re.findall(r'((?:[^#].*\n)+)((?:# ?.*\n)*)',
block):
expected_out = re.sub('^# ?', '', expected_out, flags=re.M).strip()
with stdoutIO() as s:
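
The widened pattern `(?:# ?.*\n)*` lets a bare `#` line (no trailing space) count as expected output, matching the README's blank comment lines above. A small self-contained check (the example block is an assumption):

```python
import re

block = 'print(1)\nprint(2)\n# 1\n#\n# 2\n'

# The old pattern (?:# .*\n)* required a space after '#', so a bare
# '#' line cut the expected output short; the optional space does not.
code, out = re.findall(r'((?:[^#].*\n)+)((?:# ?.*\n)*)', block)[0]
assert code == 'print(1)\nprint(2)\n'
assert out == '# 1\n#\n# 2\n'
```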
3 changes: 0 additions & 3 deletions test/test_convenience.py
@@ -1,12 +1,9 @@
from pkg_resources import resource_filename

import pytest

from udar import convenience
from udar import Document


RSRC_PATH = resource_filename('udar', 'resources/')
sent = 'Иванов и Сыроежкин говорили полчаса кое с кем о лицах, "ртах" и т.д.'


3 changes: 0 additions & 3 deletions test/test_features.py
@@ -1,14 +1,11 @@
from functools import partial
import inspect
from pkg_resources import resource_filename
import re

import udar
from udar.features import ALL


RSRC_PATH = resource_filename('udar', 'resources/')

# TODO implement text_path to share resources across tests
text_path = 'resources/sent1.txt'
text = 'Иванов и Сыроежкин говорили полчаса кое с кем о лицах, "ртах" и т.д.'
5 changes: 0 additions & 5 deletions test/test_fsts.py
@@ -1,11 +1,6 @@
from pkg_resources import resource_filename

import udar


RSRC_PATH = resource_filename('udar', 'resources/')


def test_accented_generator():
gen = udar.get_generator(stressed=True)
word = gen('слово+N+Neu+Inan+Sg+Gen')
3 changes: 0 additions & 3 deletions test/test_misc.py
@@ -1,9 +1,6 @@
from pkg_resources import resource_filename

import udar


RSRC_PATH = resource_filename('udar', 'resources/')
sent = 'Иванов и Сыроежкин говорили полчаса кое с кем о лицах, "ртах" и т.д.'


8 changes: 2 additions & 6 deletions test/test_reading.py
@@ -1,17 +1,13 @@
import pickle
from pkg_resources import resource_filename

import udar


RSRC_PATH = resource_filename('udar', 'resources/')


def test_rule():
r = udar.reading.Reading(*("слово+N+Neu+Inan+Pl+Ins",
'5.975586',
'SELECT:41:stuff'))
assert type(r) == udar.reading.Reading
assert isinstance(r, udar.reading.Reading)
assert r.lemmas == ['слово']
assert r.grouped_tags == 'N+Neu+Inan+Pl+Ins'.split('+')
assert r.weight == '5.975586'
@@ -20,7 +16,7 @@ def test_rule():

def test_multiple_subreadings():
r = udar.reading.Reading(*('и т.д.+Abbr#.+SENT', '0.000000'))
assert type(r) == udar.reading.Reading
assert isinstance(r, udar.reading.Reading)
assert len(r.subreadings) == 2


5 changes: 0 additions & 5 deletions test/test_subreading.py
@@ -1,14 +1,9 @@
import pickle
from pkg_resources import resource_filename

import pytest

import udar


RSRC_PATH = resource_filename('udar', 'resources/')


def test_simple():
s = udar.reading.Subreading('слово+N+Neu+Inan+Pl+Ins')
assert s.lemma == 'слово'
3 changes: 0 additions & 3 deletions test/test_tag.py
@@ -1,9 +1,6 @@
from pkg_resources import resource_filename

import udar


RSRC_PATH = resource_filename('udar', 'resources/')
sent = 'Иванов и Сыроежкин говорили полчаса кое с кем о лицах, "ртах" и т.д.'


