diff --git a/quepy/dot_generation.py b/quepy/dot_generation.py index e812990..b1dafc3 100644 --- a/quepy/dot_generation.py +++ b/quepy/dot_generation.py @@ -5,13 +5,14 @@ """ import random +import six from quepy.expression import isnode from quepy.dsl import IsRelatedTo, HasKeyword from quepy.encodingpolicy import assert_valid_encoding def escape(x, add_quotes=True): - x = unicode(x) + x = six.text_type(x) x = x.replace(u" ", u"_") x = x.replace(u"\n", u"") x = x.replace(u"\00", u"") @@ -29,13 +30,13 @@ def adapt(x): if isnode(x): x = u"x{}".format(x) return x - if isinstance(x, basestring): + if isinstance(x, six.string_types): assert_valid_encoding(x) x = escape(x) if x.startswith(u"\""): return x return u'"{}"'.format(x) - return unicode(x) + return six.text_type(x) def expression_to_dot(e): diff --git a/quepy/encodingpolicy.py b/quepy/encodingpolicy.py index a415f59..70ef20a 100644 --- a/quepy/encodingpolicy.py +++ b/quepy/encodingpolicy.py @@ -12,6 +12,7 @@ """ import logging +import six from quepy import settings logger = logging.getLogger("quepy.encodingpolicy") @@ -25,7 +26,7 @@ def encoding_flexible_conversion(string, complain=False): converting a string that had to be on the right encoding. """ - if isinstance(string, unicode): + if isinstance(string, six.text_type): return string try: ustring = string.decode(settings.DEFAULT_ENCODING) @@ -44,5 +45,5 @@ def assert_valid_encoding(string): ValueError exception. """ - if not isinstance(string, unicode): + if not isinstance(string, six.text_type): raise ValueError(u"Argument must be unicode") diff --git a/quepy/expression.py b/quepy/expression.py index 0f32310..dddfba2 100644 --- a/quepy/expression.py +++ b/quepy/expression.py @@ -90,6 +90,7 @@ from collections import defaultdict from copy import deepcopy +import six def isnode(x): @@ -174,7 +175,7 @@ def iter_nodes(self): """ Iterates the indexes (the unique identifiers) of the Expression nodes. """ - return xrange(len(self.nodes)) + return six.moves.xrange(len(self.nodes)) def iter_edges(self, node): """ diff --git a/quepy/mql_generation.py b/quepy/mql_generation.py index 97b3bd7..0531df6 100644 --- a/quepy/mql_generation.py +++ b/quepy/mql_generation.py @@ -2,6 +2,7 @@ import re import json +import six from quepy.dsl import IsRelatedTo from quepy.expression import isnode from quepy.encodingpolicy import encoding_flexible_conversion @@ -25,13 +26,13 @@ def safely_to_unicode(x): Given an "edge" (a relation) or "a data" from an `Expression` graph transform it into a unicode string fitted for insertion into a MQL query. """ - if isinstance(x, unicode): + if isinstance(x, six.text_type): return x if isinstance(x, str): return encoding_flexible_conversion(x) if isinstance(x, IsRelatedTo): return u"/type/reflect/any_master" - return unicode(x) # FIXME: Any object is unicode-able, this is error prone + return six.text_type(x) # FIXME: Any object is unicode-able, this is error prone def to_bidirected_graph(e): diff --git a/quepy/nltktagger.py b/quepy/nltktagger.py index 8c9149d..8693252 100644 --- a/quepy/nltktagger.py +++ b/quepy/nltktagger.py @@ -15,6 +15,7 @@ # - "maxent_treebank_pos_tagger" in Models # - "wordnet" in Corpora +import six import nltk from quepy.tagger import Word from quepy.encodingpolicy import assert_valid_encoding @@ -25,7 +26,7 @@ def penn_to_morphy_tag(tag): assert_valid_encoding(tag) - for penn, morphy in _penn_to_morphy_tag.iteritems(): + for penn, morphy in six.iteritems(_penn_to_morphy_tag): if tag.startswith(penn): return morphy return None @@ -62,12 +63,14 @@ def run_nltktagger(string, nltk_data_path=None): word = Word(token) # Eliminates stuff like JJ|CC # decode ascii because they are the penn-like POS tags (are ascii). - word.pos = pos.split("|")[0].decode("ascii") + word_pos = pos.split("|")[0] + do_decode = isinstance(word_pos, six.binary_type) + word.pos = word_pos.decode("ascii") if do_decode else word_pos mtag = penn_to_morphy_tag(word.pos) # Nice shooting, son. What's your name? lemma = wordnet.morphy(word.token, pos=mtag) - if isinstance(lemma, str): + if isinstance(lemma, six.binary_type): # In this case lemma is example-based, because if it's rule based # the result should be unicode (input was unicode). # Since english is ascii the decoding is ok. diff --git a/quepy/quepyapp.py b/quepy/quepyapp.py index e3187d0..d676ff4 100644 --- a/quepy/quepyapp.py +++ b/quepy/quepyapp.py @@ -14,6 +14,7 @@ import logging from importlib import import_module from types import ModuleType +import six from quepy import settings from quepy import generation @@ -35,10 +36,10 @@ def install(app_name): } modules = {} - for module_name, module_path in module_paths.iteritems(): + for module_name, module_path in six.iteritems(module_paths): try: modules[module_name] = import_module(module_path.format(app_name)) - except ImportError, error: + except ImportError as error: message = u"Error importing {0!r}: {1}" raise ImportError(message.format(module_name, error)) diff --git a/quepy/sparql_generation.py b/quepy/sparql_generation.py index 3b1a218..5a168e1 100644 --- a/quepy/sparql_generation.py +++ b/quepy/sparql_generation.py @@ -4,6 +4,7 @@ Sparql generation code. """ +import six from quepy import settings from quepy.dsl import IsRelatedTo from quepy.expression import isnode @@ -13,7 +14,7 @@ def escape(string): - string = unicode(string) + string = six.text_type(string) string = string.replace("\n", "") string = string.replace("\r", "") string = string.replace("\t", "") @@ -29,12 +30,12 @@ def adapt(x): if isnode(x): x = u"?x{}".format(x) return x - if isinstance(x, basestring): + if isinstance(x, six.string_types): assert_valid_encoding(x) if x.startswith(u"\"") or ":" in x: return x return u'"{}"'.format(x) - return unicode(x) + return six.text_type(x) def expression_to_sparql(e, full=False): diff --git a/quepy/tagger.py b/quepy/tagger.py index 557e093..6ff93ad 100644 --- a/quepy/tagger.py +++ b/quepy/tagger.py @@ -8,7 +8,7 @@ # Gonzalo Garcia Berrotaran import logging - +import six from quepy import settings from quepy.encodingpolicy import assert_valid_encoding @@ -50,7 +50,7 @@ def __unicode__(self): return u"|".join(str(x) for x in attrs) def __repr__(self): - return unicode(self) + return six.text_type(self) def get_tagger(): diff --git a/test.sh b/test.sh new file mode 100644 index 0000000..e69de29 diff --git a/tests/random_expression.py b/tests/random_expression.py index d223a07..f5ed665 100644 --- a/tests/random_expression.py +++ b/tests/random_expression.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import random +import six from quepy.expression import Expression @@ -9,18 +10,21 @@ def random_data(only_ascii=False): while first or 1 / 20.0 < random.random(): first = False if only_ascii: - c = unichr(random.randint(33, 126)) + c = six.unichr(random.randint(33, 126)) data.append(c) continue x = random.random() if 0.1 > x: c = random.choice(u" ./\n") elif 0.50 > x: - c = unichr(random.randint(65, 122)) + c = six.unichr(random.randint(65, 122)) elif 0.85 > x: - c = unichr(random.randint(0, 127)) + c = six.unichr(random.randint(0, 127)) else: - c = unichr(random.randint(0, 65535)) + blacklist = [six.unichr(x) for x in range(0xd800, 0xdfff + 1)] + c = blacklist[0] + while c in blacklist: + c = six.unichr(random.randint(0, 65535)) data.append(c) return u"".join(data) @@ -34,6 +38,8 @@ def random_relation(only_ascii=False): class UnicodeableDummy(object): def __unicode__(self): return data + def __str__(self): + return data return UnicodeableDummy() diff --git a/tests/test_dot_generation.py b/tests/test_dot_generation.py index 2b8d68f..8f5d729 100644 --- a/tests/test_dot_generation.py +++ b/tests/test_dot_generation.py @@ -7,9 +7,12 @@ # Authors: Rafael Carrascosa # Gonzalo Garcia Berrotaran -import unittest +from __future__ import print_function +import sys import tempfile import subprocess +import unittest +import six from random_expression import random_expression from random import seed from quepy.dot_generation import expression_to_dot @@ -38,8 +41,8 @@ class X(FixedRelation): class TestDotGeneration(unittest.TestCase): def _standard_check(self, s, e): - self.assertIsInstance(s, unicode) - vs = [u"x{}".format(i) for i in xrange(len(e))] + self.assertIsInstance(s, six.text_type) + vs = [u"x{}".format(i) for i in six.moves.xrange(len(e))] for var in vs: self.assertIn(var, s) @@ -49,12 +52,14 @@ def test_dot_takes_unicode(self): _, s = expression_to_dot(e) self._standard_check(s, e) + @unittest.skipIf(sys.version_info[0] > 2, 'less relevant in py3') def test_dot_takes_fails_ascii1(self): e = gen_fixedtype("a") e += gen_datarel("b", "c") e = gen_fixedrelation("d", e) self.assertRaises(ValueError, expression_to_dot, e) + @unittest.skipIf(sys.version_info[0] > 2, 'less relevant in py3') def test_dot_takes_fails_ascii2(self): e = gen_fixedtype("·̣─@łæßð~¶½") e += gen_datarel("tµŧurułej€", "←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") @@ -66,17 +71,17 @@ def test_dot_stress(self): dot_file = tempfile.NamedTemporaryFile() cmdline = "dot %s" % dot_file.name msg = "dot returned error code {}, check {} input file." - for _ in xrange(100): + for i in six.moves.xrange(100): expression = random_expression() _, dot_string = expression_to_dot(expression) - with open(dot_file.name, "w") as filehandler: + with open(dot_file.name, "wb") as filehandler: filehandler.write(dot_string.encode("utf-8")) try: retcode = subprocess.call(cmdline.split(), stdout=tempfile.TemporaryFile()) except OSError: - print "Warning: the program 'dot' was not found, tests skipped" + print("Warning: the program 'dot' was not found, tests skipped") return if retcode != 0: dot_file.delete = False diff --git a/tests/test_dsl.py b/tests/test_dsl.py index 792fc91..3fbc86e 100644 --- a/tests/test_dsl.py +++ b/tests/test_dsl.py @@ -8,6 +8,7 @@ # Gonzalo Garcia Berrotaran import unittest +import six from quepy.expression import Expression from quepy.dsl import HasKeyword, FixedRelation, FixedType, \ FixedDataRelation @@ -39,9 +40,9 @@ class MyFixedType(FixedType): edges = list(fixedinstance.iter_edges(head)) self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][0], unicode) + self.assertIsInstance(edges[0][0], six.text_type) self.assertEqual(edges[0][0], u"rdf:type") - self.assertIsInstance(edges[0][1], unicode) + self.assertIsInstance(edges[0][1], six.text_type) self.assertEqual(edges[0][1], u"uranium:blowtorch") def test_fixed_data_relation(self): @@ -54,9 +55,9 @@ class MyFixedDataRelation(FixedDataRelation): edges = list(fixedinstance.iter_edges(head)) self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][0], unicode) + self.assertIsInstance(edges[0][0], six.text_type) self.assertEqual(edges[0][0], u"uranium:blowtorch") - self.assertIsInstance(edges[0][1], unicode) + self.assertIsInstance(edges[0][1], six.text_type) self.assertEqual(edges[0][1], u"soplete") def test_has_keyword(self): @@ -67,9 +68,9 @@ def test_has_keyword(self): head = keywordinstance.get_head() edges = list(keywordinstance.iter_edges(head)) self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][0], unicode) + self.assertIsInstance(edges[0][0], six.text_type) self.assertEqual(edges[0][0], u"uranium:keyword") - self.assertIsInstance(edges[0][1], unicode) + self.assertIsInstance(edges[0][1], six.text_type) self.assertEqual(edges[0][1], u'soplete') # With language @@ -79,7 +80,7 @@ def test_has_keyword(self): head = keywordinstance.get_head() edges = list(keywordinstance.iter_edges(head)) self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][1], unicode) + self.assertIsInstance(edges[0][1], six.text_type) self.assertEqual(edges[0][1], u'"soplete"@en') # With sanitize @@ -89,7 +90,7 @@ def test_has_keyword(self): head = keywordinstance.get_head() edges = list(keywordinstance.iter_edges(head)) self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][1], unicode) + self.assertIsInstance(edges[0][1], six.text_type) self.assertEqual(edges[0][1], u'"SOPLETE"@en') diff --git a/tests/test_expressions.py b/tests/test_expressions.py index 639d000..3935525 100644 --- a/tests/test_expressions.py +++ b/tests/test_expressions.py @@ -13,6 +13,7 @@ """ import unittest +import six from quepy.expression import Expression, isnode @@ -35,7 +36,7 @@ def make_canonical_expression(e): if isnode(child): child = canon[child] childs.append((label, child)) - childs.sort() + childs.sort(key=str) canon[node] = tuple(childs) return canon[e.get_head()] @@ -145,7 +146,7 @@ def setUp(self): other.add_data(0, "1") other.add_data(2, "3") other.decapitate("iuju") - for _ in xrange(5): + for _ in six.moves.xrange(5): self.e.decapitate("nouu") self.e += other @@ -237,14 +238,14 @@ def setUp(self): other = Expression() other.decapitate("onelevel") self.a = Expression() - for _ in xrange(5): + for _ in six.moves.xrange(5): self.a.decapitate("step") self.a += other other = Expression() other.decapitate("onelevel", reverse=True) self.b = Expression() - for _ in xrange(5): + for _ in six.moves.xrange(5): self.b.decapitate("step") self.b += other @@ -255,7 +256,7 @@ def setUp(self): other.add_data(0, "data") other.decapitate("onelevel") self.a = Expression() - for _ in xrange(5): + for _ in six.moves.xrange(5): self.a.decapitate("step") self.a += other @@ -263,7 +264,7 @@ def setUp(self): other.add_data(0, "data") other.decapitate("onelevel", reverse=True) self.b = Expression() - for _ in xrange(5): + for _ in six.moves.xrange(5): self.b.decapitate("step") self.b += other diff --git a/tests/test_mql_generation.py b/tests/test_mql_generation.py index d437ce1..51da8fd 100644 --- a/tests/test_mql_generation.py +++ b/tests/test_mql_generation.py @@ -10,6 +10,7 @@ import json from random import seed import unittest +import six from random_expression import random_expression from quepy.mql_generation import generate_mql @@ -34,16 +35,16 @@ def _valid_mql_query(self, query): if isinstance(x, list): self.assertIsInstance(x[0], dict) self.assertEqual(len(x), 1) - for key, value in x[0].iteritems(): - self.assertIsInstance(key, unicode) + for key, value in six.iteritems(x[0]): + self.assertIsInstance(key, six.text_type) q.append(value) else: - self.assertIsInstance(x, unicode) + self.assertIsInstance(x, six.text_type) def _valid_target_for_query(self, target, query): self.assertIsInstance(target, list) for entry in target: - self.assertIsInstance(entry, unicode) + self.assertIsInstance(entry, six.text_type) x = self._get_json(query) if x is None: return @@ -58,7 +59,7 @@ def _valid_target_for_query(self, target, query): def test_mql_stress(self): seed("playadito vs amanda... 3 focas") - for _ in xrange(100): + for _ in six.moves.xrange(100): expression = random_expression() target, mql = generate_mql(expression) self._valid_mql_query(mql) diff --git a/tests/test_nltktagger.py b/tests/test_nltktagger.py index e45e272..53ac2d1 100644 --- a/tests/test_nltktagger.py +++ b/tests/test_nltktagger.py @@ -12,12 +12,14 @@ Tests for nltktagger. """ +import sys import unittest from quepy import nltktagger from quepy.tagger import Word class TestNLTKTagger(unittest.TestCase): + def test_word_output(self): output = nltktagger.run_nltktagger(u"this is a test case «¢ðßæŋħħ") @@ -25,6 +27,7 @@ def test_word_output(self): for word in output: self.assertIsInstance(word, Word) + @unittest.skipIf(sys.version_info[0] > 2, 'less relevant in py3') def tests_wrong_input(self): self.assertRaises(ValueError, nltktagger.run_nltktagger, "this is not unicode") diff --git a/tests/test_quepyapp.py b/tests/test_quepyapp.py index 7beb106..b29ad7c 100644 --- a/tests/test_quepyapp.py +++ b/tests/test_quepyapp.py @@ -13,7 +13,7 @@ """ import unittest - +import six import quepy @@ -26,8 +26,8 @@ def test_get_query_types(self): question = "What is this?" target, query, userdata = self.app.get_query(question) - self.assertIsInstance(target, unicode) - self.assertIsInstance(query, unicode) + self.assertIsInstance(target, six.text_type) + self.assertIsInstance(query, six.text_type) def test_get_user_data(self): question = "user data" diff --git a/tests/test_sparql_generation.py b/tests/test_sparql_generation.py index 836f7f2..f4919f6 100644 --- a/tests/test_sparql_generation.py +++ b/tests/test_sparql_generation.py @@ -7,8 +7,10 @@ # Authors: Rafael Carrascosa # Gonzalo Garcia Berrotaran +import sys import re import unittest +import six from random_expression import random_expression from random import seed from quepy.sparql_generation import expression_to_sparql @@ -42,8 +44,8 @@ class TestSparqlGeneration(unittest.TestCase): re.DOTALL) def _standard_check(self, s, e): - self.assertIsInstance(s, unicode) - vs = [u"x{}".format(i) for i in xrange(len(e))] + self.assertIsInstance(s, six.text_type) + vs = [u"x{}".format(i) for i in six.moves.xrange(len(e))] for var in vs: self.assertIn(var, s) @@ -67,7 +69,7 @@ def test_sparql_takes_unicode(self): @unittest.skip("should be fixed") def test_sparql_ascii_stress(self): seed("sacala dunga dunga dunga") - for _ in xrange(100): + for _ in six.moves.xrange(100): expression = random_expression(only_ascii=True) _, s = expression_to_sparql(expression) self._standard_check(s, expression) @@ -75,7 +77,7 @@ def test_sparql_ascii_stress(self): def test_sparql_stress(self): seed("sacala dunga dunga dunga") - for _ in xrange(100): + for _ in six.moves.xrange(100): expression = random_expression() try: _, s = expression_to_sparql(expression) @@ -86,12 +88,14 @@ def test_sparql_stress(self): self._standard_check(s, expression) self._sparql_check(s) + @unittest.skipIf(sys.version_info[0] > 2, 'less relevant in py3') def test_sparql_takes_fails_ascii1(self): e = gen_fixedtype("a") e += gen_datarel("b", "c") e = gen_fixedrelation("d", e) self.assertRaises(ValueError, expression_to_sparql, e) + @unittest.skipIf(sys.version_info[0] > 2, 'less relevant in py3') def test_sparql_takes_fails_ascii2(self): e = gen_fixedtype("·̣─@łæßð~¶½") e += gen_datarel("tµŧurułej€", "←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") diff --git a/tests/test_tagger.py b/tests/test_tagger.py index 39be54a..5360c2b 100644 --- a/tests/test_tagger.py +++ b/tests/test_tagger.py @@ -12,27 +12,31 @@ Tests for tagger. """ +import sys import unittest +import six + from quepy import tagger class TestTagger(unittest.TestCase): def test_tagset_unicode(self): for tag in tagger.PENN_TAGSET: - self.assertIsInstance(tag, unicode) + self.assertIsInstance(tag, six.text_type) def test_word_encoding(self): word = tagger.Word(token=u"æßđħłłþłłł@æµß", lemma=u"ŧłþłßæ#¶ŋħ~#~@", pos=u"øĸŋøħþ€ĸłþ€øæ«»¢") - self.assertIsInstance(word.token, unicode) + self.assertIsInstance(word.token, six.text_type) self.assertEqual(word.token, u"æßđħłłþłłł@æµß") - self.assertIsInstance(word.lemma, unicode) + self.assertIsInstance(word.lemma, six.text_type) self.assertEqual(word.lemma, u"ŧłþłßæ#¶ŋħ~#~@") - self.assertIsInstance(word.pos, unicode) + self.assertIsInstance(word.pos, six.text_type) self.assertEqual(word.pos, u"øĸŋøħþ€ĸłþ€øæ«»¢") + @unittest.skipIf(sys.version_info[0] > 2, 'less relevant in py3') def test_word_wrong_encoding(self): # Token not unicode self.assertRaises(ValueError, tagger.Word, "æßđħłłþłłł@æµß", @@ -49,13 +53,14 @@ def test_word_attrib_set(self): word.lemma = u"ŧłþłßæ#¶ŋħ~#~@" word.pos = u"øĸŋøħþ€ĸłþ€øæ«»¢" - self.assertIsInstance(word.token, unicode) + self.assertIsInstance(word.token, six.text_type) self.assertEqual(word.token, u"æßđħłłþłłł@æµß") - self.assertIsInstance(word.lemma, unicode) + self.assertIsInstance(word.lemma, six.text_type) self.assertEqual(word.lemma, u"ŧłþłßæ#¶ŋħ~#~@") - self.assertIsInstance(word.pos, unicode) + self.assertIsInstance(word.pos, six.text_type) self.assertEqual(word.pos, u"øĸŋøħþ€ĸłþ€øæ«»¢") + @unittest.skipIf(sys.version_info[0] > 2, 'less relevant in py3') def test_word_wrong_attrib_set(self): word = tagger.Word(u"æßđħłłþłłł@æµß") diff --git a/tests/testapp/__init__.py b/tests/testapp/__init__.py index 03df946..4fef00e 100644 --- a/tests/testapp/__init__.py +++ b/tests/testapp/__init__.py @@ -12,4 +12,4 @@ Init for testapp quepy. """ -from basic import * \ No newline at end of file +from testapp.basic import * \ No newline at end of file