
Commit

updated to python3.7; minor bugs fixed
siyuanzhao committed Sep 2, 2019
1 parent 0b932d3 commit cda24d8
Showing 4 changed files with 25 additions and 24 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -1,6 +1,6 @@
 # Automated Essay Grading
 Source code for the paper [A Memory-Augmented Neural Model for Automated Grading](http://dl.acm.org/citation.cfm?doid=3051457.3053982) in L@S 2017.
-
+**Note** that the recent check-in updates the code from Python 2.5 to Python 3.7.
 ![Model Structure](AES-Model.png)

 The dataset comes from the Kaggle ASAP competition. You can download the data from the link below.
@@ -25,6 +25,7 @@ git clone https://github.com/siyuanzhao/automated-essay-grading.git
 * Tensorflow 1.10
 * scikit-learn 0.19
 * six 1.10.0
+* **python 3.7**

 ### Usage
32 changes: 16 additions & 16 deletions cv_train.py
@@ -8,15 +8,15 @@
 import sys
 import pandas as pd

-print 'start to load flags\n'
+print('start to load flags\n')

 # flags
 tf.flags.DEFINE_float("epsilon", 0.1, "Epsilon value for Adam Optimizer.")
 tf.flags.DEFINE_float("l2_lambda", 0.3, "Lambda for l2 loss.")
-tf.flags.DEFINE_float("learning_rate", 0.002, "Learning rate")
+tf.flags.DEFINE_float("learning_rate", 0.001, "Learning rate")
 tf.flags.DEFINE_float("max_grad_norm", 10.0, "Clip gradients to this norm.")
 tf.flags.DEFINE_float("keep_prob", 0.9, "Keep probability for dropout")
-tf.flags.DEFINE_integer("evaluation_interval", 2, "Evaluate and print results every x epochs")
+tf.flags.DEFINE_integer("evaluation_interval", 5, "Evaluate and print results every x epochs")
 tf.flags.DEFINE_integer("batch_size", 15, "Batch size for training.")
 tf.flags.DEFINE_integer("feature_size", 100, "Feature size")
 tf.flags.DEFINE_integer("num_samples", 1, "Number of samples selected from training for each score")
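
For context on how these definitions behave at run time: with the absl-backed `tf.flags` in TF 1.x, `FLAGS` parses `sys.argv` lazily on first attribute access, so any default above can be overridden per run (e.g. `--learning_rate=0.002`). A minimal sketch, assuming that TF 1.x behavior:

```python
import tensorflow as tf

tf.flags.DEFINE_float("learning_rate", 0.001, "Learning rate")
FLAGS = tf.flags.FLAGS

# First attribute access triggers parsing of sys.argv, so running
# `python cv_train.py --learning_rate=0.002` prints 0.002 here.
print(FLAGS.learning_rate)
```
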
@@ -66,13 +66,13 @@
 print("Writing to {}\n".format(out_dir))

 print("\nParameters:")
-for attr, value in sorted(FLAGS.__flags.items()):
-    print("{}={}".format(attr.upper(), value))
+for key in sorted(FLAGS.__flags.keys()):
+    print("{}={}".format(key, getattr(FLAGS, key)))
 print("")

 with open(out_dir+'/params', 'w') as f:
-    for attr, value in sorted(FLAGS.__flags.items()):
-        f.write("{}={}".format(attr.upper(), value))
+    for key in sorted(FLAGS.__flags.keys()):
+        f.write("{}={}".format(key, getattr(FLAGS, key)))
         f.write("\n")

 # hyper-parameters end here
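
Why the loops above changed: in the absl-backed `tf.flags`, `FLAGS.__flags` maps each flag name to a `Flag` object rather than its raw value, so the old `attr, value` version printed object reprs; `getattr(FLAGS, key)` returns the parsed value. A minimal sketch, assuming that implementation:

```python
import tensorflow as tf

tf.flags.DEFINE_integer("batch_size", 15, "Batch size for training.")
FLAGS = tf.flags.FLAGS
_ = FLAGS.batch_size  # force flag parsing on first access

# The dict values are Flag objects, so only the keys are useful here;
# getattr(FLAGS, key) resolves each name to its parsed value.
for key in sorted(FLAGS.__flags.keys()):
    print("{}={}".format(key, getattr(FLAGS, key)))
```
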
@@ -86,7 +86,7 @@
 elif essay_set_id == 8:
     min_score, max_score = 0, 60

-print 'max_score is {} \t min_score is {}\n'.format(max_score, min_score)
+print('max_score is {} \t min_score is {}\n'.format(max_score, min_score))
 with open(out_dir+'/params', 'a') as f:
     f.write('max_score is {} \t min_score is {} \n'.format(max_score, min_score))

@@ -101,15 +101,15 @@
 vocab_size = len(word_idx) + 1
 # stat info on data set

-sent_size_list = map(len, [essay for essay in essay_list])
+sent_size_list = list(map(len, [essay for essay in essay_list]))
 max_sent_size = max(sent_size_list)
-mean_sent_size = int(np.mean(map(len, [essay for essay in essay_list])))
+mean_sent_size = int(np.mean(sent_size_list))

-print 'max sentence size: {} \nmean sentence size: {}\n'.format(max_sent_size, mean_sent_size)
+print('max sentence size: {} \nmean sentence size: {}\n'.format(max_sent_size, mean_sent_size))
 with open(out_dir+'/params', 'a') as f:
     f.write('max sentence size: {} \nmean sentence size: {}\n'.format(max_sent_size, mean_sent_size))

-print 'The length of score range is {}'.format(len(score_range))
+print('The length of score range is {}'.format(len(score_range)))
 E = data_utils.vectorize_data(essay_list, word_idx, max_sent_size)

 labeled_data = zip(E, resolved_scores, sent_size_list)
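
The `list(map(...))` wrappers are the substance of this hunk: Python 3's `map()` returns a one-shot iterator, and `sent_size_list` is consumed twice (once for the max, once for the mean). A small sketch of the failure mode:

```python
import numpy as np

essay_list = [["a", "b"], ["c"], ["d", "e", "f"]]

sizes = map(len, essay_list)            # Python 3: lazy, single-pass
max_size = max(sizes)                   # this consumes the iterator
print(max_size, list(sizes))            # 3 [] -- nothing left to average

sizes = list(map(len, essay_list))      # materialized once, reused safely
print(max(sizes), int(np.mean(sizes)))  # 3 2
```

The same caveat applies to the unchanged `labeled_data = zip(...)` line: in Python 3 it is also a one-shot iterator and can only be traversed once.
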
@@ -198,14 +198,14 @@ def test_step(e, m):
     # bad naming
     train_scores_encoding = train_scores
 else:
-    train_scores_encoding = map(lambda x: score_range.index(x), train_scores)
+    train_scores_encoding = list(map(lambda x: score_range.index(x), train_scores))

 # data size
 n_train = len(trainE)
 n_test = len(testE)

-print 'The size of training data: {}'.format(n_train)
-print 'The size of testing data: {}'.format(n_test)
+print('The size of training data: {}'.format(n_train))
+print('The size of testing data: {}'.format(n_test))
 with open(out_dir+'/params{}'.format(fold_count), 'a') as f:
     f.write('The size of training data: {}\n'.format(n_train))
     f.write('The size of testing data: {}\n'.format(n_test))
@@ -266,7 +266,7 @@ def test_step(e, m):
         _, cost, time_spent = train_step(batched_memory, e, s, mem_atten_encoding)
         total_time += time_spent
         train_cost += cost
-    print 'Finish epoch {}, total training cost is {}, time spent is {}'.format(i, train_cost, total_time)
+    print('Finish epoch {}, total training cost is {}, time spent is {}'.format(i, train_cost, total_time))
     # evaluation
     if i % FLAGS.evaluation_interval == 0 or i == FLAGS.epochs:
         # test on training data
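
For reference, the evaluation condition above with the new default `evaluation_interval = 5` fires every fifth epoch plus the final one; a toy trace, assuming epochs count from 1:

```python
epochs, evaluation_interval = 12, 5

for i in range(1, epochs + 1):
    if i % evaluation_interval == 0 or i == epochs:
        print("evaluate at epoch", i)  # epochs 5, 10, 12
```
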
10 changes: 5 additions & 5 deletions data_utils.py
@@ -6,7 +6,7 @@
 from collections import Counter

 def load_training_data(training_path, essay_set=1):
-    training_df = pd.read_csv(training_path, delimiter='\t')
+    training_df = pd.read_csv(training_path, delimiter='\t', encoding="ISO-8859-1")
     # resolved score for essay set 1
     resolved_score = training_df[training_df['essay_set'] == essay_set]['domain1_score']
     essay_ids = training_df[training_df['essay_set'] == essay_set]['essay_id']
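
Why the explicit encoding: Python 3 decodes text files up front (default UTF-8) where Python 2 handed back raw bytes, so bytes in the ASAP TSV that are not valid UTF-8 — which this fix implies are present — raise `UnicodeDecodeError`. Latin-1 assigns every byte a code point, so the read cannot fail that way. A sketch with an illustrative path:

```python
import pandas as pd

# "data/training_set.tsv" is a placeholder path, not the repo's.
training_df = pd.read_csv("data/training_set.tsv", delimiter="\t",
                          encoding="ISO-8859-1")
print(training_df.shape)
```
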
@@ -30,12 +30,12 @@ def load_glove(token_num=6, dim=50):
     for line in f:
         l = line.split()
         word = l[0]
-        vector = map(float, l[1:])
+        vector = list(map(float, l[1:]))
         word_idx[word] = count
         word2vec.append(vector)
         count += 1

-    print "==> glove is loaded"
+    print("==> glove is loaded")

     return word_idx, word2vec

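The `list(...)` around `map(float, ...)` matters for the same iterator reason as in `cv_train.py`: without it, `word2vec` would collect unevaluated `map` objects rather than numeric vectors. A sketch of parsing one GloVe line (assumed whitespace-separated format: a token followed by its vector components):

```python
line = "the 0.418 0.24968 -0.41242 0.1217"
l = line.split()
word = l[0]
vector = list(map(float, l[1:]))  # a real list of floats, safe to
                                  # append and convert to an array later
print(word, vector)
```
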
@@ -46,7 +46,7 @@ def tokenize(sent):
     >>> tokenize("I don't know")
     ['I', 'don', "'", 't', 'know']
     '''
-    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]
+    return [x.strip() for x in re.split('(\W+)', sent) if x.strip()]

 def clean_str(string):
     """
@@ -76,7 +76,7 @@ def build_vocab(sentences, vocab_limit):
     """
     # Build vocabulary
     word_counts = Counter(itertools.chain(*sentences))
-    print 'Total size of vocab is {}'.format(len(word_counts.most_common()))
+    print('Total size of vocab is {}'.format(len(word_counts.most_common())))
     # Mapping from index to word
     # vocabulary_inv = [x[0] for x in word_counts.most_common(vocab_limit)]
     vocabulary_inv = [x[0] for x in word_counts.most_common(vocab_limit)]
4 changes: 2 additions & 2 deletions memn2n_kv.py
@@ -218,15 +218,15 @@ def _build_inputs(self):
     def _key_addressing(self, mkeys, mvalues, questions, r_list):
         self.mem_attention_probs = []
         with tf.variable_scope(self._name):
-            questions = tf.nn.dropout(questions, self.keep_prob)
+            questions = tf.nn.dropout(questions, rate=1-self.keep_prob)
             # [feature_size, batch_size]
             u_o = tf.matmul(self.A, questions, transpose_b=True)
             u = [u_o]
             hop_probs = []
             for _ in range(self._hops):
                 R = r_list[_]
                 u_temp = u[-1]
-                mk_temp = tf.nn.dropout(mkeys, self.keep_prob)
+                mk_temp = tf.nn.dropout(mkeys, rate=1-self.keep_prob)
                 # [reader_size, batch_size x memory_size]
                 k_temp = tf.reshape(tf.transpose(mk_temp, [2, 0, 1]), [self.reader_feature_size, -1])
                 # [feature_size, batch_size x memory_size]
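
The dropout calls swap the deprecated `keep_prob` argument for `rate`, the probability of *dropping* a unit, hence `rate=1-keep_prob`. One caveat: this appears to require a newer TensorFlow than the 1.10 pinned in the README, since `rate` was only added to `tf.nn.dropout` in later 1.x releases (around 1.13). A minimal sketch of the equivalence:

```python
import tensorflow as tf

x = tf.ones([4, 10])
keep_prob = 0.9

y_old = tf.nn.dropout(x, keep_prob=keep_prob)  # deprecated spelling
y_new = tf.nn.dropout(x, rate=1 - keep_prob)   # current spelling
```
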
