-
Notifications
You must be signed in to change notification settings - Fork 1
/
extract_annotations_interim.py
129 lines (96 loc) · 6.23 KB
/
extract_annotations_interim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
import json
import os
import argparse
from collections import Counter
def get_subtype(split='train'):
    """Map a dataset split name to its COCO subtype string.

    'train' and 'val' map to '<split>2014'; every other split maps to
    'test2015'.
    """
    return split + '2014' if split in ('train', 'val') else 'test2015'
def get_image_name_old(subtype='COCO', image_id=1, format='%s/COCO_%s_%012d.jpg'):
    """Build the legacy COCO image path '<subtype>/COCO_<subtype>_<12-digit id>.jpg'.

    Args:
        subtype: COCO subtype string (e.g. 'train2014'); inserted twice.
        image_id: integer image id, zero-padded to 12 digits.
        format: printf-style template with two %s slots and one %012d slot.
            (The parameter name shadows the builtin ``format`` but is kept
            unchanged so keyword callers keep working.)

    Returns:
        The formatted relative image path.

    Fix: the previous default ``image_id='1'`` was a str and raised
    TypeError under %012d; the default is now the int 1.
    """
    return format % (subtype, subtype, image_id)
def get_image_name(subtype='train2014', image_id=1, format='COCO_%s_%012d.jpg'):
    """Build a COCO image file name 'COCO_<subtype>_<12-digit id>.jpg'.

    Args:
        subtype: COCO subtype string (e.g. 'train2014').
        image_id: integer image id, zero-padded to 12 digits.
        format: printf-style template with one %s slot and one %012d slot.
            (The parameter name shadows the builtin ``format`` but is kept
            unchanged so keyword callers keep working.)

    Returns:
        The formatted image file name.

    Fix: the previous default ``image_id='1'`` was a str and raised
    TypeError under %012d; the default is now the int 1.
    """
    return format % (subtype, image_id)
def interim(questions, split='train', annotations=None,
            image_format='mscocoa/extracted_{}2014/COCO_{}2014_{:0>12}.png'):
    """Merge parallel question/annotation lists into flat interim rows.

    Args:
        questions: list of VQA question dicts with keys 'question_id',
            'image_id' and 'question'; assumed index-aligned with
            ``annotations``.
        split: dataset split name; for 'train'/'val'/'trainval' the answer
            fields are attached from ``annotations``.
        annotations: list of VQA annotation dicts, same order as
            ``questions``; may be omitted for test splits.
            Fix: was a mutable default argument ``[]``.
        image_format: str.format template receiving (split, split, image_id).

    Returns:
        A list of row dicts, one per question, each with 'question_id',
        'image_name', 'question' and — for annotated splits — 'answer' and
        'answers_occurence' (answer frequencies, most common first).
    """
    print('Interim', split)
    if annotations is None:
        annotations = []
    # Hoist the split test out of the loop — it is loop-invariant.
    has_answers = split in ('train', 'val', 'trainval')
    data = []
    for i, question in enumerate(questions):
        row = {
            'question_id': question['question_id'],
            'image_name': image_format.format(split, split, question['image_id']),
            'question': question['question'],
        }
        if has_answers:
            ann = annotations[i]
            row['answer'] = ann['multiple_choice_answer']
            answers = [a['answer'] for a in ann['answers']]
            row['answers_occurence'] = Counter(answers).most_common()
        data.append(row)
    return data
def _load_json(path):
    # Helper: read one JSON file. The original used bare json.load(open(...))
    # and never closed the handle; `with` closes it deterministically.
    with open(path, 'r') as f:
        return json.load(f)


def _dump_json(obj, path):
    # Helper: write obj as JSON, closing the handle deterministically
    # (the original's json.dump(..., open(path, 'w')) leaked handles and
    # relied on interpreter shutdown to flush).
    with open(path, 'w') as f:
        json.dump(obj, f)


def extract_annotations(data_dir='./'):
    """Build the interim question/annotation JSON files for VQA training.

    Loads the VQA v2 COCO and abstract-scenes questions/annotations from
    <data_dir>/Questions and <data_dir>/Annotations, merges each split via
    interim(), and writes train / val / trainval / test / test-dev JSON
    files under <data_dir>/interim (creating that directory if needed).
    """
    interim_dir = os.path.join(data_dir, 'interim')
    path_train_qa = os.path.join(interim_dir, 'train_questions_annotations.json')
    path_val_qa = os.path.join(interim_dir, 'val_questions_annotations.json')
    path_trainval_qa = os.path.join(interim_dir, 'trainval_questions_annotations.json')
    path_test_q = os.path.join(interim_dir, 'test_questions.json')
    path_testdev_q = os.path.join(interim_dir, 'testdev_questions.json')

    ann_dir = os.path.join(data_dir, 'Annotations')
    q_dir = os.path.join(data_dir, 'Questions')

    print('Loading annotations and questions...')
    annotations_train_1 = _load_json(os.path.join(ann_dir, 'v2_mscoco_train2014_annotations.json'))
    annotations_train_2 = _load_json(os.path.join(ann_dir, 'abstract_v002_train2015_annotations.json'))
    annotations_train_3 = _load_json(os.path.join(ann_dir, 'abstract_v002_train2017_annotations.json'))
    questions_train_1 = _load_json(os.path.join(q_dir, 'v2_OpenEnded_mscoco_train2014_questions.json'))
    questions_train_2 = _load_json(os.path.join(q_dir, 'OpenEnded_abstract_v002_train2015_questions.json'))
    questions_train_3 = _load_json(os.path.join(q_dir, 'OpenEnded_abstract_v002_train2017_questions.json'))
    annotations_val_1 = _load_json(os.path.join(ann_dir, 'v2_mscoco_val2014_annotations.json'))
    annotations_val_2 = _load_json(os.path.join(ann_dir, 'abstract_v002_val2015_annotations.json'))
    annotations_val_3 = _load_json(os.path.join(ann_dir, 'abstract_v002_val2017_annotations.json'))
    questions_val_1 = _load_json(os.path.join(q_dir, 'v2_OpenEnded_mscoco_val2014_questions.json'))
    questions_val_2 = _load_json(os.path.join(q_dir, 'OpenEnded_abstract_v002_val2015_questions.json'))
    questions_val_3 = _load_json(os.path.join(q_dir, 'OpenEnded_abstract_v002_val2017_questions.json'))
    questions_test = _load_json(os.path.join(q_dir, 'v2_OpenEnded_mscoco_test2015_questions.json'))
    question_test_dev = _load_json(os.path.join(q_dir, 'v2_OpenEnded_mscoco_test-dev2015_questions.json'))

    val_merge_1 = interim(questions=questions_val_1['questions'], annotations=annotations_val_1['annotations'],
                          split='val', image_format='mscocoa/extracted_{}2014/COCO_{}2014_{:0>12}.jpg')
    val_merge_2 = interim(questions=questions_val_2['questions'], annotations=annotations_val_2['annotations'],
                          split='val', image_format='abstract_v002/extracted_{}2015/abstract_v002_{}2015_{:0>12}.png')
    # NOTE(review): the 2017 templates embed '{}2015' in the file name while the
    # directory says '{}2017' — looks inconsistent but is reproduced unchanged;
    # confirm against the actual extracted file layout.
    val_merge_3 = interim(questions=questions_val_3['questions'], annotations=annotations_val_3['annotations'],
                          split='val', image_format='abstract_v002/extracted_scene_img_abstract_v002_{}2017/abstract_v002_{}2015_{:0>12}.png')
    train_merge_1 = interim(questions=questions_train_1['questions'], annotations=annotations_train_1['annotations'],
                            split='train', image_format='mscocoa/extracted_{}2014/COCO_{}2014_{:0>12}.jpg')
    train_merge_2 = interim(questions=questions_train_2['questions'], annotations=annotations_train_2['annotations'],
                            split='train', image_format='abstract_v002/extracted_{}2015/abstract_v002_{}2015_{:0>12}.png')
    train_merge_3 = interim(questions=questions_train_3['questions'], annotations=annotations_train_3['annotations'],
                            split='train', image_format='abstract_v002/extracted_scene_img_abstract_v002_{}2017/abstract_v002_{}2015_{:0>12}.png')
    # NOTE(review): with split='test' this template yields 'extracted_testtest/...';
    # reproduced unchanged — verify the directory name against disk.
    testset = interim(questions=questions_test['questions'], split='test',
                      image_format='mscocoa/extracted_test{}/COCO_{}2015_{:0>12}.png')
    test_dev_set = interim(questions=question_test_dev['questions'], split='test_dev',
                           image_format='mscocoa/extracted_test{}/COCO_{}2015_{:0>12}.png')

    trainset = train_merge_1 + train_merge_2 + train_merge_3
    valset = val_merge_1 + val_merge_2 + val_merge_3
    trainval = trainset + valset

    # Race-free replacement for the original exists()+makedirs() pair.
    os.makedirs(interim_dir, exist_ok=True)
    _dump_json(trainset, path_train_qa)
    _dump_json(valset, path_val_qa)
    _dump_json(testset, path_test_q)
    _dump_json(trainval, path_trainval_qa)
    _dump_json(test_dev_set, path_testdev_q)
if __name__ == '__main__':
    # CLI entry point: optionally point the extractor at a non-default
    # data directory via --folder.
    arg_parser = argparse.ArgumentParser(description='Data Dir')
    arg_parser.add_argument(
        '--folder',
        required=False,
        default='./',
        help='The path to the data directory. (default : current) if you want to install in data/')
    cli_args = arg_parser.parse_args()
    extract_annotations(cli_args.folder)