-
Notifications
You must be signed in to change notification settings - Fork 1
/
tool.py
61 lines (56 loc) · 1.96 KB
/
tool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from pathlib import Path
from pretreatment.parser import line_parser
import numpy as np
from pretreatment.tool import pad_array
import random
import re
# Tokenizer pattern: matches any single character that is not an ASCII letter
# or digit. The capturing group makes re.split() keep each matched separator
# in its result instead of discarding it.
split_re = re.compile(r"([^a-zA-Z0-9])")
def resource(dir_name):
    """Load every entry of ``dir_name`` as a labelled sample, in random order.

    :param dir_name: directory whose direct entries are read (not recursive)
    :return: shuffled list of ``[lines, label, path]`` items, where ``lines``
        is the list of raw ``bytes`` lines of the file, ``label`` is 0 when
        the full path string contains ``"white_lee"`` and 1 otherwise, and
        ``path`` is the path as a string
    """
    samples = []
    for path in Path(dir_name).iterdir():
        # Read inside a context manager so each handle is closed right away
        # instead of staying open until the garbage collector reclaims it.
        with open(path, 'rb') as handle:
            lines = handle.readlines()
        name = str(path)
        # The label is derived from the whole path, directory part included.
        samples.append([lines, 0 if "white_lee" in name else 1, name])
    random.shuffle(samples)
    return samples
def spliter(code):
    """Drop comment text from one line of code and split it into tokens.

    ``code`` is processed through its ``str()`` form. For a ``bytes`` line
    that form looks like ``b'...\\r\\n'``, and several of the comment patterns
    below are anchored on that ``b'`` prefix.

    Comment handling, applied in order:

    * a line whose ``str()`` form starts with ``b'`` followed by whitespace
      and that contains ``*`` (with or without ``/*`` / ``*/``) is blanked;
    * text ending in ``*/`` that contains ``b'`` is removed from the ``b'``
      onwards;
    * everything from ``//``, ``<!--``, ``<%/*`` or ``<%--`` to the end of
      the line is removed.

    The ``b'...'`` wrapper and the trailing escaped line terminators are then
    removed as exact affixes. Earlier code used ``str.strip`` with these
    strings, which strips *character sets* and therefore also ate leading or
    trailing ``b``, ``r``, ``n``, ``'`` and ``\\`` characters belonging to the
    code itself (``return x;`` became ``eturn x;``).

    :param code: one line of code, normally ``bytes``
    :return: list of tokens (alphanumeric runs and single non-alphanumeric
        characters); empty strings and single spaces are dropped
    """
    text = str(code)

    # Order is preserved from the original implementation.
    comment_patterns = (
        r"^b'\s.*/\*.*$",
        r"^b'\s.*\*.*$",
        r"b'.*\*/$",
        r"^b'\s.*\*/.*$",
        r"//.*$",
        r"<!--.*$",
        r"<%/\*.*$",
        r"<%--.*$",
        r"^b'\s.*\*.*$",
    )
    for pattern in comment_patterns:
        text = re.sub(pattern, "", text)

    if isinstance(code, bytes):
        # Remove the literal bytes-repr prefix (b' or b") only once.
        if text.startswith(("b'", 'b"')):
            text = text[2:]
        # Remove trailing escaped CR/LF sequences plus the closing quote, if
        # a comment pattern has not already consumed the tail of the line.
        text = re.sub(r"""(?:\\[rn])*['"]?$""", "", text)
    else:
        # Plain text input carries real line terminators, not escaped ones.
        text = text.rstrip("\r\n")

    return [token for token in split_re.split(text) if token and token != " "]
@line_parser(name="vector", max_length=250)
def word_vector(resource, embedding):
    """Encode the leading lines of a sample as a matrix of word indices.

    Up to ``max_sentence`` lines of ``resource[0]`` are tokenized with
    ``spliter``, cut to at most 250 tokens, mapped through
    ``embedding.get_index`` and passed to ``pad_array`` with a target of 250
    and ``embedding.padding_word`` as filler. When fewer than ``max_sentence``
    lines are available, rows of 250 padding values are appended.

    :param resource: sample whose first item is an iterable of code lines
    :param embedding: object providing ``get_index`` and ``padding_word``
    :return: ``np.array`` built from the ``max_sentence`` rows
        (presumably shape (10, 250) -- depends on ``pad_array``; confirm)
    """
    max_sentence = 10
    line_width = 250

    rows = []
    for raw_line in resource[0]:
        tokens = spliter(raw_line)[0:line_width]
        indices = np.array([embedding.get_index(token) for token in tokens])
        rows.append(pad_array(indices, line_width, embedding.padding_word))
        # Only the first max_sentence lines are used.
        if len(rows) == max_sentence:
            break

    # Top up short samples with rows made entirely of the padding value.
    missing = max_sentence - len(rows)
    for _ in range(missing):
        rows.append(np.ones(line_width) * embedding.padding_word)

    return np.array(rows)
@line_parser(name="label", max_length=0)
def label(resource, embedding):
    """Return the sample's label as a one-element array.

    :param resource: sample whose second item (index 1) is the label
    :param embedding: unused here
    :return: ``np.array`` holding just ``resource[1]``
    """
    sample_label = resource[1]
    return np.array([sample_label])
@line_parser(name="filename", max_length=100)
def file(resource,embedding):
    """Return the sample's file name as a one-element array.

    :param resource: sample whose third item (index 2) is the file path
    :param embedding: unused here
    :return: ``np.array`` holding just ``resource[2]``
    """
    sample_path = resource[2]
    return np.array([sample_path])