-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
98 lines (71 loc) · 3.03 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os
import re
import shutil
import textract
from nltk import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
def load_word(_file_path=""):
    """Read a text file and split its contents into word tokens.

    :param _file_path: path of the file to read. An empty (or otherwise
        falsy, e.g. ``None``) path means "no file".
    :return: the list of tokens produced by ``word_tokenize`` for the file's
        text, or an empty list when no path was given.
    """
    if not _file_path:
        return []
    # The context manager closes the handle even if read() raises, which the
    # manual open()/close() pairing did not guarantee.
    with open(_file_path, 'r') as _file:
        _text = _file.read()
    return word_tokenize(_text)
# ---------------------------------------------------------------------------
# settings
# ---------------------------------------------------------------------------
# Directory holding the input documents. The trailing slash matters: file
# paths below are built by plain string concatenation.
input_dir_path = "curriculum_vitae_data/word/"
output_dir_path = "output/"

# Word lists whose tokens are added to the stop-word set.
names_file_path = 'resource/human_names.txt'
cities_names_file_path = 'resource/common_cities_state_countries_names.txt'
specific_stopwords_file_path = "resource/specific_stopwords.txt"

# Regex for a token: a run of ASCII letters and apostrophes.
tokenizer = '[a-zA-Z\']+'
punctuation = ['.', ',', '\"', '\'', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '%']

# ---------------------------------------------------------------------------
# load all files (pdf/word files, names, specific stopwords)
# ---------------------------------------------------------------------------
human_names = load_word(names_file_path)
cities_names = load_word(cities_names_file_path)
remove_keywords = load_word(specific_stopwords_file_path)

filenames = os.listdir(input_dir_path)
docs = []
for filename in filenames:
    # Extract the raw text of each input document, in listing order.
    docs.append(textract.process(input_dir_path + filename))

# ---------------------------------------------------------------------------
# preprocessing
# ---------------------------------------------------------------------------
# Drop every standalone word of one or two characters.
short_word = re.compile(r'\b\w{1,2}\b')
docs = [short_word.sub('', doc) for doc in docs]

# NOTE: `unicode` is the Python 2 builtin; the extracted text is decoded from
# UTF-8 only after the short-word filter has run.
unicode_docs = [unicode(doc, 'utf-8') for doc in docs]

# Stop words: scikit-learn's English list, the punctuation marks and the
# three project word lists, all decoded from UTF-8, then NLTK's English list.
combined_stop_words = text.ENGLISH_STOP_WORDS.union(
    punctuation, remove_keywords, human_names, cities_names)
stop_words = frozenset(unicode(_word, 'utf-8') for _word in combined_stop_words)
stop_words = stop_words.union(stopwords.words('english'))

stemmer = SnowballStemmer('english', ignore_stopwords=False)
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant whose analyzer stems every token it yields.

    :param _stemmer: object exposing a ``stem(word)`` method; it is stored on
        the instance as ``self.stemmer``.
    All remaining positional and keyword arguments are forwarded unchanged to
    ``TfidfVectorizer.__init__``.
    """

    def __init__(self, _stemmer, *args, **kwargs):
        super(StemmedTfidfVectorizer, self).__init__(*args, **kwargs)
        self.stemmer = _stemmer

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stems.

        The document's newlines are replaced by spaces, the parent class's
        analyzer is run on the result, and each token it produces is passed
        through ``self.stemmer.stem``.
        """
        base_analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()

        def stemmed_tokens(_doc):
            single_line = _doc.replace('\n', ' ')
            return (self.stemmer.stem(token) for token in base_analyzer(single_line))

        return stemmed_tokens
# TF-IDF, features extraction
vectorizer = StemmedTfidfVectorizer(_stemmer=stemmer, stop_words=stop_words, token_pattern=tokenizer)
X = vectorizer.fit_transform(unicode_docs)
word_features = vectorizer.get_feature_names()

# clustering
# KMeans refuses to fit when n_clusters exceeds the number of samples, so the
# target of 50 clusters is clamped to the number of vectorized documents.
max_clusters = 50
km = KMeans(n_clusters=min(max_clusters, X.shape[0]), init='k-means++', max_iter=1000, n_init=1)
km.fit(X)

# output generation
# For every centroid take the indices of its 14 highest-weighted features
# (argsort is ascending, hence the reversed slice from the end).
common_words = km.cluster_centers_.argsort()[:, -1:-15:-1]

# Each cluster's directory is named after its top feature words.
output_subdir_names = [
    ', '.join(word_features[word] for word in centroid)
    for centroid in common_words
]
output_subdir_paths = [
    os.path.join(output_dir_path, _output_subdir_name)
    for _output_subdir_name in output_subdir_names
]

# Start from an empty output directory on every run.
if os.path.exists(output_dir_path):
    shutil.rmtree(output_dir_path)
os.mkdir(output_dir_path)

for _output_subdir_path in output_subdir_paths:
    # Two clusters can share the same top words, and therefore the same path.
    if not os.path.exists(_output_subdir_path):
        os.makedirs(_output_subdir_path)

# km.labels_ is aligned with the order in which the documents were vectorized,
# which is the order of `filenames`.
for _filename, label in zip(filenames, km.labels_):
    shutil.copy(os.path.join(input_dir_path, _filename), output_subdir_paths[label])