train_actor_clustering.py
# -*- coding: utf-8 -*-
"""
Created on Fri May 1 10:19:52 2020

@author: James
"""
import pickle
import random

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

from motives.run_motive_classifier import classifyDF
from topics.run_topic_quantifier import cluster_topics
from methods.run_method_encoder import tweet_info

# Combine all three measurements (motives, topics, methods) into one
# feature vector per tweet for the KNeighborsClassifier.
def combine_measures(dataframe):
    motive_classes = classifyDF(dataframe, "motives/models/model.pickle")
    topics_quantified = cluster_topics(dataframe["tweet_text"].tolist(), "topics/models/model_001.pickle")
    methods_encoded = tweet_info(dataframe)
    result = []
    for index in range(len(motive_classes)):
        # Start from the motive scores, then append the topic cluster and
        # the encoded method features for the same tweet.
        features = motive_classes[index]
        features.append(topics_quantified[index])
        features.extend(methods_encoded[index])
        result.append(features)
    return np.asarray(result)
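
# Hypothetical illustration, not from the original code: for a dataframe of
# two tweets, combine_measures could return an array shaped like
#
#   np.array([[0.1, 0.7, 0.2, 4, 1, 0, 12],
#             [0.6, 0.3, 0.1, 2, 0, 1,  5]])
#
# where each row is [motive scores..., topic cluster, method features...].
# The exact widths depend on what the motives, topics, and methods modules emit.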

def train_model():
    # Read the CSV file mapping each actor's tweet documents to a motive
    # ("apm"). The "tweet_docs" column is stored as a stringified list of
    # file paths, so parse it back into a Python list.
    labeled_data_path = "data/actors_and_motives.csv"
    df = pd.read_csv(
        labeled_data_path,
        usecols=["tweet_docs", "apm"],
        converters={"tweet_docs": lambda x: x.strip("[]").split(", ")},
    )
    # Remove the leading and trailing quote characters around each path.
    df["tweet_docs"] = [[x.strip('"') for x in docs] for docs in df["tweet_docs"]]

    # TRAIN_SPLIT is the number of samples held out for testing. MAX_LINES
    # caps how many rows are read from each tweet file, and RANDOM_SELECTION
    # subsamples each file to at most that many tweets.
    TRAIN_SPLIT = 250
    MAX_LINES = 300
    RANDOM_SELECTION = 200
    dataset = []

    # Gather feature vectors for every tweet file of every actor, each
    # labeled with that actor's motive.
    print("GATHERING DATASET...")
    counter = 0
    for i in range(len(df["tweet_docs"])):
        for path in df["tweet_docs"][i]:
            read_dataframe = pd.read_csv(path, nrows=MAX_LINES)
            if RANDOM_SELECTION < len(read_dataframe):
                read_dataframe = read_dataframe.sample(n=RANDOM_SELECTION)
            dataset += [(measure, df["apm"][i]) for measure in combine_measures(read_dataframe)]
            counter += 1
            print("Loading Dataset " + str(counter))

    # Shuffle, then hold out the first TRAIN_SPLIT samples for testing and
    # train on the remainder.
    print("SHUFFLING DATASET...")
    random.shuffle(dataset)
    x_training = np.asarray([point[0] for point in dataset[TRAIN_SPLIT:]])
    y_training = np.asarray([point[1] for point in dataset[TRAIN_SPLIT:]])
    x_testing = np.asarray([point[0] for point in dataset[:TRAIN_SPLIT]])
    y_testing = np.asarray([point[1] for point in dataset[:TRAIN_SPLIT]])

    # Sweep several values of k and keep the model that scores best on the
    # held-out test set.
    best_model = None
    best_score = 0
    chosen_k = 1
    for k in range(1, 25):
        print("TRAINING ON K-VALUE: " + str(k))
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_training, y_training)
        prediction_testing = knn.predict(x_testing)
        score = metrics.accuracy_score(y_testing, prediction_testing)
        print("\tACCURACY SCORE: " + str(score))
        if score > best_score:
            best_model = knn
            best_score = score
            chosen_k = k

    print("\n\nBEST MODEL SCORE: " + str(best_score))
    print("BEST K-VALUE: " + str(chosen_k))

    # Save the best classifier.
    save_path = "models/sampled_" + str(RANDOM_SELECTION) + "_actors_model_k" + str(chosen_k) + ".pickle"
    with open(save_path, "wb") as save_file:
        pickle.dump(best_model, save_file)
    print("MODEL SAVED")


# train_model()
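
# A minimal loading sketch, not part of the original script: the path below is
# a placeholder for whichever pickle train_model() actually wrote, and new_df
# stands for a tweet dataframe with the columns combine_measures() expects.
#
# with open("models/sampled_200_actors_model_k5.pickle", "rb") as f:
#     model = pickle.load(f)
# predicted_motives = model.predict(combine_measures(new_df))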