-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_dataset.py
362 lines (287 loc) · 10.4 KB
/
make_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
import os
import re
from glob import glob

import pandas as pd
# --------------------------------------------------------------
# Read single CSV file
# --------------------------------------------------------------
# Quick look at one accelerometer and one gyroscope export before
# automating the full read below.
single_file_acc = pd.read_csv(
    "../../data/raw/A-bench-heavy_MetaWear_2019-01-14T14.22.49.165_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv"
)
single_file_gyr = pd.read_csv(
    "../../data/raw/A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C_Gyroscope_25.000Hz_1.4.4.csv"
)

# --------------------------------------------------------------
# List all data in data/raw/MetaMotion
# --------------------------------------------------------------
files = glob("../../data/raw/*.csv")
len(files)  # interactive check: how many exports were found
# --------------------------------------------------------------
# Extract features from filename
# --------------------------------------------------------------
# Each file name encodes 3 values, separated by "-":
#   participant, label (exercise) and category (light or heavy)
f = files[0]
f.split("-")[0]
f.split("-")[1]
f.split("-")[2].split("_")[0]

data_path = "../../data/raw\\"

# Extracting the participant.
# FIX: use os.path.basename so this works with both "\\" (Windows) and
# "/" (POSIX) separators; stripping the literal `data_path` prefix only
# removed the directory on Windows-style paths.
participant = os.path.basename(f).split("-")[0]
# Extracting the label
label = f.split("-")[1]
def remove_numbers(string):
    """Return *string* with every run of digits removed (e.g. "heavy2" -> "heavy")."""
    digit_runs = r"\d+"
    return re.sub(digit_runs, "", string)
# Extracting the category ("heavy2" -> "heavy": trailing digits are noise)
category = remove_numbers(f.split("-")[2].split("_")[0])

# Adding the extracted features as extra columns on the raw sensor data
df = pd.read_csv(f)
# FIX: os.path.basename handles both "/" and "\\" separators, unlike
# stripping the Windows-only `data_path` prefix with str.replace.
df["participant"] = os.path.basename(f).split("-")[0]
df["label"] = f.split("-")[1]
df["category"] = remove_numbers(f.split("-")[2].split("_")[0])
# --------------------------------------------------------------
# Read all files
# --------------------------------------------------------------
# DataFrames that accumulate all accelerometer and gyroscope recordings
acc_df = pd.DataFrame()
gyr_df = pd.DataFrame()

# Set counters, incremented after each matching file so every recording
# session keeps a unique "set" id
acc_set = 1
gyr_set = 1

# Looping through all files
for f in files:
    # Reading the file
    df = pd.read_csv(f)

    # Extract participant / label / category from the file name.
    # FIX: os.path.basename works for both "/" and "\\" separators;
    # the old replace(data_path, "") only stripped Windows-style paths.
    participant = os.path.basename(f).split("-")[0]
    label = f.split("-")[1]
    # Trailing digits ("heavy2") are set numbers, not part of the category
    category = remove_numbers(f.split("-")[2].split("_")[0])

    df["participant"] = participant
    df["label"] = label
    df["category"] = category

    # Route the file to the right accumulator based on its sensor type
    if "Accelerometer" in f:
        df["set"] = acc_set
        acc_df = pd.concat([acc_df, df])
        acc_set += 1
    if "Gyroscope" in f:
        df["set"] = gyr_set
        gyr_df = pd.concat([gyr_df, df])
        gyr_set += 1
# --------------------------------------------------------------
# Working with datetimes
# --------------------------------------------------------------
acc_df.info()

# Converting UNIX epoch (ms) to datetime
pd.to_datetime(df["epoch (ms)"], unit="ms")
# Converting an object (string) column to datetime
pd.to_datetime(df["time (01:00)"]).dt.weekday

# Use the measurement timestamp as the index of both sensor frames
for sensor_df in (acc_df, gyr_df):
    sensor_df.index = pd.to_datetime(sensor_df["epoch (ms)"], unit="ms")

# The raw time columns are redundant once the timestamp is the index
for sensor_df in (acc_df, gyr_df):
    for column in ("epoch (ms)", "time (01:00)", "elapsed (s)"):
        del sensor_df[column]

# Interactive sanity check: one participant/exercise/intensity combination
acc_df.query('participant == "E" and label == "row" and category == "medium"')
# --------------------------------------------------------------
# Turn into a single function (read_data_from_files)
# --------------------------------------------------------------
# Re-list the raw exports and the directory prefix used when cleaning names
files = glob("../../data/raw/*.csv")
data_path = "../../data/raw\\"


def remove_numbers(string):
    """Return *string* with all digit sequences stripped out."""
    return re.sub(r"\d+", "", string)
def read_data_from_files(files: list, data_path: str) -> tuple:
    """
    Read data from a list of files and extract & clean accelerometer and gyroscope data.

    Args:
        files (list): List of file paths to read data from.
        data_path (str): Path to the data directory. Kept for backward
            compatibility; file names are now resolved with os.path.basename,
            which handles both "/" and "\\" separators.

    Returns:
        tuple: A tuple containing two DataFrames - `acc_df` (accelerometer data)
            and `gyr_df` (gyroscope data), indexed by measurement timestamp.
    """
    # Create empty DataFrames to store accelerometer and gyroscope data
    acc_df = pd.DataFrame()
    gyr_df = pd.DataFrame()

    # Set counters give every recording session a unique "set" id
    acc_set = 1
    gyr_set = 1

    for f in files:
        df = pd.read_csv(f)

        # The file name encodes participant, label and category, separated
        # by "-". FIX: basename() works regardless of OS path separator;
        # the old replace(data_path, "") only stripped Windows-style paths.
        filename = os.path.basename(f)
        participant = filename.split("-")[0]
        label = filename.split("-")[1]
        # Trailing digits ("heavy2") are set numbers, not part of the category
        category = re.sub(r"\d+", "", filename.split("-")[2].split("_")[0])

        df["participant"] = participant
        df["label"] = label
        df["category"] = category

        # Route the file to the right accumulator based on its sensor type
        if "Accelerometer" in f:
            df["set"] = acc_set
            acc_df = pd.concat([acc_df, df])
            acc_set += 1
        if "Gyroscope" in f:
            df["set"] = gyr_set
            gyr_df = pd.concat([gyr_df, df])
            gyr_set += 1

    # Use the measurement timestamp as the index and drop the now-redundant
    # time columns. Guard against an empty file list, where the columns
    # would not exist and indexing would raise a KeyError.
    columns_to_remove = ["epoch (ms)", "time (01:00)", "elapsed (s)"]
    if not acc_df.empty:
        acc_df.index = pd.to_datetime(acc_df["epoch (ms)"], unit="ms")
        acc_df.drop(columns_to_remove, axis=1, inplace=True)
    if not gyr_df.empty:
        gyr_df.index = pd.to_datetime(gyr_df["epoch (ms)"], unit="ms")
        gyr_df.drop(columns_to_remove, axis=1, inplace=True)

    return acc_df, gyr_df
# Build the cleaned accelerometer and gyroscope frames from all raw files
acc_df, gyr_df = read_data_from_files(files, data_path)

# --------------------------------------------------------------
# Merging datasets in a single dataframe
# --------------------------------------------------------------
# Take only the first 3 columns (the x/y/z readings) from acc_df; the
# metadata columns (participant/label/category/set) come from gyr_df
data_merged = pd.concat([acc_df.iloc[:, :3], gyr_df], axis=1)

# Rename columns
data_merged.columns = [
    "acc_x",
    "acc_y",
    "acc_z",
    "gyr_x",
    "gyr_y",
    "gyr_z",
    "participant",
    "label",
    "category",
    "set",
]

# The two sensors sample at different frequencies, so the chance that both
# measured at exactly the same timestamp is small -> most rows contain NaNs
data_merged
# NOTE(review): `inplace=False` returns a new frame that is immediately
# discarded, so this line has no effect on `data_merged`. That looks
# deliberate here (NaNs are handled during resampling below), but assign
# the result if dropping is actually intended.
data_merged.dropna(inplace=False)
# --------------------------------------------------------------
# Resample data (frequency conversion)
# --------------------------------------------------------------
# Accelerometer: 12.500Hz
# Gyroscope: 25.000Hz
aggregation = {
    "acc_x": "mean",
    "acc_y": "mean",
    "acc_z": "mean",
    "gyr_x": "mean",
    "gyr_y": "mean",
    "gyr_z": "mean",
    "participant": "last",
    "label": "last",
    "category": "last",
    "set": "last",
}

# Preview: resample the first 1000 rows into 200 ms bins
data_merged[:1000].resample(rule="200ms").agg(aggregation, errors="ignore")

# Split the merged frame into one DataFrame per calendar day
# (presumably to avoid resampling across the long gaps between
# recording days - verify against the data)
df_by_days = []
for day, df_by_day in data_merged.groupby(pd.Grouper(freq="D")):
    df_by_days.append(df_by_day)

# Resample each day into 200 ms bins, aggregate, drop the bins left empty
# by the gaps, and stitch the days back together into a single frame
resampled_days = []
for daily_df in df_by_days:
    resampled = daily_df.resample(rule="200ms").agg(aggregation, errors="ignore")
    resampled_days.append(resampled.dropna())
data_resampled = pd.concat(resampled_days)
# --------------------------------------------------------------
# Turn into a single function (resample_and_aggregate_data)
# --------------------------------------------------------------
def resample_and_aggregate_data(
    acc: pd.DataFrame,
    gyr: pd.DataFrame,
    columns_name: list,
    rule: str,
    aggregation: dict,
) -> pd.DataFrame:
    """
    Resample and aggregate accelerometer and gyroscope data based on a specified rule and aggregation dictionary.

    Args:
        acc (pd.DataFrame): Accelerometer data; only its first 3 columns
            (the x/y/z readings) are used. Must share a datetime index with `gyr`.
        gyr (pd.DataFrame): Gyroscope data (readings plus metadata columns).
        columns_name (list): Column names for the merged DataFrame; must
            include a "set" column.
        rule (str): The resampling rule (e.g. "200ms").
        aggregation (dict): Maps each merged column to its aggregation method.

    Returns:
        pd.DataFrame: The resampled and aggregated DataFrame.
    """
    # Merge the sensor readings: x/y/z from acc, everything else from gyr
    data_merged = pd.concat([acc.iloc[:, :3], gyr], axis=1)
    data_merged.columns = columns_name

    # Group by calendar day and resample each day separately
    df_by_days = [
        df_by_day for day, df_by_day in data_merged.groupby(pd.Grouper(freq="D"))
    ]

    # Resample and concatenate each day's DataFrame, dropping the bins left
    # empty by gaps between measurements.
    # FIX: the previous `errors="ignore"` argument was removed - it is not a
    # documented DataFrame.agg() parameter and is forwarded to the aggregation
    # functions, which raises a TypeError on recent pandas versions.
    data_resampled = pd.concat(
        [df.resample(rule=rule).agg(aggregation).dropna() for df in df_by_days]
    )

    # dropna() guarantees "set" has no NaN left, so an integer dtype is safe
    data_resampled["set"] = data_resampled["set"].astype("int")
    return data_resampled
# Column layout of the merged frame: 6 sensor axes followed by 4 metadata
# columns (participant, label, category, set)
columns_name = [
    "acc_x",
    "acc_y",
    "acc_z",
    "gyr_x",
    "gyr_y",
    "gyr_z",
    "participant",
    "label",
    "category",
    "set",
]

# Sensor readings are averaged per bin; metadata keeps the last value seen
aggregation = {column: "mean" for column in columns_name[:6]}
aggregation.update({column: "last" for column in columns_name[6:]})

rule = "200ms"
data_resampled = resample_and_aggregate_data(
    acc_df, gyr_df, columns_name, rule, aggregation
)
data_resampled.info()

# --------------------------------------------------------------
# Export dataset
# --------------------------------------------------------------
# Persist the resampled frame for the next pipeline stage
export_path = "../../data/interim/01_data_resampled.pkl"
data_resampled.to_pickle(export_path)