Commit: [UPDATE]
vmspereira committed Aug 30, 2023
1 parent 167d430 commit 3fbd99d
Showing 17 changed files with 317 additions and 107 deletions.
11 changes: 10 additions & 1 deletion README.md
@@ -37,20 +37,27 @@ The _script_ folder contains some notebooks to test your code.

## ML Algorithms

### Pre-processing

- Standard Scaler
- Variance Threshold
- Select K-best


### Unsupervised

- Principal Component Analysis
- K-means Clustering

### Supervised

- Linear regression
- Logistic regression
- Naive Bayes
- Decision Tree
- Random Forest
- k-Nearest Neighbors
- SVM


- Neural Networks
@@ -59,10 +66,12 @@ The _script_ folder contains some notebooks to test your code.
- Conv2D (using Img2Col)
- MaxPooling2D
- DropOut
- BatchNormalization
- RNN


- Grid Search
- Voting Ensemble
- Bagging Ensemble
- Cross Validation
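
A minimal usage sketch tying a few of these pieces together (import paths inferred from the repository layout; the toy data and expected output are illustrative):

```python
import numpy as np

from si.data.dataset import Dataset
from si.supervised.knn import KNN

# Toy data: two 2-D blobs, one per class.
X = np.vstack([np.random.randn(20, 2), np.random.randn(20, 2) + 3])
y = np.array([0] * 20 + [1] * 20)

ds = Dataset(X=X, y=y)
knn = KNN(num_neighbors=3)
knn.fit(ds)
print(knn.predict(np.array([2.5, 2.5])))  # expected: 1 (closer to the second blob)
```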

## License
106 changes: 71 additions & 35 deletions scripts/eval4.ipynb


27 changes: 16 additions & 11 deletions scripts/eval5.ipynb


6 changes: 4 additions & 2 deletions src/si/data/dataset.py
@@ -9,10 +9,9 @@
import numpy as np
from ..util.util import label_gen

__all__ = ['Dataset']


class Dataset:

def __init__(self, X=None, y=None,
xnames: list = None,
yname: str = None):
@@ -110,6 +109,9 @@ def toDataframe(self):
columns = self._xnames[:]
return pd.DataFrame(fullds, columns=columns)

def _repr_html_(self) -> str:
    # Jupyter's rich-display hook must be named `_repr_html_` (single
    # underscores) to be picked up and render the dataset as a table.
    return self.toDataframe().to_html()

def getXy(self):
return self.X, self.y

76 changes: 52 additions & 24 deletions src/si/supervised/dt.py
@@ -11,6 +11,18 @@
import numpy as np


def entropy(y):
"""Calculates the entropy"""
hist = np.bincount(y)
ps = hist / len(y)
return -np.sum([p * np.log2(p) for p in ps if p > 0])


def gini(probas):
"""Calculates gini criterion"""
return 1 - np.sum(probas**2)
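
# Intuition for the two criteria (illustrative values, not part of the commit):
#   entropy(np.array([0, 0, 1, 1]))  -> 1.0  (two balanced classes)
#   gini(np.array([0.5, 0.5]))       -> 0.5  (maximum impurity for two classes)
#   gini(np.array([1.0, 0.0]))       -> 0.0  (pure node)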


class Node:
"""Implementation of a simple binary tree for DT classifier."""

@@ -20,7 +32,7 @@ def __init__(self):
# derived from splitting criteria
self.column = None
self.threshold = None
# probability of an object inside the Node belonging
# to each of the given classes
self.probas = None
# depth of the given node
@@ -30,8 +42,10 @@ def __init__(self):


class DecisionTree(Model):
def __init__(
    self, max_depth: int = 3, min_samples_leaf: int = 1, min_samples_split: int = 2
) -> None:
super().__init__()
self.max_depth = max_depth
self.min_samples_leaf = min_samples_leaf
@@ -50,19 +64,30 @@ def node_probs(self, y):
probas.append(proba)
return np.asarray(probas)


def calc_impurity(self, y):
    """
    Wrapper for the impurity calculation: computes the class probabilities
    first and then passes them to the Gini criterion,
    Gini(p) = 1 - sum_i p_i**2.
    The Gini impurity measures how often an element of the dataset would
    be mislabelled if it were labelled randomly according to the class
    distribution in the node. Its minimum value is 0, reached when the
    node is pure (all contained elements belong to a single class), in
    which case the node is not split again; the optimal split is therefore
    the one with the lowest Gini index. The impurity is maximal when the
    classes are equally probable.
    """
    return gini(self.node_probs(y))

# TODO: Entropy criterion #######################################
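
def calc_impurity_entropy(self, y):
    # A possible sketch for the TODO above (hypothetical, not part of this
    # commit): an entropy-based impurity mirroring calc_impurity, reusing
    # the module-level entropy() helper defined earlier.
    return entropy(y)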

def calc_best_split(self, X, y):
"""
Calculates the best possible split for the current node of the tree.
"""

bestSplitCol = None
bestThresh = None
@@ -89,8 +114,9 @@ def calc_best_split(self, X, y):

# calculate information gain
infoGain = impurityBefore
infoGain -= (impurityLeft * y_left.shape[0] / y.shape[0]) + (
    impurityRight * y_right.shape[0] / y.shape[0]
)
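# i.e., information gain = parent impurity minus the size-weighted
# average impurity of the two children.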

# is this infoGain better than all the others?
if infoGain > bestInfoGain:
@@ -111,9 +137,9 @@
return bestSplitCol, bestThresh, x_left, y_left, x_right, y_right

def build_dt(self, X, y, node):
"""
Recursively builds the decision tree from top to bottom.
"""
# checking for the terminal conditions
if node.depth >= self.max_depth:
node.is_terminal = True
@@ -133,7 +159,10 @@ def build_dt(self, X, y, node):
if splitCol is None:
node.is_terminal = True

if (
    x_left.shape[0] < self.min_samples_leaf
    or x_right.shape[0] < self.min_samples_leaf
):
node.is_terminal = True
return

@@ -166,11 +195,11 @@ def fit(self, dataset):
self.is_fitted = True

def predict_sample(self, x, node):
"""
Passes one object through the decision tree and returns the probability
of it belonging to each class.
"""
assert self.is_fitted, "Model must be fit before predicting"
# if we have reached the terminal node of the tree
if node.is_terminal:
return node.probas
@@ -182,14 +211,13 @@ def predict_sample(self, x, node):
return probas

def predict(self, x):
assert self.is_fitted, "Model must be fit before predicting"
pred = np.argmax(self.predict_sample(x, self.Tree))
return pred

def cost(self, X=None, y=None):
X = X if X is not None else self.dataset.X
y = y if y is not None else self.dataset.y

y_pred = np.ma.apply_along_axis(self.predict, axis=0, arr=X.T)
return accuracy_score(y, y_pred)
2 changes: 1 addition & 1 deletion src/si/supervised/ensemble.py
@@ -21,7 +21,7 @@ def average(values):
class Ensemble(Model):

def __init__(self, models, score, fvote=majority, fitted=False):
"""Model Ensemble
"""Bagging Model Ensemble
Args:
models (list[Model]): a list of models.
13 changes: 12 additions & 1 deletion src/si/supervised/knn.py
@@ -7,15 +7,24 @@
"""k-nearest neighbors module"""
# ---------------------------------------------------------------------------
from .model import Model
from si.util import l2_distance, accuracy_score
import numpy as np


class KNN(Model):
def __init__(self, num_neighbors: int, classification: bool = True):
"""
k-nearest neighbors algorithm.
“Tell me with whom you associate, and I will tell you who you are.”
― Johann Wolfgang von Goethe
KNN is based on the notion that close data points are more likely to share
a common label.
:param (int) num_neighbors: Number of closest neighbors to consider in the inference.
:param (bool) classification: Whether the task is classification (True)
    or regression (False). Defaults to classification (True).
"""
super().__init__()  # super(KNN).__init__() would not invoke Model.__init__
@@ -36,8 +45,10 @@ def predict(self, x):
neighbors = self.get_neighbors(x)
values = self.dataset.y[neighbors].tolist()
if self.classification:
# for classification, the prediction is the modal (most frequent) label among the neighbors.
prediction = max(set(values), key=values.count)
else:
# for regression, the prediction is the average of the k neighbors' labels.
prediction = sum(values)/len(values)
return prediction
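
# Example (illustrative): if the 3 nearest neighbors have labels [1, 1, 2],
# classification predicts 1 (the mode) and regression predicts 4/3.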

9 changes: 4 additions & 5 deletions src/si/supervised/linreg.py
@@ -13,12 +13,12 @@

class LinearRegression(Model):

def __init__(self, epochs=1000, lr=0.001, lbd=1, gd=False):
""" Linear regression model.
:param int epochs: Number of epochs for GD.
:param float lr: Learning rate for GD.
:param float lbd: lambda for the regularization.
    If non-positive, L2 regularization is not applied.
:param bool gd: If True, uses gradient descent (GD) to train the model;
    otherwise uses the closed-form solution. Default False.
@@ -27,7 +27,7 @@ def __init__(self, epochs=1000, lr=0.001, ldb=1, gd=False):
self.theta = None
self.epochs = epochs
self.lr = lr
self.lbd = lbd
self.gd = gd
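
# For reference (standard ridge regression, not specific to this code):
# with L2 regularization the closed-form solution is
#   theta = (X^T X + lbd * I)^{-1} X^T y
# while gd=True instead updates theta iteratively along the MSE gradient.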

def fit(self, dataset):
@@ -155,8 +155,7 @@ def predict(self, x):
return np.dot(self.theta, _x)

def cost(self, X=None, y=None, theta=None):
""" Uses MSE as cost function J
"""
""" Uses MSE as cost function J"""
# uses the trained dataset and weights if not provided.
X = add_intersect(X) if X is not None else self.X
y = y if y is not None else self.y
2 changes: 1 addition & 1 deletion src/si/supervised/logreg.py
@@ -7,7 +7,7 @@
"""Logistic regression module"""
# ---------------------------------------------------------------------------
from .model import Model
from si.util import sigmoid, add_intersect
import numpy as np


1 change: 1 addition & 0 deletions src/si/supervised/nn/__init__.py
@@ -1,3 +1,4 @@
from .nn import NN, Dense, Flatten, Dropout
from .activation import *
from .cnn import *
from .optimizers import *
5 changes: 4 additions & 1 deletion src/si/supervised/nn/activation.py
@@ -35,14 +35,17 @@ def __call__(self, z):
z = z.reshape(1, -1)
return self.fn(z)

def initialize(self, optimizer):
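# activation layers have no learnable parameters, so there is nothing to initialize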
pass

def forward(self, input_data):
self.input = input_data

# apply the activation function to the input
self.output = self(self.input)
return self.output

def backward(self, output_error):
    # There are no learnable parameters, so no learning rate is needed;
    # the error is simply propagated to the previous layer.
    return np.multiply(self.prime(self.input), output_error)
