Skip to content

Commit

Permalink
Merge pull request #38 from WenjieDu/dev
Browse files Browse the repository at this point in the history
Enable calc_missing_rate to work with pd.DataFrame
  • Loading branch information
WenjieDu committed Jul 1, 2024
2 parents 0f102c3 + 5c30c10 commit e335422
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 21 deletions.
42 changes: 32 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,17 @@ or install from source code:
```python
import numpy as np
from pygrinder import mcar, mar_logistic, mnar_x, mnar_t

from pygrinder import (
mcar,
mar_logistic,
mnar_x,
mnar_t,
rdo,
seq_missing,
block_missing,
calc_missing_rate
)

# given a time-series dataset with 128 samples, each sample with 10 time steps and 36 data features
ts_dataset = np.random.randn(128, 10, 36)
Expand All @@ -87,11 +97,29 @@ X_with_mar_data = mar_logistic(ts_dataset[:, 0, :], obs_rate=0.1, missing_rate=0

# grind the dataset with MNAR pattern
X_with_mnar_x_data = mnar_x(ts_dataset, offset=0.1)
X_with_mnar_t_data = mnar_t(ts_dataset, cycle=20, pos = 10, scale = 3)
X_with_mnar_t_data = mnar_t(ts_dataset, cycle=20, pos=10, scale=3)

# grind the dataset with RDO pattern
X_with_rdo_data = rdo(ts_dataset, p=0.1)

# grind the dataset with Sequence-Missing pattern
X_with_seq_missing_data = seq_missing(ts_dataset, p=0.1, seq_len=5)

# grind the dataset with Block-Missing pattern
X_with_block_missing_data = block_missing(ts_dataset, factor=0.1, block_width=3, block_len=3)

# calculate the missing rate of the dataset
missing_rate = calc_missing_rate(X_with_mcar_data)
```


## ❖ Citing PyGrinder/PyPOTS
<p align="center">
<a href="https://github.com/WenjieDu/PyPOTS">
<img src="https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png" width="95%"/>
</a>
</p>

The paper introducing PyPOTS is available [on arXiv](https://arxiv.org/abs/2305.18811),
and a short version of it has been accepted by the 9th SIGKDD international workshop on Mining and Learning from Time Series ([MiLeTS'23](https://kdd-milets.github.io/milets2023/)).
**Additionally**, PyPOTS has been included as a [PyTorch Ecosystem](https://pytorch.org/ecosystem/) project.
Expand All @@ -102,12 +130,6 @@ please cite it as below and 🌟star this repository to make others notice this
There are scientific research projects that use PyPOTS and reference it in their papers.
Here is [an incomplete list of them](https://scholar.google.com/scholar?as_ylo=2022&q=%E2%80%9CPyPOTS%E2%80%9D&hl=en).

<p align="center">
<a href="https://github.com/WenjieDu/PyPOTS">
<img src="https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png" width="95%"/>
</a>
</p>

``` bibtex
@article{du2023pypots,
title={{PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series}},
Expand All @@ -117,9 +139,9 @@ year={2023},
}
```
or
> Wenjie Du. (2023).
> Wenjie Du.
> PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series.
> arXiv, abs/2305.18811. https://arxiv.org/abs/2305.18811
> arXiv, abs/2305.18811, 2023.

<details>
Expand Down
2 changes: 1 addition & 1 deletion pygrinder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.6"
__version__ = "0.6.1"

from .missing_at_random import mar_logistic
from .missing_completely_at_random import mcar, mcar_little_test
Expand Down
32 changes: 30 additions & 2 deletions pygrinder/block_missing/block_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,40 @@ def block_missing(
feature_idx: list = None,
step_idx: list = None,
) -> Union[np.ndarray, torch.Tensor]:
"""Create block missing data.
Parameters
----------
X :
Data vector. If X has any missing values, they should be numpy.nan.
factor :
The actual missing rate of block_missing is hard to control strictly.
Hence, we use ``factor`` to help adjust the final missing rate.
block_len :
The length of the mask block.
block_width :
The width of the mask block.
feature_idx :
The indices of features for a missing block to start with.
step_idx :
The indices of steps for a missing block to start with.
Returns
-------
corrupted_X :
Original X with artificial missing values.
Both originally-missing and artificially-missing values are left as NaN.
"""
if isinstance(X, list):
X = np.asarray(X)
n_samples, n_steps, n_features = X.shape

# assert 0 < p <= 1, f"p must be in range (0, 1), but got {p}"

assert isinstance(
block_len, int
), f"`block_len` must be type of int, but got {type(block_len)}"
Expand Down
26 changes: 26 additions & 0 deletions pygrinder/sequential_missing/seq_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,32 @@ def seq_missing(
feature_idx: list = None,
step_idx: list = None,
) -> Union[np.ndarray, torch.Tensor]:
"""Create subsequence missing data.
Parameters
----------
X :
Data vector. If X has any missing values, they should be numpy.nan.
p :
The probability that values may be masked as missing completely at random.
seq_len :
The length of missing sequence.
feature_idx :
The indices of features for missing sequences to be corrupted.
step_idx :
The indices of steps for a missing sequence to start with.
Returns
-------
corrupted_X :
Original X with artificial missing values.
Both originally-missing and artificially-missing values are left as NaN.
"""
if isinstance(X, list):
X = np.asarray(X)
n_samples, n_steps, n_features = X.shape
Expand Down
21 changes: 13 additions & 8 deletions pygrinder/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,42 @@
from typing import Union, Tuple

import numpy as np
import pandas as pd
import torch


def calc_missing_rate(X: Union[np.ndarray, torch.Tensor]) -> float:
def calc_missing_rate(
X: Union[np.ndarray, torch.Tensor, pd.DataFrame],
) -> float:
"""Calculate the originally missing rate of the raw data.
Parameters
----------
X:
Data array that may contain missing values.
Data array/tensor/frame that may contain missing values.
Returns
-------
originally_missing_rate,
missing_rate,
The originally missing rate of the raw data. Its value should be in the range [0,1].
"""
if isinstance(X, list):
X = np.asarray(X)

if isinstance(X, np.ndarray):
originally_missing_rate = np.sum(np.isnan(X)) / np.prod(X.shape)
missing_rate = np.sum(np.isnan(X)) / np.prod(X.shape)
elif isinstance(X, torch.Tensor):
originally_missing_rate = torch.sum(torch.isnan(X)) / np.prod(X.shape)
originally_missing_rate = originally_missing_rate.item()
missing_rate = torch.sum(torch.isnan(X)) / np.prod(X.shape)
missing_rate = missing_rate.item()
elif isinstance(X, pd.DataFrame):
missing_rate = pd.isna(X).sum().sum() / np.prod(X.shape)
else:
raise TypeError(
f"X must be type of list/numpy.ndarray/torch.Tensor, but got {type(X)}"
f"X must be type of list/numpy.ndarray/torch.Tensor/pandas.DataFrame, but got {type(X)}"
)

return originally_missing_rate
return missing_rate


def masked_fill(
Expand Down

0 comments on commit e335422

Please sign in to comment.