-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Yash Kalathiya <[email protected]>
- Loading branch information
1 parent
eb1b991
commit edab4ba
Showing
50 changed files
with
1,571 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
REPOROOT=../../.. | ||
# Use `make help` to see the available rules | ||
include $(REPOROOT)/.make.defaults | ||
|
||
setup:: | ||
@# Help: Recursively make $@ all subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
|
||
clean:: | ||
@# Help: Recursively make $@ all subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
|
||
build:: | ||
@# Help: Recursively make $@ in subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
venv:: | ||
@# Help: Recursively make $@ in subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
|
||
image:: | ||
@# Help: Recursively make $@ in all subdirs | ||
@$(MAKE) RULE=$@ .recurse | ||
|
||
set-versions: | ||
@# Help: Recursively $@ in all subdirs | ||
@$(MAKE) RULE=$@ .recurse | ||
|
||
publish:: | ||
@# Help: Recursively make $@ in all subdirs | ||
@$(MAKE) RULE=$@ .recurse | ||
|
||
test-image:: | ||
@# Help: Recursively make $@ in all subdirs | ||
@$(MAKE) RULE=$@ .recurse | ||
|
||
test:: | ||
@# Help: Recursively make $@ in all subdirs | ||
@$(MAKE) RULE=$@ .recurse | ||
|
||
test-src:: | ||
@# Help: Recursively make $@ in all subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
|
||
load-image:: | ||
@# Help: Recursively make $@ in all subdirs | ||
$(MAKE) RULE=$@ .recurse |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
venv/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
test-data/output | ||
output/* | ||
/output/ | ||
data-processing-lib/ | ||
|
||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
|
||
# Distribution / packaging | ||
bin/ | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
.tox/ | ||
htmlcov | ||
.coverage | ||
.cache | ||
nosetests.xml | ||
coverage.xml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
FROM docker.io/python:3.11.9-slim-bullseye | ||
|
||
RUN pip install --upgrade pip | ||
|
||
# Install system dependencies, including libgomp1 | ||
RUN apt-get update && apt-get install -y \ | ||
libgomp1 \ | ||
&& rm -rf /var/lib/apt/lists/* | ||
|
||
# install pytest | ||
RUN pip install --no-cache-dir pytest | ||
|
||
# Create a user and use it to run the transform | ||
RUN useradd -ms /bin/bash dpk | ||
USER dpk | ||
WORKDIR /home/dpk | ||
|
||
# Copy and install data processing libraries | ||
# These are expected to be placed in the docker context before this is run (see the make image). | ||
COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/ | ||
RUN cd data-processing-lib-python && pip install --no-cache-dir -e . | ||
|
||
# END OF STEPS destined for a data-prep-kit base image | ||
|
||
COPY --chown=dpk:root src/ src/ | ||
COPY --chown=dpk:root pyproject.toml pyproject.toml | ||
RUN pip install --no-cache-dir -e . | ||
|
||
# copy source data | ||
COPY ./src/legal_removal_transform.py . | ||
COPY ./src/legal_removal_transform_python.py . | ||
COPY ./src/legal_removal_local.py local/ | ||
|
||
# copy test | ||
COPY test/ test/ | ||
COPY test-data/ test-data/ | ||
|
||
# Set environment | ||
ENV PYTHONPATH /home/dpk | ||
|
||
# Put these at the end since they seem to upset the docker cache. | ||
ARG BUILD_DATE | ||
ARG GIT_COMMIT | ||
LABEL build-date=$BUILD_DATE | ||
LABEL git-commit=$GIT_COMMIT |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# Define the root of the local git clone so that the common rules | ||
# know where they are running from. | ||
REPOROOT=../../../.. | ||
# Include a library of common .transform.* targets which most | ||
# transforms should be able to reuse. However, feel free | ||
# to override/redefine the rules below. | ||
|
||
# $(REPOROOT)/.make.versions file contains the versions | ||
|
||
TRANSFORM_NAME=legal_removal | ||
DOCKER_IMAGE_VERSION=${LEGAL_REMOVAL_VERSION} | ||
|
||
include $(REPOROOT)/transforms/.make.transforms | ||
|
||
venv:: .transforms.python-venv | ||
|
||
test:: .transforms.python-test | ||
|
||
clean:: .transforms.clean | ||
|
||
image:: .transforms.python-image | ||
|
||
test-src:: .transforms.test-src | ||
|
||
setup:: .transforms.setup | ||
|
||
build:: build-dist image | ||
|
||
publish:: publish-dist publish-image | ||
|
||
publish-image:: .transforms.publish-image-python | ||
|
||
setup:: .transforms.setup | ||
|
||
# The distribution version is the same as the image version. | ||
# NOTE(review): FILTER_PYTHON_VERSION looks copied from the filter transform; confirm which version variable is intended here. | ||
set-versions: | ||
$(MAKE) TRANSFORM_PYTHON_VERSION=${FILTER_PYTHON_VERSION} TOML_VERSION=$(DOCKER_IMAGE_VERSION) .transforms.set-versions | ||
|
||
build-dist:: set-versions .defaults.build-dist | ||
|
||
publish-dist:: .defaults.publish-dist | ||
|
||
test-image:: .transforms.python-test-image | ||
|
||
run-cli-sample: .transforms.run-cli-python-sample | ||
|
||
run-local-sample: .transforms.run-local-sample | ||
|
||
run-local-python-sample: .transforms.run-local-python-sample | ||
|
||
#run-s3-ray-sample: .transforms.run-s3-ray-sample | ||
|
||
minio-start: .minio-start | ||
|
||
load-image:: .transforms.load-image |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# License and Copyright Removal | ||
Please see the set of | ||
[transform project conventions](../../../README.md) | ||
for details on general project conventions, transform configuration, | ||
testing and IDE set up. | ||
|
||
## Summary | ||
|
||
This module is designed to detect and remove license and copyright information from code files. It leverages the [ScanCode Toolkit](https://pypi.org/project/scancode-toolkit/) to accurately identify and process licenses and copyrights in various programming languages. | ||
|
||
Once the license and copyright positions have been detected, the lines that do not contain license or copyright text are written back to the same column that held the original code. | ||
|
||
## Configuration and command line Options | ||
|
||
The following command line options configure the transform: | ||
|
||
* --legal_removal_contents_column_name - specifies the column name which holds code content. By default the value is 'contents'. | ||
* --legal_removal_license - boolean flag that specifies whether license text is removed. The default value is True. | ||
* --legal_removal_copyright - boolean flag that specifies whether copyright text is removed. The default value is True. | ||
|
||
## Running | ||
You can run the [legal_removal_local.py](src/legal_removal_local.py) (python-only implementation) or [legal_removal_local_ray.py](ray/src/legal_removal_local_ray.py) (ray-based implementation) to transform the `test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both the new annotated `test1.parquet` file and the `metadata.json` file. | ||
|
||
## Running | ||
|
||
### Launched Command Line Options | ||
When running the transform with the Ray launcher (i.e. TransformLauncher), | ||
the following command line arguments are available in addition to | ||
the options provided by the [ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) | ||
and the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). | ||
|
||
### Running the samples | ||
To run the samples, use the following `make` targets | ||
|
||
* `run-cli-sample` - runs src/legal_removal_transform_python.py using command line args | ||
* `run-local-python-sample` - runs src/legal_removal_local_python.py | ||
* `run-local-sample` - runs src/legal_removal_local.py | ||
|
||
These targets will activate the virtual environment and set up any configuration needed. | ||
Use the `-n` option of `make` to see the detail of what is done to run the sample. | ||
|
||
For example, | ||
```shell | ||
make run-cli-sample | ||
... | ||
``` | ||
Then | ||
```shell | ||
ls output | ||
``` | ||
to see the results of the transform. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
[project] | ||
name = "dpk_legal_removal_transform_python" | ||
version = "0.2.0.dev6" | ||
requires-python = ">=3.10" | ||
description = "License and Copyright Removal Transform for Python" | ||
license = {text = "Apache-2.0"} | ||
readme = {file = "README.md", content-type = "text/markdown"} | ||
authors = [ | ||
{ name = "Yash kalathiya", email = "[email protected]" }, | ||
] | ||
dependencies = [ | ||
"data-prep-toolkit==0.2.0.dev6", | ||
"scancode-toolkit==32.1.0", | ||
] | ||
|
||
[build-system] | ||
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
[project.optional-dependencies] | ||
dev = [ | ||
"twine", | ||
"pytest>=7.3.2", | ||
"pytest-dotenv>=0.5.2", | ||
"pytest-env>=1.0.0", | ||
"pre-commit>=3.3.2", | ||
"pytest-cov>=4.1.0", | ||
"pytest-mock>=3.10.0", | ||
"moto==5.0.5", | ||
"markupsafe==2.0.1", | ||
] | ||
|
||
[options] | ||
package_dir = ["src","test"] | ||
|
||
[options.packages.find] | ||
where = ["src/"] | ||
|
||
[tool.pytest.ini_options] | ||
# Currently we use low coverage since we have to run tests separately (see makefile) | ||
#addopts = "--cov --cov-report term-missing --cov-fail-under 25" | ||
markers = ["unit: unit tests", "integration: integration tests"] | ||
|
||
[tool.coverage.run] | ||
include = ["src/*"] |
52 changes: 52 additions & 0 deletions
52
transforms/code/legal_removal/python/src/legal_removal_local.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# (C) Copyright IBM Corp. 2024. | ||
# Licensed under the Apache License, Version 2.0 (the “License”); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an “AS IS” BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
################################################################################ | ||
|
||
import os | ||
|
||
from data_processing.data_access import DataAccessLocal | ||
from legal_removal_transform import ( | ||
LegalRemovalTransform, | ||
COLUMN_KEY, | ||
LICENSE_KEY, | ||
COPYRIGHT_KEY, | ||
) | ||
import pyarrow.parquet as pq | ||
|
||
|
||
# Input and output folders, resolved relative to this file's directory. | ||
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) | ||
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) | ||
local_conf = { | ||
"input_folder": input_folder, | ||
"output_folder": output_folder, | ||
} | ||
|
||
# Configuration passed to the LegalRemovalTransform constructor below. | ||
legal_removal_params = { | ||
COLUMN_KEY: 'contents', | ||
COPYRIGHT_KEY: 'true', | ||
LICENSE_KEY: 'true', | ||
} | ||
|
||
if __name__ == "__main__": | ||
# Run the transform directly on a single table, without using a launcher. | ||
# Create a local DataAccess, used below to read the input parquet table. | ||
data_access = DataAccessLocal(local_conf) | ||
# Create and configure the transform. | ||
transform = LegalRemovalTransform(legal_removal_params) | ||
# Use the local data access to read a parquet table. | ||
table,_ = data_access.get_table(os.path.join(input_folder, "test1.parquet")) | ||
print(f"input table has {table.num_rows} rows") | ||
# Transform the table; the result is a list of tables plus a metadata object. | ||
table_list, metadata = transform.transform(table) | ||
|
||
print(f"\noutput table has {table_list[0].num_rows} rows") | ||
print(f"Output metadata : {metadata}") |
53 changes: 53 additions & 0 deletions
53
transforms/code/legal_removal/python/src/legal_removal_local_python.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# (C) Copyright IBM Corp. 2024. | ||
# Licensed under the Apache License, Version 2.0 (the “License”); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an “AS IS” BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
################################################################################ | ||
|
||
import os | ||
import sys | ||
|
||
from data_processing.runtime.pure_python import PythonTransformLauncher | ||
from data_processing.utils import ParamsUtils | ||
from legal_removal_transform import ( | ||
LegalRemovalTransformConfiguration, | ||
column_cli_params, | ||
license_cli_params, | ||
copyright_cli_params, | ||
) | ||
|
||
|
||
# Input and output folders, resolved relative to this file's directory. | ||
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) | ||
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) | ||
local_conf = { | ||
"input_folder": input_folder, | ||
"output_folder": output_folder, | ||
} | ||
# Placeholder code-location values, passed below as runtime_code_location. | ||
code_location = {"github": "github", "commit_hash": "12345", "path": "path"} | ||
# Transform-specific settings, keyed by the names imported from legal_removal_transform. | ||
legal_removal_params = { | ||
column_cli_params: 'contents', | ||
license_cli_params: 'false', | ||
copyright_cli_params: 'true', | ||
} | ||
params = { | ||
# Data access. Only required parameters are specified | ||
"data_local_config": ParamsUtils.convert_to_ast(local_conf), | ||
# execution info | ||
"runtime_pipeline_id": "pipeline_id", | ||
"runtime_job_id": "job_id", | ||
"runtime_code_location": ParamsUtils.convert_to_ast(code_location), | ||
} | ||
if __name__ == "__main__": | ||
# Set the simulated command line args (launcher params merged with the transform params) | ||
sys.argv = ParamsUtils.dict_to_req(d=params | legal_removal_params) | ||
# Create the pure-python launcher for this transform's configuration | ||
launcher = PythonTransformLauncher(LegalRemovalTransformConfiguration()) | ||
# Launch the python runtime to process the input | ||
launcher.launch() |
Oops, something went wrong.