Skip to content

Commit

Permalink
Changed name of module
Browse files Browse the repository at this point in the history
Signed-off-by: Yash Kalathiya <[email protected]>
  • Loading branch information
ykalathiya committed Jun 25, 2024
1 parent eb1b991 commit edab4ba
Show file tree
Hide file tree
Showing 50 changed files with 1,571 additions and 0 deletions.
46 changes: 46 additions & 0 deletions transforms/code/legal_removal/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Top-level Makefile for the legal_removal transform.
#
# Every target below is a thin forwarder: it re-invokes make with RULE set to
# its own name ($@) and asks for the `.recurse` target, which is not defined in
# this file (presumably it comes from the included .make.defaults and runs
# $(RULE) in each sub-directory -- confirm there).
#
# The `@# Help:` lines are kept verbatim; L-comment above says `make help`
# lists the available rules, so they are presumably harvested for that output.
#
# NOTE: recipe lines MUST begin with a hard TAB.
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/.make.defaults

setup::
	@# Help: Recursively make $@ all subdirs
	$(MAKE) RULE=$@ .recurse

clean::
	@# Help: Recursively make $@ all subdirs
	$(MAKE) RULE=$@ .recurse

build::
	@# Help: Recursively make $@ in subdirs
	$(MAKE) RULE=$@ .recurse

venv::
	@# Help: Recursively make $@ in subdirs
	$(MAKE) RULE=$@ .recurse

image::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

set-versions:
	@# Help: Recursively $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

publish::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test-image::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test-src::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

load-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse
1 change: 1 addition & 0 deletions transforms/code/legal_removal/python/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
venv/
38 changes: 38 additions & 0 deletions transforms/code/legal_removal/python/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
test-data/output
output/*
/output/
data-processing-lib/


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class


# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
htmlcov
.coverage
.cache
nosetests.xml
coverage.xml
45 changes: 45 additions & 0 deletions transforms/code/legal_removal/python/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Image for the pure-Python legal_removal transform.
FROM docker.io/python:3.11.9-slim-bullseye

RUN pip install --upgrade pip

# Install system dependencies, including libgomp1
RUN apt-get update && apt-get install -y \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/
RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

# END OF STEPS destined for a data-prep-kit base image

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

# copy source data
# COPY creates files owned by root unless --chown is given, even after USER;
# keep ownership consistent with the COPY lines above so the dpk user owns
# everything under its home directory.
COPY --chown=dpk:root ./src/legal_removal_transform.py .
COPY --chown=dpk:root ./src/legal_removal_transform_python.py .
COPY --chown=dpk:root ./src/legal_removal_local.py local/

# copy test
COPY --chown=dpk:root test/ test/
COPY --chown=dpk:root test-data/ test-data/

# Set environment
# Use the key=value form; the space-separated `ENV key value` form is legacy syntax.
ENV PYTHONPATH=/home/dpk

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
55 changes: 55 additions & 0 deletions transforms/code/legal_removal/python/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Define the root of the local git clone for the common rules to be able
# know where they are running from.
REPOROOT=../../../..
# Include a library of common .transform.* targets which most
# transforms should be able to reuse.  However, feel free
# to override/redefine the rules below.

# $(REPOROOT)/.make.versions file contains the versions

TRANSFORM_NAME=legal_removal
DOCKER_IMAGE_VERSION=${LEGAL_REMOVAL_VERSION}

include $(REPOROOT)/transforms/.make.transforms

# Each target below delegates to a shared rule that is not defined in this
# file (presumably provided by the include above).

venv::	.transforms.python-venv

test::	.transforms.python-test

clean:: .transforms.clean

image:: .transforms.python-image

test-src:: .transforms.test-src

setup:: .transforms.setup

build:: build-dist image

publish:: publish-dist publish-image

publish-image:: .transforms.publish-image-python

# distribution versions is the same as image version.
# Both values are therefore taken from DOCKER_IMAGE_VERSION (defined above from
# LEGAL_REMOVAL_VERSION) rather than from another transform's version variable.
set-versions:
	$(MAKE) TRANSFORM_PYTHON_VERSION=$(DOCKER_IMAGE_VERSION) TOML_VERSION=$(DOCKER_IMAGE_VERSION) .transforms.set-versions

build-dist:: set-versions .defaults.build-dist

publish-dist:: .defaults.publish-dist

test-image:: .transforms.python-test-image

run-cli-sample: .transforms.run-cli-python-sample

run-local-sample: .transforms.run-local-sample

run-local-python-sample: .transforms.run-local-python-sample

#run-s3-ray-sample: .transforms.run-s3-ray-sample

minio-start:	.minio-start

load-image:: .transforms.load-image
51 changes: 51 additions & 0 deletions transforms/code/legal_removal/python/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# License and Copyright Removal
Please see the set of
[transform project conventions](../../../README.md)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Summary

This module is designed to detect and remove license and copyright information from code files. It leverages the [ScanCode Toolkit](https://pypi.org/project/scancode-toolkit/) to accurately identify and process licenses and copyrights in various programming languages.

Once the positions of the license and copyright text have been detected, that text is removed and the remaining lines (those that contain no license or copyright information) are written back to the same column.

## Configuration and command line Options

The set of dictionary keys holding configuration for values are as follows:

* --legal_removal_contents_column_name - specifies the column name which holds code content. By default the value is 'contents'.
* --legal_removal_license - specifies the bool value for removing license or not. Default value is True.
* --legal_removal_copyright - specifies the bool value for removing copyright or not. Default value is True.

## Running
You can run the [legal_removal_local.py](src/legal_removal_local.py) (python-only implementation) or [legal_removal_local_ray.py](ray/src/legal_removal_local_ray.py) (ray-based implementation) to transform the `test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both the new annotated `test1.parquet` file and the `metadata.json` file.

## Command line options and samples

### Launched Command Line Options
When running the transform with the Ray launcher (i.e. TransformLauncher),
the following command line arguments are available in addition to
the options provided by the [ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md)
and the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md).

### Running the samples
To run the samples, use the following `make` targets

* `run-cli-sample` - runs src/legal_removal_transform_python.py using command line args
* `run-local-python-sample` - runs src/legal_removal_local_python.py
* `run-local-sample` - runs src/legal_removal_local.py

These targets will activate the virtual environment and set up any configuration needed.
Use the `-n` option of `make` to see the detail of what is done to run the sample.

For example,
```shell
make run-cli-sample
...
```
Then
```shell
ls output
```
To see results of the transform.
45 changes: 45 additions & 0 deletions transforms/code/legal_removal/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
[project]
name = "dpk_legal_removal_transform_python"
version = "0.2.0.dev6"
requires-python = ">=3.10"
description = "License and Copyright Removal Transform for Python"
license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Yash kalathiya", email = "[email protected]" },
]
dependencies = [
"data-prep-toolkit==0.2.0.dev6",
"scancode-toolkit==32.1.0",
]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[project.optional-dependencies]
dev = [
"twine",
"pytest>=7.3.2",
"pytest-dotenv>=0.5.2",
"pytest-env>=1.0.0",
"pre-commit>=3.3.2",
"pytest-cov>=4.1.0",
"pytest-mock>=3.10.0",
"moto==5.0.5",
"markupsafe==2.0.1",
]

# NOTE(review): `[options]` and `[options.packages.find]` are setup.cfg-style
# section names. setuptools documents its pyproject.toml settings under
# `[tool.setuptools...]`, so these two tables are presumably ignored by the
# build backend -- confirm before relying on them for package discovery.
[options]
package_dir = ["src","test"]

[options.packages.find]
where = ["src/"]

[tool.pytest.ini_options]
# Currently we use low coverage since we have to run tests separately (see makefile)
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
markers = ["unit: unit tests", "integration: integration tests"]

[tool.coverage.run]
include = ["src/*"]
52 changes: 52 additions & 0 deletions transforms/code/legal_removal/python/src/legal_removal_local.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import os

from data_processing.data_access import DataAccessLocal
from legal_removal_transform import (
LegalRemovalTransform,
COLUMN_KEY,
LICENSE_KEY,
COPYRIGHT_KEY,
)
import pyarrow.parquet as pq


# Sample input and output locations, resolved relative to this script's directory.
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output"))
local_conf = dict(input_folder=input_folder, output_folder=output_folder)

# Configuration handed to the transform: the column to process and the two
# removal switches, keyed by the constants exported by legal_removal_transform.
legal_removal_params = {}
legal_removal_params[COLUMN_KEY] = "contents"
legal_removal_params[COPYRIGHT_KEY] = "true"
legal_removal_params[LICENSE_KEY] = "true"

if __name__ == "__main__":
    # Run the transform directly in this process, without a runtime launcher.
    data_access = DataAccessLocal(local_conf)
    transform = LegalRemovalTransform(legal_removal_params)

    # Read the sample parquet file; get_table returns a pair whose first item is the table.
    sample_path = os.path.join(input_folder, "test1.parquet")
    table, _ = data_access.get_table(sample_path)
    print(f"input table has {table.num_rows} rows")

    # Apply the transform and report on the first table it returns.
    table_list, metadata = transform.transform(table)
    first_output = table_list[0]

    print(f"\noutput table has {first_output.num_rows} rows")
    print(f"Output metadata : {metadata}")
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import os
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from legal_removal_transform import (
LegalRemovalTransformConfiguration,
column_cli_params,
license_cli_params,
copyright_cli_params,
)


# Sample input and output locations, resolved relative to this script's directory.
_sample_root = os.path.join(os.path.dirname(__file__), "..")
input_folder = os.path.abspath(os.path.join(_sample_root, "test-data", "input"))
output_folder = os.path.abspath(os.path.join(_sample_root, "output"))
local_conf = dict(input_folder=input_folder, output_folder=output_folder)

# Placeholder code-location record passed to the runtime.
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}

# Transform-specific settings, keyed by the CLI parameter names exported by
# legal_removal_transform.
legal_removal_params = {}
legal_removal_params[column_cli_params] = "contents"
legal_removal_params[license_cli_params] = "false"
legal_removal_params[copyright_cli_params] = "true"

# Launcher settings: data access configuration plus execution info.
params = {}
params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
params["runtime_pipeline_id"] = "pipeline_id"
params["runtime_job_id"] = "job_id"
params["runtime_code_location"] = ParamsUtils.convert_to_ast(code_location)

if __name__ == "__main__":
    # Simulate the command line: launcher settings first, transform settings merged on top.
    simulated_args = {**params, **legal_removal_params}
    sys.argv = ParamsUtils.dict_to_req(d=simulated_args)
    # Build the pure-Python launcher for this transform's configuration and run it.
    launcher = PythonTransformLauncher(LegalRemovalTransformConfiguration())
    launcher.launch()
Loading

0 comments on commit edab4ba

Please sign in to comment.