hardcoded values removed from basic_cleaning; readme updated
VineetKT committed Jul 18, 2021
1 parent bfd25f0 commit e424f62
Showing 8 changed files with 32 additions and 622 deletions.
566 changes: 16 additions & 550 deletions README.md

Large diffs are not rendered by default.

17 changes: 0 additions & 17 deletions SUBMISSION.md

This file was deleted.

3 changes: 1 addition & 2 deletions components/test_regression_model/run.py
@@ -12,8 +12,7 @@
 from wandb_utils.log_artifact import log_artifact
 
 
-logging.basicConfig(filename='/Users/vineetkumar/Documents/udacity_ml_devops/project 2/nd0821-c2-build-model-workflow-starter/logs/test_model.log',
-                    level=logging.INFO,
+logging.basicConfig(level=logging.INFO,
                     format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
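Dropping the hardcoded filename argument routes log output to the console, so the step no longer depends on one machine's directory layout. If a log file is still wanted, a portable variant (a sketch, not part of this commit) could resolve the path relative to the script itself:

# Sketch of a portable file-logging alternative: resolve the log path
# relative to this file instead of hardcoding an absolute path.
import logging
import os

LOG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "logs")
os.makedirs(LOG_DIR, exist_ok=True)  # create logs/ next to the script if missing

logging.basicConfig(
    filename=os.path.join(LOG_DIR, "test_model.log"),
    level=logging.INFO,
    format="%(asctime)-15s %(message)s",
)
logger = logging.getLogger()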
4 changes: 4 additions & 0 deletions config.yaml
@@ -9,6 +9,10 @@ etl:
   sample: "sample1.csv"
   min_price: 10 # dollars
   max_price: 350 # dollars
+  min_longitude: -74.25
+  max_longitude: -73.50
+  min_latitude: 40.5
+  max_latitude: 41.2
 data_check:
   kl_threshold: 0.2
 modeling:
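The four new keys sit under etl: alongside the existing price bounds, so they surface through the same Hydra config object that main.py already reads. A minimal sketch of accessing them, assuming config.yaml is loaded from the working directory:

# Sketch: the new keys are plain floats under etl:, exposed like the
# existing price bounds once the config is loaded.
from omegaconf import OmegaConf

config = OmegaConf.load("config.yaml")
print(config.etl.min_longitude, config.etl.max_longitude)  # -74.25 -73.5
print(config.etl.min_latitude, config.etl.max_latitude)    # 40.5 41.2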
21 changes: 5 additions & 16 deletions main.py
@@ -49,9 +49,7 @@ def go(config: DictConfig):
         )
 
     if "basic_cleaning" in active_steps:
-        ##################
-        # Implement here #
-        ##################
+        # performing the basic data cleaning and preprocessing steps
         _ = mlflow.run(
             uri=os.path.join(hydra.utils.get_original_cwd(),
                              'src',
@@ -68,9 +66,7 @@ def go(config: DictConfig):
         )
 
     if "data_check" in active_steps:
-        ##################
-        # Implement here #
-        ##################
+        # performing the data validation checks
         _ = mlflow.run(
             uri=os.path.join(hydra.utils.get_original_cwd(),
                              'src',
@@ -86,9 +82,7 @@ def go(config: DictConfig):
         )
 
     if "data_split" in active_steps:
-        ##################
-        # Implement here #
-        ##################
+        # splitting the data into trainval and test sets
         _ = mlflow.run(
             uri=f"{config['main']['components_repository']}/train_val_test_split",
             entry_point='main',
@@ -111,9 +105,7 @@ def go(config: DictConfig):
         )
 
         # NOTE: use the rf_config we just created as the rf_config parameter for the train_random_forest
-        ##################
-        # Implement here #
-        ##################
+        # training the random forest regressor model
         _ = mlflow.run(
             uri=os.path.join(hydra.utils.get_original_cwd(),
                              'src',
@@ -131,10 +123,7 @@ def go(config: DictConfig):
         )
 
     if "test_regression_model" in active_steps:
-
-        ##################
-        # Implement here #
-        ##################
+        # test and evaluate the model accuracy on the test set
         _ = mlflow.run(
             uri=f"{config['main']['components_repository']}/test_regression_model",
             entry_point='main',
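The parameters dict of each mlflow.run call is collapsed in this view. Since basic_cleaning now reads the boundaries from its arguments, the invocation presumably forwards the new etl values alongside the existing price bounds; a hypothetical reconstruction (artifact names are assumed, not shown in this diff):

# Hypothetical reconstruction of the collapsed basic_cleaning invocation
# inside go(config); parameter values other than the etl keys are assumed.
import os
import hydra
import mlflow

_ = mlflow.run(
    uri=os.path.join(hydra.utils.get_original_cwd(), 'src', 'basic_cleaning'),
    entry_point='main',
    parameters={
        'input_artifact': 'sample.csv:latest',   # assumed artifact name
        'output_artifact': 'clean_sample.csv',
        'output_type': 'clean_sample',
        'output_description': 'Data with outliers and null values removed',
        'min_price': config['etl']['min_price'],
        'max_price': config['etl']['max_price'],
        'min_longitude': config['etl']['min_longitude'],
        'max_longitude': config['etl']['max_longitude'],
        'min_latitude': config['etl']['min_latitude'],
        'max_latitude': config['etl']['max_latitude'],
    },
)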
11 changes: 4 additions & 7 deletions src/basic_cleaning/run.py
@@ -8,8 +8,7 @@
 import pandas as pd
 import wandb
 
-logging.basicConfig(filename='/Users/vineetkumar/Documents/udacity_ml_devops/project 2/nd0821-c2-build-model-workflow-starter/logs/basic_clean.log',
-                    level=logging.INFO,
+logging.basicConfig(level=logging.INFO,
                     format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
 
@@ -24,18 +23,16 @@ def go(args):
     artifact_local_path = run.use_artifact(args.input_artifact).file()
     logger.info('Input artifact received')
 
-    ######################
-    # YOUR CODE HERE     #
-    ######################
+    # read the input csv artifact
     df = pd.read_csv(artifact_local_path)
 
     # filter outliers in 'price' column
     idx = df['price'].between(args.min_price, args.max_price)
     df = df[idx].copy()
 
     # filter outliers in 'longitude' and 'latitude' columns
-    idx = df['longitude'].between(-74.25, -73.50) & \
-        df['latitude'].between(40.5, 41.2)
+    idx = df['longitude'].between(args.min_longitude, args.max_longitude) & \
+        df['latitude'].between(args.min_latitude, args.max_latitude)
     df = df[idx].copy()
 
     # convert last_review column type from str to datetime
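For args.min_longitude and friends to resolve, the step's argument parser must declare matching inputs; that part of run.py is collapsed above. A sketch of what the additions presumably look like, mirroring the existing price arguments:

# Hypothetical sketch of the argparse additions that make
# args.min_longitude etc. resolve (this section is not visible in the diff).
import argparse

parser = argparse.ArgumentParser(description="Basic cleaning of the dataset")
parser.add_argument("--min_longitude", type=float, required=True,
                    help="Minimum longitude; rows further west are dropped")
parser.add_argument("--max_longitude", type=float, required=True,
                    help="Maximum longitude; rows further east are dropped")
parser.add_argument("--min_latitude", type=float, required=True,
                    help="Minimum latitude; rows further south are dropped")
parser.add_argument("--max_latitude", type=float, required=True,
                    help="Maximum latitude; rows further north are dropped")
args = parser.parse_args()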
7 changes: 1 addition & 6 deletions src/data_check/test_data.py
@@ -4,8 +4,7 @@
 import pandas as pd
 import scipy.stats
 
-logging.basicConfig(filename='/Users/vineetkumar/Documents/udacity_ml_devops/project 2/nd0821-c2-build-model-workflow-starter/logs/data_check.log',
-                    level=logging.INFO,
+logging.basicConfig(level=logging.INFO,
                     format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
 
@@ -73,10 +72,6 @@ def test_similar_neigh_distrib(data: pd.DataFrame, ref_data: pd.DataFrame, kl_threshold: float):
     assert scipy.stats.entropy(dist1, dist2, base=2) < kl_threshold
 
 
-########################################################
-# Implement here test_row_count and test_price_range   #
-########################################################
-
 def test_row_count(data):
     """To validate if the data has reasonable size."""
 
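The bodies of test_row_count and its companion test_price_range are collapsed here. A sketch of typical implementations, assuming pytest fixtures named data, min_price, and max_price, with illustrative row-count bounds:

# Sketch of the collapsed test bodies; the fixtures and the exact
# row-count bounds are assumptions, only the names above are visible.
def test_row_count(data):
    """To validate if the data has reasonable size."""
    assert 15000 < data.shape[0] < 1000000  # neither truncated nor duplicated


def test_price_range(data, min_price, max_price):
    """To validate that every price falls inside the configured bounds."""
    assert data['price'].between(min_price, max_price).all()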
25 changes: 1 addition & 24 deletions src/train_random_forest/run.py
@@ -23,8 +23,7 @@
 from sklearn.preprocessing import (FunctionTransformer, OneHotEncoder,
                                    OrdinalEncoder)
 
-logging.basicConfig(filename='/Users/vineetkumar/Documents/udacity_ml_devops/project 2/nd0821-c2-build-model-workflow-starter/logs/trainer.log',
-                    level=logging.INFO,
+logging.basicConfig(level=logging.INFO,
                     format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
 
@@ -51,12 +50,9 @@ def go(args):
     # Fix the random seed for the Random Forest, so we get reproducible results
     rf_config['random_state'] = args.random_seed
 
-    ######################################
     # Use run.use_artifact(...).file() to get the train and validation artifact (args.trainval_artifact)
     # and save the returned path in trainval_local_path
-    # YOUR CODE HERE
     trainval_local_path = run.use_artifact(args.trainval_artifact).file()
-    ######################################
 
     X = pd.read_csv(trainval_local_path)
     # this removes the column "price" from X and puts it into y
@@ -77,10 +73,7 @@ def go(args):
     # Then fit it to the X_train, y_train data
     logger.info("Fitting")
 
-    ######################################
     # Fit the pipeline sk_pipe by calling the .fit method on X_train and y_train
-    # YOUR CODE HERE
-    ######################################
     sk_pipe.fit(X_train, y_train)
 
     # Compute r2 and MAE
@@ -99,21 +92,10 @@ def go(args):
     if os.path.exists("random_forest_dir"):
         shutil.rmtree("random_forest_dir")
 
-    ######################################
     # Save the sk_pipe pipeline as a mlflow.sklearn model in the directory "random_forest_dir"
-    # HINT: use mlflow.sklearn.save_model
-    # YOUR CODE HERE
-    ######################################
     mlflow.sklearn.save_model(sk_pipe, "random_forest_dir")
 
-    ######################################
     # Upload the model we just exported to W&B
-    # HINT: use wandb.Artifact to create an artifact. Use args.output_artifact as artifact name, "model_export" as
-    # type, provide a description and add rf_config as metadata. Then, use the .add_dir method of the artifact instance
-    # you just created to add the "random_forest_dir" directory to the artifact, and finally use
-    # run.log_artifact to log the artifact to the run
-    # YOUR CODE HERE
-    ######################################
     model_artifact = wandb.Artifact(
         name=args.output_artifact,
         type="model_export",
@@ -130,7 +112,6 @@ def go(args):
     # Here we save r_squared under the "r2" key
     run.summary['r2'] = r_squared
     # Now log the variable "mae" under the key "mae".
-    # YOUR CODE HERE
     run.summary['mae'] = mae
     ######################################
 
@@ -175,7 +156,6 @@ def get_inference_pipeline(rf_config, max_tfidf_features):
     # Build a pipeline with two steps:
     # 1 - A SimpleImputer(strategy="most_frequent") to impute missing values
     # 2 - A OneHotEncoder() step to encode the variable
-    # YOUR CODE HERE
     non_ordinal_categorical_preproc = make_pipeline(
         SimpleImputer(strategy='most_frequent'),
         OneHotEncoder()
@@ -236,12 +216,9 @@ def get_inference_pipeline(rf_config, max_tfidf_features):
     # Create random forest
     random_forest = RandomForestRegressor(**rf_config)
 
-    ######################################
     # Create the inference pipeline. The pipeline must have 2 steps: a step called "preprocessor" applying the
     # ColumnTransformer instance that we saved in the `preprocessor` variable, and a step called "random_forest"
     # with the random forest instance that we just saved in the `random_forest` variable.
-    # HINT: Use the explicit Pipeline constructor so you can assign the names to the steps, do not use make_pipeline
-    # YOUR CODE HERE
     sk_pipe = Pipeline(
         steps=[
             ('preprocessor', preprocessor),
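The upload block is cut off after type="model_export", but the removed hint spells out the rest: provide a description, attach rf_config as metadata, add the exported directory with .add_dir, and log the artifact to the run. The collapsed lines presumably read roughly:

# Sketch of the collapsed remainder of the upload block, following the
# removed hint; the description string is an assumption.
model_artifact = wandb.Artifact(
    name=args.output_artifact,
    type="model_export",
    description="Trained random forest pipeline exported with mlflow.sklearn",
    metadata=rf_config,
)
model_artifact.add_dir("random_forest_dir")  # attach the exported model directory
run.log_artifact(model_artifact)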
