# config.yaml

# Run-level settings for the pipeline.
main:
  # Git URL of the repository holding the shared pipeline components. The
  # "#components" fragment presumably selects a sub-directory of that
  # repository -- confirm against the code that consumes this value.
  components_repository: "https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter.git#components"
  # NOTE(review): the next two comment lines describe a directory-valued
  # setting, but no such key follows them in this section. Presumably a
  # leftover -- confirm, then remove them or restore the key they belong to.
  # All the intermediate files will be copied to this directory at the end of the run.
  # Set this to null if you are running in prod
  # Project and experiment names; presumably used to group runs in the
  # experiment tracker -- confirm against the consuming code.
  project_name: nyc_airbnb
  experiment_name: development
  # Which pipeline steps to execute. "all" presumably means every step;
  # the accepted alternatives are defined by the consumer -- TODO confirm.
  steps: all
# Settings for the ETL stage.
etl:
  # Name of the input data sample file.
  sample: "sample1.csv"
  # Lower and upper price bounds, in dollars. Presumably rows priced outside
  # this range are discarded during cleaning -- confirm against the ETL code.
  min_price: 10  # dollars
  max_price: 350  # dollars
# Settings for the data-validation stage.
data_check:
  # Threshold for what is, judging by the key name, a KL-divergence check
  # between datasets. Which side of the threshold counts as a failure is
  # defined by the consuming test -- TODO confirm.
  kl_threshold: 0.2
# Settings for data splitting, feature extraction and model training.
modeling:
  # Fraction of the data held out for testing (the remainder is used for
  # training and validation).
  test_size: 0.2
  # Fraction of the remaining (non-test) data to use for validation.
  val_size: 0.2
  # Seed for the random splits. Keep it fixed for reproducibility; change it
  # to obtain different splits.
  random_seed: 42
  # Column to use for stratification (use "none" for no stratification).
  stratify_by: "neighbourhood_group"
  # Maximum number of features to consider for the TF-IDF applied to the
  # listing title (the column called "name").
  max_tfidf_features: 5
  # NOTE: any parameter accepted by the constructor of RandomForestRegressor
  # can be put here. This is only a subset; more can be added.
  random_forest:
    n_estimators: 100
    max_depth: 15
    min_samples_split: 4
    min_samples_leaf: 3
    # Here -1 means all available cores.
    n_jobs: -1
    criterion: squared_error
    max_features: 0.5
    # Do not change the following.
    oob_score: true