Model Definition Summary

Use Feature Column

Feature Column Definition

import tensorflow as tf

CATEGORICAL_FEATURE_KEYS = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]
NUMERIC_FEATURE_KEYS = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]
OPTIONAL_NUMERIC_FEATURE_KEYS = [
    "education-num",
]
LABEL_KEY = "label"

def get_feature_columns():
    feature_columns = []
    for numeric_feature_key in NUMERIC_FEATURE_KEYS:
        numeric_feature = tf.feature_column.numeric_column(numeric_feature_key)
        feature_columns.append(numeric_feature)

    for categorical_feature_key in CATEGORICAL_FEATURE_KEYS:
        embedding_feature = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_hash_bucket(categorical_feature_key, hash_bucket_size=64),
            dimension=16
        )
        feature_columns.append(embedding_feature)

    return feature_columns

def get_feature_input_layers():
    feature_input_layers = {}
    for numeric_feature_key in NUMERIC_FEATURE_KEYS:
        feature_input_layers[numeric_feature_key] = tf.keras.Input(
            shape=(1,), name=numeric_feature_key, dtype=tf.float32
        )

    for categorical_feature_key in CATEGORICAL_FEATURE_KEYS:
        feature_input_layers[categorical_feature_key] = tf.keras.Input(
            shape=(1,), name=categorical_feature_key, dtype=tf.string
        )

    return feature_input_layers
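
Before wiring these columns into a model, it can help to sanity-check them on a hand-written batch. A minimal sketch (the sample values below are made up for illustration, not from the page):

sample_batch = {
    "age": tf.constant([[35.0], [28.0]]),
    "capital-gain": tf.constant([[0.0], [5178.0]]),
    "capital-loss": tf.constant([[0.0], [0.0]]),
    "hours-per-week": tf.constant([[40.0], [50.0]]),
    "workclass": tf.constant([["Private"], ["Self-emp-not-inc"]]),
    "education": tf.constant([["Bachelors"], ["HS-grad"]]),
    "marital-status": tf.constant([["Never-married"], ["Married-civ-spouse"]]),
    "occupation": tf.constant([["Tech-support"], ["Craft-repair"]]),
    "relationship": tf.constant([["Not-in-family"], ["Husband"]]),
    "race": tf.constant([["White"], ["Black"]]),
    "sex": tf.constant([["Male"], ["Female"]]),
    "native-country": tf.constant([["United-States"], ["United-States"]]),
}

dense_features = tf.keras.layers.DenseFeatures(get_feature_columns())
# 4 numeric columns + 8 embedding columns of dimension 16 -> shape (2, 132)
print(dense_features(sample_batch).shape)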

Sequential

def custom_model(feature_columns):
    model = tf.keras.Sequential([
        tf.keras.layers.DenseFeatures(feature_columns=feature_columns),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])

    return model

feature_columns = get_feature_columns()
model = custom_model(feature_columns)
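
The page doesn't show a training step; a minimal sketch, assuming binary classification on the census label and a dataset that yields ({feature_name: tensor}, label) pairs (train_dataset here is a hypothetical stand-in for the output of the project's dataset_fn):

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# train_dataset is assumed to come from dataset_fn:
# model.fit(train_dataset, epochs=10)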

Functional

def custom_model(feature_columns, feature_inputs):
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    x = feature_layer(feature_inputs)
    x = tf.keras.layers.Dense(16, activation="relu")(x)
    x = tf.keras.layers.Dense(16, activation="relu")(x)
    y = tf.keras.layers.Dense(1, activation="sigmoid")(x)

    model = tf.keras.Model(inputs=feature_inputs, outputs=y)

    return model

feature_columns = get_feature_columns()
feature_inputs = get_feature_input_layers()
model = custom_model(feature_columns, feature_inputs)

Subclass

class CustomModel(tf.keras.Model):
    def __init__(self, feature_columns):
        super(CustomModel, self).__init__(name="census_model")
        self.dense_features = tf.keras.layers.DenseFeatures(feature_columns)
        self.dense_1 = tf.keras.layers.Dense(16, activation="relu")
        self.dense_2 = tf.keras.layers.Dense(16, activation="relu")
        self.dense_3 = tf.keras.layers.Dense(1, activation="sigmoid")

    def call(self, inputs, training=False):
        x = self.dense_features(inputs)
        x = self.dense_1(x)
        x = self.dense_2(x)
        x = self.dense_3(x)

        return x

feature_columns = get_feature_columns()
model = CustomModel(feature_columns)
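
One caveat worth noting (standard Keras behavior, not specific to this page): unlike the functional variant, whose explicit Input layers define the graph up front, a subclassed model creates its weights lazily on the first call, so it must see a batch before it can be inspected. A minimal sketch, reusing the hypothetical sample_batch from the feature column section:

_ = model(sample_batch)  # first call builds the weights
model.summary()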

Wide and Deep
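
The page leaves this section empty; below is one possible sketch assuming the same census columns as above, not a final design. The wide part uses one-hot (indicator) versions of the hashed categorical columns, and the deep part reuses get_feature_columns() and the hidden layers from the earlier variants:

def get_wide_feature_columns():
    # Wide part: one-hot encodings of the hashed categorical columns.
    return [
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_hash_bucket(key, hash_bucket_size=64)
        )
        for key in CATEGORICAL_FEATURE_KEYS
    ]

def wide_and_deep_model(wide_columns, deep_columns, feature_inputs):
    wide = tf.keras.layers.DenseFeatures(wide_columns)(feature_inputs)
    deep = tf.keras.layers.DenseFeatures(deep_columns)(feature_inputs)
    deep = tf.keras.layers.Dense(16, activation="relu")(deep)
    deep = tf.keras.layers.Dense(16, activation="relu")(deep)
    both = tf.keras.layers.concatenate([wide, deep])
    y = tf.keras.layers.Dense(1, activation="sigmoid")(both)
    return tf.keras.Model(inputs=feature_inputs, outputs=y)

model = wide_and_deep_model(
    get_wide_feature_columns(), get_feature_columns(), get_feature_input_layers()
)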

Integration with SQLFlow

Open Questions

  1. Single shared embedding vs. several separate embeddings? In general, the same value appearing in different columns of the source data should be treated as different values.
    • How do we aggregate multiple categorical inputs into one embedding table? Different inputs shouldn't be mapped to the same id.
  2. For feature column input, all instances in a minibatch must have the same size. How do we handle variable-length input, such as a clicked-item id list? (See the sketch after this list.)
    • Truncate/pad in dataset_fn
    • Truncate/pad in the feature column's normalizer_fn
  3. For sparse input, should we use embedding_column or a dense embedding layer?
  4. How do we decide hash_bucket_size for a categorical input?
    • Statistically: compute the size of the categorical value set, then multiply it by a constant such as 2 or 3. For example, a column with about 100 distinct values would get a hash_bucket_size of 200 to 300.
  5. Integration with SQLFlow: how do we generate the feature column definition and dataset_fn code with code_gen?
    • TBD
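
For question 2, a minimal sketch of the dataset_fn option; "clicked_items" and MAX_LEN are hypothetical names, and the ids are assumed to be integer-typed:

MAX_LEN = 10  # hypothetical fixed list length

def truncate_or_pad(features, label):
    # Keep at most MAX_LEN ids, then pad short lists with id 0.
    ids = features["clicked_items"][:MAX_LEN]
    pad_len = MAX_LEN - tf.shape(ids)[0]
    features["clicked_items"] = tf.pad(ids, [[0, pad_len]])
    return features, label

# Inside dataset_fn:
# dataset = dataset.map(truncate_or_pad)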