I'm building a custom U-Net for a semantic segmentation problem, but I'm seeing weird behavior in the way the loss and the metric are calculated during training, with very significant differences between them.
Update: a minimal reproducible example is at the bottom.
I've read this one (1), this one (2), another one (3) and yet another one (4), but haven't found a suitable answer.
When training the model I'm using the same function for the loss and for the metric, and the results vary wildly.
First example with categorical_crossentropy (I'm using a very small toy set just to show it):
from tensorflow.python.keras import losses

model.compile(optimizer='adam', loss=losses.categorical_crossentropy,
              metrics=[losses.categorical_crossentropy])
And the output I get is:
4/4 [===] - 3s 677ms/step - loss: 4.1023 - categorical_crossentropy: 1.0256
- val_loss: 1.3864 - val_categorical_crossentropy: 1.3864
As you can see, the loss is about 4x the value of the same function reported as categorical_crossentropy.
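For reference, this is the kind of cross-check I can run right after fit() (a minimal sketch; data_x / data_y are the same arrays I pass to fit further down). evaluate() computes the loss and the metric in a single pass over the same data, so both numbers can be compared outside the progress-bar output:

# sketch of a post-training cross-check (assumes data_x / data_y from the fit call below)
results = model.evaluate(x=data_x, y=data_y, batch_size=batch_size, verbose=0)
print(dict(zip(model.metrics_names, results)))
# e.g. {'loss': ..., 'categorical_crossentropy': ...}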
If I use a custom metric, the discrepancy is just as large:
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.losses import categorical_crossentropy

def dice_cross_loss(y_true, y_pred, epsilon=1e-6, smooth=1):
    # per-pixel cross-entropy
    ce_loss = categorical_crossentropy(y_true, y_pred)
    # soft Dice coefficient over the flattened batch
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    dice_coef = (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + epsilon)
    return ce_loss - K.log(dice_coef + epsilon)

model.compile(optimizer='adam', loss=dice_cross_loss,
              metrics=[dice_cross_loss])
When I run it, it's even worse:
4/4 [===] - 3s 682ms/step - loss: 20.9706 - dice_cross_loss: 5.2428
- val_loss: 4.3681 - val_dice_cross_loss: 4.3681
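To rule out the loss function itself, here is a standalone check I can do with hand-made tensors (a minimal sketch; the values are arbitrary and only meant to show that one call returns one well-defined number):

import numpy as np
from tensorflow.python.keras import backend as K

# one 2x2 "image", 2 classes, one-hot targets and made-up softmax outputs
y_true = K.constant(np.array([[[[1., 0.], [0., 1.]],
                               [[1., 0.], [0., 1.]]]]))
y_pred = K.constant(np.array([[[[0.8, 0.2], [0.3, 0.7]],
                               [[0.6, 0.4], [0.1, 0.9]]]]))
print(K.eval(K.mean(dice_cross_loss(y_true, y_pred))))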
When using larger examples, the difference between the loss and the same function used as a metric can be more than tenfold.
After reading (1), I removed ALL regularization layers that can behave differently between training and evaluation from the model: no dropout, no batch normalization. There is pooling, but that shouldn't be the cause of this.
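To double-check that, I can list the layer types that actually end up in the model (a quick sketch):

# quick check that no Dropout / BatchNormalization layers are present
print(sorted(set(type(layer).__name__ for layer in model.layers)))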
The fitting code is unremarkable:
model.fit(x=data_x, y=data_y, batch_size=batch_size, epochs=epochs,
          verbose=1, validation_split=0.2, shuffle=True, workers=4)
This is the code of the network:
from tensorflow.python.keras.layers import (Input, Conv2D, Conv2DTranspose, MaxPooling2D,
                                            Activation, ReLU, concatenate)
from tensorflow.python.keras.models import Model


class CustomUnet(object):

    def __init__(self, image_shape=(20, 30, 3), n_class=2, **params):
        # read parameters
        initial_filters = params.get("initial_filters", 64)
        conv_activations = params.get("conv_activations", ReLU())
        final_activation = params.get("final_activation", "softmax")
        self.name = "CustomUnet"

        input_layer = Input(shape=image_shape, name='image_input')

        # encoder
        conv1 = self.conv_block(input_layer, nfilters=initial_filters, activation=conv_activations, name="con1")
        conv1_out = MaxPooling2D(pool_size=(2, 2))(conv1)
        conv2 = self.conv_block(conv1_out, nfilters=initial_filters*2, activation=conv_activations, name="con2")
        conv2_out = MaxPooling2D(pool_size=(2, 2))(conv2)
        conv3 = self.conv_block(conv2_out, nfilters=initial_filters*4, activation=conv_activations, name="con3")
        conv3_out = MaxPooling2D(pool_size=(2, 2))(conv3)
        conv4 = self.conv_block(conv3_out, nfilters=initial_filters*8, activation=conv_activations, name="con4")

        # decoder; numbering jumps from 4 to 7 because there used to be an extra layer
        # and I haven't refactored the names yet
        deconv7 = self.deconv_block(conv4, residual=conv3, nfilters=initial_filters*4, name="decon7",
                                    conv_activations=conv_activations)
        deconv8 = self.deconv_block(deconv7, residual=conv2, nfilters=initial_filters*2, name="decon8",
                                    conv_activations=conv_activations)
        deconv9 = self.deconv_block(deconv8, residual=conv1, nfilters=initial_filters, name="decon9",
                                    conv_activations=conv_activations)

        output_layer = Conv2D(filters=n_class, kernel_size=(1, 1))(deconv9)
        model = Model(inputs=input_layer, outputs=output_layer, name='Unet')
        self.model = model

    def conv_block(self, input_layer, nfilters, size=3, padding='same', initializer="he_normal", name="none",
                   activation=ReLU()):
        # two 3x3 convolutions, each followed by the configured activation
        x = Conv2D(filters=nfilters, kernel_size=(size, size), padding=padding, kernel_initializer=initializer)(input_layer)
        x = Activation(activation)(x)
        x = Conv2D(filters=nfilters, kernel_size=(size, size), padding=padding, kernel_initializer=initializer)(x)
        x = Activation(activation)(x)
        return x

    def deconv_block(self, input_layer, residual, nfilters, size=3, padding='same', strides=(2, 2), name="none",
                     conv_activations=ReLU()):
        # upsample, concatenate with the matching encoder feature map, then a conv block
        y = Conv2DTranspose(nfilters, kernel_size=(size, size), strides=strides, padding=padding)(input_layer)
        y = concatenate([y, residual])  # , axis=3)
        y = self.conv_block(y, nfilters, activation=conv_activations)
        return y
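For completeness, this is roughly how I build and compile the network (a sketch; the real image shape and parameters differ, I just picked a size divisible by 8 so the three pooling/upsampling steps line up):

# hypothetical usage; real shapes and hyperparameters are different
unet = CustomUnet(image_shape=(64, 64, 3), n_class=2, initial_filters=64)
model = unet.model
model.compile(optimizer='adam', loss=dice_cross_loss,
              metrics=[dice_cross_loss])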
Is this expected behavior? What am I not understanding about the difference in how the loss and the metric are calculated? Have I messed something up in the code?
Thanks!!
Minimal reproducible example:
from tensorflow.python.keras.layers import Input, Conv2D, Activation
from tensorflow.python.keras.models import Model
import numpy as np
input_data = np.random.rand(100, 300, 300, 3) # 300x300 images
out_data = np.random.randint(0, 2, size=(100, 300, 300, 4)) # 4 classes
def simple_model(image_shape, n_class):
input_layer = Input(shape=image_shape, name='image_input')
x = Conv2D(filters=3, kernel_size=(3, 3), padding="same", kernel_initializer="he_normal")(input_layer)
x = Activation("relu")(x)
x = Conv2D(filters=3, kernel_size=(3, 3), padding="same", kernel_initializer="he_normal")(x)
x = Activation("relu")(x)
x = Conv2D(filters=n_class, kernel_size=(1, 1))(x)
output_layer = Activation("softmax")(x)
model = Model(inputs=input_layer, outputs=output_layer, name='Sample')
return model
sample_model = simple_model(input_data[0].shape, out_data.shape[-1])
sample_model.compile(optimizer='adam', loss="categorical_crossentropy", metrics=["categorical_crossentropy"])
batch_size = 5
steps = input_data.shape[0] // batch_size
epochs = 20
history = sample_model.fit(x=input_data, y=out_data, batch_size=batch_size, epochs=epochs,  # , callbacks=callbacks,
                           verbose=1, validation_split=0.2, workers=1)
And the results I get still show the same weirdness:
80/80 [===] - 9s 108ms/step - loss: 14.0259 - categorical_crossentropy: 2.8051 - val_loss: 13.9439 - val_categorical_crossentropy: 2.7885
So loss: 14.0259 vs categorical_crossentropy: 2.8051. Now I'm lost...
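In case it helps, this is a callback sketch I can attach to take the progress-bar running averages out of the picture and print exactly what Keras reports after every single batch (standard Callback API; the metric key matches the name shown in the training log):

from tensorflow.python.keras.callbacks import Callback

class BatchPrinter(Callback):
    # print the raw per-batch values reported by Keras
    def on_batch_end(self, batch, logs=None):
        logs = logs or {}
        print(batch, logs.get('loss'), logs.get('categorical_crossentropy'))

sample_model.fit(x=input_data, y=out_data, batch_size=batch_size, epochs=1,
                 verbose=0, validation_split=0.2, callbacks=[BatchPrinter()])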