I have code like the following, where I want to minimize a cost function with respect to the parameters w.
However, when I run it, it is very slow (roughly 30 times slower) compared to the same optimization implemented without TensorFlow, i.e. by explicitly defining a function that returns the gradient of the cost.
Am I doing something wrong in the example code below? (Maybe I am unnecessarily re-building the gradient graph on every call? See the sketch after the TensorFlow code for what I mean.)
I am using Python 3 and TensorFlow 2.0.0.
In the code below, I use a simple dummy cost function just as an example, to show the large difference in runtime.
Code with TensorFlow:
import numpy as np
import tensorflow as tf
import time
class ExampleTF:
    def __init__(self, n=100, m=10):
        Z = np.random.randn(n, m)
        self.Z = tf.convert_to_tensor(Z, dtype=tf.float32)
        self.w = tf.Variable(np.ones((m, 1)), dtype=tf.float32)
    # =====================================
    def cost(self, P):
        # This is a simple dummy cost function just as an example
        return tf.reduce_sum((self.Z @ self.w) - P)
    # =====================================
    def optimize_w(self, cost_func, parameters, lr=0.01, iterations=2000):
        optimizer = tf.optimizers.Adam(lr)
        for _ in range(iterations):
            optimizer.minimize(cost_func, var_list=parameters)
    # =====================================
    def update(self, P):
        P = tf.convert_to_tensor(P, dtype=tf.float32)
        self.optimize_w(
            cost_func = lambda: self.cost(P),
            parameters = [self.w]
        )
        #print("===> cost:", self.cost(P).numpy())
        #print("w:", self.w.numpy().reshape(-1)[:10])
# =====================================
n, m = 10000, 100
ex_tf = ExampleTF(n, m)
for _ in range(50):
    P = np.random.uniform(size=n).reshape((-1, 1))
    start = time.time()
    ex_tf.update(P)
    elapsed = time.time() - start
    print("elapsed time:", elapsed)
Code without TensorFlow (just NumPy):
import numpy as np
import time
class ExampleNonTF:
    def __init__(self, n=100, m=10):
        self.Z = np.random.randn(n, m)
        self.w = np.ones((m, 1))
    # =====================================
    def cost(self, P):
        # This is a simple dummy cost function just as an example
        return np.sum(self.Z @ self.w - P)
    # =====================================
    def gradient_cost(self, P):
        # This is the gradient of the dummy cost function with respect to self.w
        return np.sum(self.Z, axis=0).reshape(self.w.shape)
    # =====================================
    def optimize_w(self, P, lr=0.01, iterations=2000): # This is the ADAM optimizer
        avg_grad1 = 0; avg_grad2 = 0
        beta1 = 0.9; beta2 = 0.999; eps = 1e-07
        for itr in range(iterations):
            grad = self.gradient_cost(P)
            avg_grad1 = beta1 * avg_grad1 + (1 - beta1) * grad
            avg_grad2 = (beta2 * avg_grad2 + (1 - beta2) * (grad ** 2))
            avg_grad1_corr = avg_grad1 / (1 - beta1 ** (itr + 1))
            avg_grad2_corr = avg_grad2 / (1 - beta2 ** (itr + 1))
            self.w = self.w - lr * (avg_grad1_corr / (np.sqrt(avg_grad2_corr) + eps))
    # =====================================
    def update(self, P):
        self.optimize_w(P)
        #print("===> cost:", self.cost(P))
        #print("w:", self.w.reshape(-1)[:10])
# =====================================
n, m = 10000, 100
ex_nontf = ExampleNonTF(n, m)
for _ in range(50):
    P = np.random.uniform(size=n).reshape((-1, 1))
    start = time.time()
    ex_nontf.update(P)
    elapsed = time.time() - start
    print("elapsed time:", elapsed)