def reshape(tensor, dims_list):
  """Reshapes a tensor, collapsing the given groups of dimensions."""
  shape = get_shape(tensor)
  dims_prod = []
  for dims in dims_list:
    if isinstance(dims, int):
      dims_prod.append(shape[dims])
    elif all([isinstance(shape[d], int) for d in dims]):
      dims_prod.append(np.prod([shape[d] for d in dims]))
    else:
      dims_prod.append(tf.reduce_prod([shape[d] for d in dims]))
  tensor = tf.reshape(tensor, dims_prod)
  return tensor
Then collapsing the second dimension becomes very easy:
b = tf.placeholder(tf.float32, [None, 10, 32])
b = reshape(b, [0, [1, 2]])
with tf.variable_scope("scope"): a1 = tf.get_variable(name="a", shape=[]) with tf.variable_scope("scope", reuse=True): a2 = tf.get_variable(name="a", shape=[]) # OK This becomes handy for example when using built-in neural network layers:
features1 = tf.layers.conv2d(image1, filters=32, kernel_size=3)

# Use the same convolution weights to process the second image:
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
  features2 = tf.layers.conv2d(image2, filters=32, kernel_size=3)
a = tf.random_uniform([5, 3, 5])
b = tf.random_uniform([5, 1, 6])
# Concat a and b and apply nonlinearity.
tiled_b = tf.tile(b, [1, 3, 1])
c = tf.concat([a, tiled_b], 2)
d = tf.layers.dense(c, 10, activation=tf.nn.relu)
pa = tf.layers.dense(a, 10, activation=None)
pb = tf.layers.dense(b, 10, activation=None)
d = tf.nn.relu(pa + pb)
In fact, this code is general enough to be applied to tensors of arbitrary shapes:
def merge(a, b, units, activation=tf.nn.relu):
  pa = tf.layers.dense(a, units, activation=None)
  pb = tf.layers.dense(b, units, activation=None)
  c = pa + pb
  if activation is not None:
    c = activation(c)
  return c
z = -x      # z = tf.negative(x)
z = x + y   # z = tf.add(x, y)
z = x - y   # z = tf.subtract(x, y)
z = x * y   # z = tf.multiply(x, y)
z = x / y   # z = tf.divide(x, y)
z = x // y  # z = tf.floordiv(x, y)
z = x % y   # z = tf.mod(x, y)
z = x ** y  # z = tf.pow(x, y)
z = x @ y   # z = tf.matmul(x, y)
z = x > y   # z = tf.greater(x, y)
z = x >= y  # z = tf.greater_equal(x, y)
z = x < y   # z = tf.less(x, y)
z = x <= y  # z = tf.less_equal(x, y)
z = abs(x)  # z = tf.abs(x)
z = x & y   # z = tf.logical_and(x, y)
z = x | y   # z = tf.logical_or(x, y)
z = x ^ y   # z = tf.logical_xor(x, y)
z = ~x      # z = tf.logical_not(x)
You can also use the augmented versions of these operators; for example, x += y and x **= 2 are also valid.
Note that Python does not allow overloading the and, or, and not keywords.
TensorFlow also does not allow using tensors as booleans, since this is error-prone:
x = tf.constant(1.)
if x:  # This will raise a TypeError
  ...
If you want to branch on the value of a tensor, you can use tf.cond(x, ...); or, if you only need to know whether the variable is set, use if x is None.
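Here is a minimal sketch of branching on a tensor's value with tf.cond; the placeholder and the threshold are just illustrative assumptions:

x = tf.placeholder(tf.float32, [])
# tf.cond evaluates the predicate at run time and executes only one branch.
y = tf.cond(tf.greater(x, 0.), lambda: tf.square(x), lambda: tf.zeros([]))

with tf.Session() as sess:
  print(sess.run(y, feed_dict={x: 2.}))   # 4.0
  print(sess.run(y, feed_dict={x: -1.}))  # 0.0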
import numpy as np
import tensorflow as tf
import uuid
def relu(inputs):
  # Define the op in python
  def _relu(x):
    return np.maximum(x, 0.)
  # Define the op's gradient in python
  def _relu_grad(x):
    return np.float32(x > 0)
  # An adapter that defines a gradient op compatible with TensorFlow
  def _relu_grad_op(op, grad):
    x = op.inputs[0]
    x_grad = grad * tf.py_func(_relu_grad, [x], tf.float32)
    return x_grad
  # Register the gradient with a unique id
  grad_name = "MyReluGrad_" + str(uuid.uuid4())
  tf.RegisterGradient(grad_name)(_relu_grad_op)
  # Override the gradient of the custom op
  g = tf.get_default_graph()
  with g.gradient_override_map({"PyFunc": grad_name}):
    output = tf.py_func(_relu, [inputs], tf.float32)
  return output
To verify that the gradients are correct, you can use TensorFlow's gradient checker:
x = tf.random_normal([10])
y = relu(x * x)
with tf.Session():
  diff = tf.test.compute_gradient_error(x, [10], y, [10])
  print(diff)
import io
import matplotlib.pyplot as plt
import numpy as np
import PIL
import tensorflow as tf
def visualize_labeled_images(images, labels, max_outputs=3, name="image"):
  def _visualize_image(image, label):
    # Do the actual drawing in python
    fig = plt.figure(figsize=(3, 3), dpi=80)
    ax = fig.add_subplot(111)
    ax.imshow(image[::-1, ...])
    ax.text(0, 0, str(label),
            horizontalalignment="left",
            verticalalignment="top")
    fig.canvas.draw()
    # Write the plot as a memory file.
    buf = io.BytesIO()
    data = fig.savefig(buf, format="png")
    buf.seek(0)
    # Read the image and convert to numpy array
    img = PIL.Image.open(buf)
    return np.array(img.getdata()).reshape(img.size[0], img.size[1], -1)
  def _visualize_images(images, labels):
    # Only display the given number of examples in the batch
    outputs = []
    for i in range(max_outputs):
      output = _visualize_image(images[i], labels[i])
      outputs.append(output)
    return np.array(outputs, dtype=np.uint8)
  # Run the python op.
  figs = tf.py_func(_visualize_images, [images, labels], tf.uint8)
  return tf.summary.image(name, figs)
Note that since summaries are usually evaluated only occasionally (not at every step), this implementation can be used in practice without worrying about efficiency.
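A minimal usage sketch, assuming images and labels tensors coming from your input pipeline and a hypothetical log directory:

# Hypothetical usage: images is a [batch, height, width, channels] tensor and
# labels is a [batch] tensor produced by your input pipeline.
summary_op = visualize_labeled_images(images, labels, max_outputs=3)

writer = tf.summary.FileWriter("/tmp/logs")  # hypothetical log directory
with tf.Session() as sess:
  summary = sess.run(summary_op)
  writer.add_summary(summary, global_step=0)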
10. Multi-GPU and Data Parallelism
If you write your software in a language like C++ for a single CPU core, making it run on multiple GPUs in parallel would require rewriting it from scratch. But this is not the case with TensorFlow. Because of its symbolic nature, TensorFlow can hide all of this complexity, making it effortless to scale your program across many CPUs and GPUs.
Let's start with a simple example of adding two vectors on the CPU:
import tensorflow as tf
with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)): a = tf.random_uniform([1000, 100]) b = tf.random_uniform([1000, 100]) c = a + b
tf.Session().run(c)
The same thing can be done on a GPU:
with tf.device(tf.DeviceSpec(device_type="GPU", device_index=0)): a = tf.random_uniform([1000, 100]) b = tf.random_uniform([1000, 100]) c = a + b
# Split the inputs in two and add each half on its own GPU.
split_a = tf.split(a, 2)
split_b = tf.split(b, 2)

split_c = []
for i in range(2):
  with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
    split_c.append(split_a[i] + split_b[i])
c = tf.concat(split_c, axis=0)
Let's rewrite this in a more general form, so that we can replace addition with any other set of operations:
def make_parallel(fn, num_gpus, **kwargs):
  in_splits = {}
  for k, v in kwargs.items():
    in_splits[k] = tf.split(v, num_gpus)
  out_split = []
  for i in range(num_gpus):
    with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
      with tf.variable_scope(tf.get_variable_scope(), reuse=i > 0):
        out_split.append(fn(**{k: v[i] for k, v in in_splits.items()}))

  return tf.concat(out_split, axis=0)
Probably the most common error when working with TensorFlow is passing tensors of the wrong shape to ops. Many TensorFlow operations can work on tensors of different ranks and shapes. This is convenient when using the API, but it can lead to extra headaches when things go wrong.
For example, consider the tf.matmul op, which can multiply two matrices:
a = tf.random_uniform([2, 3])
b = tf.random_uniform([3, 4])
c = tf.matmul(a, b)  # c is a tensor of shape [2, 4]
But the same function can also do batch matrix multiplication:
a = tf.random_uniform([10, 2, 3])
b = tf.random_uniform([10, 3, 4])
c = tf.matmul(a, b)  # c is a tensor of shape [10, 2, 4]
Another example, which we discussed earlier in the broadcasting section, is the add operation, which supports broadcasting:
a = tf.constant([[1.], [2.]])
b = tf.constant([1., 2.])
c = a + b  # c is a tensor of shape [2, 2]
Validate your tensors with tf.assert* ops
One way to reduce the chance of unwanted behavior is to explicitly validate the rank or shape of intermediate tensors with tf.assert* operations.
a = tf.constant([[1.], [2.]])
b = tf.constant([1., 2.])
check_a = tf.assert_rank(a, 1)  # This will raise an InvalidArgumentError exception
check_b = tf.assert_rank(b, 1)
with tf.control_dependencies([check_a, check_b]):
  c = a + b  # c is a tensor of shape [2, 2]
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(10000):
  sess.run(train_op)
print(sess.run(tf.nn.softmax(w)))
We use tf.nn.softmax_cross_entropy_with_logits to define the entropy of the categorical distribution, and then use the Adam optimizer to find the weights with maximum entropy. If you have taken an information theory course, you know that the uniform distribution has maximum entropy, so you would expect the result to be [0.2, 0.2, 0.2, 0.2, 0.2]. But if you run this, you may get an unexpected result:
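For reference, here is a minimal sketch of the setup that the training loop above relies on; the variable name w and the learning rate are assumptions:

# Hypothetical setup: learn logits w whose softmax has maximum entropy.
w = tf.get_variable("w", shape=[5])
# The cross entropy of a distribution with itself is its entropy.
entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.nn.softmax(w), logits=w)
# Maximize the entropy by minimizing its negation.
train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(-entropy)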
def get_shape(tensor):
  """Returns static shape if available and dynamic shape otherwise."""
  static_shape = tensor.shape.as_list()
  dynamic_shape = tf.unstack(tf.shape(tensor))
  dims = [s[1] if s[0] is None else s[0]
          for s in zip(static_shape, dynamic_shape)]
  return dims
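A small usage sketch, assuming a placeholder whose batch dimension is unknown:

x = tf.placeholder(tf.float32, [None, 10, 32])
shape = get_shape(x)
# shape[0] is a scalar int32 tensor (dynamic); shape[1] and shape[2] are the
# Python integers 10 and 32 (static).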
def log_prob_from_logits(logits, axis=-1):
  """Normalize the log-probabilities so that probabilities sum to one."""
  return logits - tf.reduce_logsumexp(logits, axis=axis, keep_dims=True)
def batch_gather(tensor, indices):
  """Gather in batch from a tensor of arbitrary size.

  In pseudocode this module will produce the following:
    output[i] = tf.gather(tensor[i], indices[i])

  Args:
    tensor: Tensor of arbitrary size.
    indices: Vector of indices.
  Returns:
    output: A tensor of gathered values.
  """
  shape = get_shape(tensor)
  flat_first = tf.reshape(tensor, [shape[0] * shape[1]] + shape[2:])
  indices = tf.convert_to_tensor(indices)
  offset_shape = [shape[0]] + [1] * (indices.shape.ndims - 1)
  offset = tf.reshape(tf.range(shape[0]) * shape[1], offset_shape)
  output = tf.gather(flat_first, indices + offset)
  return output
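A small usage sketch of batch_gather; the values are made up for illustration:

tensor = tf.constant([[10, 11, 12],
                      [20, 21, 22]])
indices = tf.constant([2, 0])
# Picks element 2 of row 0 and element 0 of row 1.
output = batch_gather(tensor, indices)

with tf.Session() as sess:
  print(sess.run(output))  # [12 20]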
def rnn_beam_search(update_fn, initial_state, sequence_length, beam_width,
                    begin_token_id, end_token_id, name="rnn"):
  """Beam-search decoder for recurrent models.

  Args:
    update_fn: Function to compute the next state and logits given the current
      state and ids.
    initial_state: Recurrent model states.
    sequence_length: Length of the generated sequence.
    beam_width: Beam width.
    begin_token_id: Begin token id.
    end_token_id: End token id.
    name: Scope of the variables.
  Returns:
    ids: Output indices.
    logprobs: Output log probabilities.
  """
  batch_size = initial_state.shape.as_list()[0]
  state = tf.tile(tf.expand_dims(initial_state, axis=1), [1, beam_width, 1])
def merge(tensors, units, activation=tf.nn.relu, name=None, **kwargs):
  """Merge features with broadcasting support.

  This operation concatenates multiple features of varying length and applies
  non-linear transformation to the outcome.

  Example:
    a = tf.zeros([m, 1, d1])
    b = tf.zeros([1, n, d2])
    c = merge([a, b], d3)  # shape of c would be [m, n, d3].

  Args:
    tensors: A list of tensor with the same rank.
    units: Number of units in the projection function.
  """
  with tf.variable_scope(name, default_name="merge"):
    # Apply linear projection to input tensors.
    projs = []
    for i, tensor in enumerate(tensors):
      proj = tf.layers.dense(
          tensor, units, activation=None,
          name="proj_%d" % i, **kwargs)
      projs.append(proj)
    # Compute sum of tensors.
    result = projs.pop()
    for proj in projs:
      result = result + proj
    # Apply nonlinearity.
    if activation:
      result = activation(result)
  return result
def gaussian_kl(q, p=(0., 0.)):
  """Computes KL divergence between two isotropic Gaussian distributions.

  To ensure numerical stability, this op uses mu, log(sigma^2) to represent
  the distribution. If p is not provided, it's assumed to be a unit Gaussian.

  Args:
    q: A tuple (mu, log(sigma^2)) representing a multivariate Gaussian.
    p: A tuple (mu, log(sigma^2)) representing a multivariate Gaussian.
  Returns:
    A tensor representing KL(q, p).
  """
  mu1, log_sigma1_sq = q
  mu2, log_sigma2_sq = p
  return tf.reduce_sum(
      0.5 * (log_sigma2_sq - log_sigma1_sq +
             tf.exp(log_sigma1_sq - log_sigma2_sq) +
             tf.square(mu1 - mu2) / tf.exp(log_sigma2_sq) - 1),
      axis=-1)
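A small usage sketch, e.g. as the KL regularizer of a variational autoencoder against a unit Gaussian prior; the tensor names and shapes are assumptions:

# Hypothetical encoder outputs: mean and log-variance of q(z|x).
mu = tf.zeros([32, 10])
log_sigma_sq = tf.zeros([32, 10])

kl = gaussian_kl((mu, log_sigma_sq))  # KL(q || N(0, I)) per example, shape [32]
kl_loss = tf.reduce_mean(kl)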
def make_parallel(fn, num_gpus, **kwargs):
  """Parallelize given model on multiple gpu devices.

  Args:
    fn: Arbitrary function that takes a set of input tensors and outputs a
      single tensor. First dimension of inputs and output tensor are assumed
      to be batch dimension.
    num_gpus: Number of GPU devices.
    **kwargs: Keyword arguments to be passed to the model.
  Returns:
    A tensor corresponding to the model output.
  """
  in_splits = {}
  for k, v in kwargs.items():
    in_splits[k] = tf.split(v, num_gpus)
  out_split = []
  for i in range(num_gpus):
    with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
      with tf.variable_scope(tf.get_variable_scope(), reuse=i > 0):
        out_split.append(fn(**{k: v[i] for k, v in in_splits.items()}))
  return tf.concat(out_split, axis=0)
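A usage sketch, assuming a hypothetical model function whose inputs and output have the batch dimension first:

def model(x):
  # Hypothetical two-layer network; variables are shared across GPUs because
  # make_parallel reuses the variable scope on devices 1..num_gpus-1.
  h = tf.layers.dense(x, 128, activation=tf.nn.relu, name="hidden")
  return tf.layers.dense(h, 10, name="output")

x = tf.placeholder(tf.float32, [128, 100])
y = make_parallel(model, 2, x=x)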
Leaky relu
def leaky_relu(tensor, alpha=0.1):
  """Computes the leaky rectified linear activation."""
  return tf.maximum(tensor, alpha * tensor)
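For example, it can be passed as the activation of a layer (a small sketch with made-up sizes):

x = tf.random_normal([32, 64])
h = tf.layers.dense(x, 100, activation=leaky_relu)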
def batch_normalization(tensor, training=False, epsilon=0.001, momentum=0.9,
                        fused_batch_norm=False, name=None):
  """Performs batch normalization on given 4-D tensor.

  The features are assumed to be in NHWC format. Note that you need to run
  UPDATE_OPS in order for this function to perform correctly, e.g.:

    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
      train_op = optimizer.minimize(loss)

  Based on: https://arxiv.org/abs/1502.03167
  """
  with tf.variable_scope(name, default_name="batch_normalization"):
    channels = tensor.shape.as_list()[-1]
    axes = list(range(tensor.shape.ndims - 1))