Quelqu'un peut-il expliquer clairement la différence entre les convolutions 1D, 2D et 3D dans CNN (apprentissage en profondeur) avec des exemples?
Je veux expliquer avec l'image de C3D .
En un mot, la direction convolutionnelle & la forme de la sortie est importante!
↑↑↑↑ Convolutions 1D - Basic ↑↑↑↑
import tensorflow as tf
import numpy as np
sess = tf.Session()
ones_1d = np.ones(5)
weight_1d = np.ones(3)
strides_1d = 1
in_1d = tf.constant(ones_1d, dtype=tf.float32)
filter_1d = tf.constant(weight_1d, dtype=tf.float32)
in_width = int(in_1d.shape[0])
filter_width = int(filter_1d.shape[0])
input_1d = tf.reshape(in_1d, [1, in_width, 1])
kernel_1d = tf.reshape(filter_1d, [filter_width, 1, 1])
output_1d = tf.squeeze(tf.nn.conv1d(input_1d, kernel_1d, strides_1d, padding='SAME'))
print sess.run(output_1d)
↑↑↑↑ Convolutions 2D - Basic ↑↑↑↑
ones_2d = np.ones((5,5))
weight_2d = np.ones((3,3))
strides_2d = [1, 1, 1, 1]
in_2d = tf.constant(ones_2d, dtype=tf.float32)
filter_2d = tf.constant(weight_2d, dtype=tf.float32)
in_width = int(in_2d.shape[0])
in_height = int(in_2d.shape[1])
filter_width = int(filter_2d.shape[0])
filter_height = int(filter_2d.shape[1])
input_2d = tf.reshape(in_2d, [1, in_height, in_width, 1])
kernel_2d = tf.reshape(filter_2d, [filter_height, filter_width, 1, 1])
output_2d = tf.squeeze(tf.nn.conv2d(input_2d, kernel_2d, strides=strides_2d, padding='SAME'))
print sess.run(output_2d)
↑↑↑↑↑ Convolutions 3D - Basic ↑↑↑↑
ones_3d = np.ones((5,5,5))
weight_3d = np.ones((3,3,3))
strides_3d = [1, 1, 1, 1, 1]
in_3d = tf.constant(ones_3d, dtype=tf.float32)
filter_3d = tf.constant(weight_3d, dtype=tf.float32)
in_width = int(in_3d.shape[0])
in_height = int(in_3d.shape[1])
in_depth = int(in_3d.shape[2])
filter_width = int(filter_3d.shape[0])
filter_height = int(filter_3d.shape[1])
filter_depth = int(filter_3d.shape[2])
input_3d = tf.reshape(in_3d, [1, in_depth, in_height, in_depth, 1])
kernel_3d = tf.reshape(filter_3d, [filter_depth, filter_height, filter_width, 1, 1])
output_3d = tf.squeeze(tf.nn.conv3d(input_3d, kernel_3d, strides=strides_3d, padding='SAME'))
print sess.run(output_3d)
↑↑↑↑ Convolutions 2D avec entrée 3D - LeNet, VGG, ..., ↑ ↑↑↑↑
in_channels = 32 # 3 for RGB, 32, 64, 128, ...
ones_3d = np.ones((5,5,in_channels)) # input is 3d, in_channels = 32
# filter must have 3d-shpae with in_channels
weight_3d = np.ones((3,3,in_channels))
strides_2d = [1, 1, 1, 1]
in_3d = tf.constant(ones_3d, dtype=tf.float32)
filter_3d = tf.constant(weight_3d, dtype=tf.float32)
in_width = int(in_3d.shape[0])
in_height = int(in_3d.shape[1])
filter_width = int(filter_3d.shape[0])
filter_height = int(filter_3d.shape[1])
input_3d = tf.reshape(in_3d, [1, in_height, in_width, in_channels])
kernel_3d = tf.reshape(filter_3d, [filter_height, filter_width, in_channels, 1])
output_2d = tf.squeeze(tf.nn.conv2d(input_3d, kernel_3d, strides=strides_2d, padding='SAME'))
print sess.run(output_2d)
in_channels = 32 # 3 for RGB, 32, 64, 128, ...
out_channels = 64 # 128, 256, ...
ones_3d = np.ones((5,5,in_channels)) # input is 3d, in_channels = 32
# filter must have 3d-shpae x number of filters = 4D
weight_4d = np.ones((3,3,in_channels, out_channels))
strides_2d = [1, 1, 1, 1]
in_3d = tf.constant(ones_3d, dtype=tf.float32)
filter_4d = tf.constant(weight_4d, dtype=tf.float32)
in_width = int(in_3d.shape[0])
in_height = int(in_3d.shape[1])
filter_width = int(filter_4d.shape[0])
filter_height = int(filter_4d.shape[1])
input_3d = tf.reshape(in_3d, [1, in_height, in_width, in_channels])
kernel_4d = tf.reshape(filter_4d, [filter_height, filter_width, in_channels, out_channels])
#output stacked shape is 3D = 2D x N matrix
output_3d = tf.nn.conv2d(input_3d, kernel_4d, strides=strides_2d, padding='SAME')
print sess.run(output_3d)
↑↑↑↑ Bonus 1x1 conv dans CNN - GoogLeNet, ..., ↑↑↑↑
in_channels = 32 # 3 for RGB, 32, 64, 128, ...
out_channels = 64 # 128, 256, ...
ones_3d = np.ones((1,1,in_channels)) # input is 3d, in_channels = 32
# filter must have 3d-shpae x number of filters = 4D
weight_4d = np.ones((3,3,in_channels, out_channels))
strides_2d = [1, 1, 1, 1]
in_3d = tf.constant(ones_3d, dtype=tf.float32)
filter_4d = tf.constant(weight_4d, dtype=tf.float32)
in_width = int(in_3d.shape[0])
in_height = int(in_3d.shape[1])
filter_width = int(filter_4d.shape[0])
filter_height = int(filter_4d.shape[1])
input_3d = tf.reshape(in_3d, [1, in_height, in_width, in_channels])
kernel_4d = tf.reshape(filter_4d, [filter_height, filter_width, in_channels, out_channels])
#output stacked shape is 3D = 2D x N matrix
output_3d = tf.nn.conv2d(input_3d, kernel_4d, strides=strides_2d, padding='SAME')
print sess.run(output_3d)
- Lien d'origine: LINK
- L'auteur: Martin Görner
- Twitter: @martin_gorner
- Google +: plus.google.com/+MartinGorne
↑↑↑↑ Convolutions 1D avec entrée 1D ↑↑↑↑
↑↑↑↑ Convolutions 1D avec entrée 2D ↑↑↑↑↑
in_channels = 32 # 3, 32, 64, 128, ...
out_channels = 64 # 3, 32, 64, 128, ...
ones_4d = np.ones((5,5,5,in_channels))
weight_5d = np.ones((3,3,3,in_channels,out_channels))
strides_3d = [1, 1, 1, 1, 1]
in_4d = tf.constant(ones_4d, dtype=tf.float32)
filter_5d = tf.constant(weight_5d, dtype=tf.float32)
in_width = int(in_4d.shape[0])
in_height = int(in_4d.shape[1])
in_depth = int(in_4d.shape[2])
filter_width = int(filter_5d.shape[0])
filter_height = int(filter_5d.shape[1])
filter_depth = int(filter_5d.shape[2])
input_4d = tf.reshape(in_4d, [1, in_depth, in_height, in_depth, in_channels])
kernel_5d = tf.reshape(filter_5d, [filter_depth, filter_height, filter_width, in_channels, out_channels])
output_4d = tf.nn.conv3d(input_4d, kernel_5d, strides=strides_3d, padding='SAME')
print sess.run(output_4d)
sess.close()
CNN 1D, 2D ou 3D fait référence à la direction de convolution plutôt qu'à la dimension d'entrée ou de filtrage.
Pour une entrée de canal, CNN2D est égal à CNN1D = longueur du noyau = longueur de l’entrée. (1 direction conv)