Problem
I am building a TensorFlow CNN in Python 2.7 to classify 40x40x40 3D images into one of two categories. The training data is stored in an HDF5 file as a (5000, 40, 40, 40, 1) array (5000 training images, 1 color channel). However, when I train the network, each iteration over a batch of 32 images takes about a minute to complete. Through Activity Monitor I can see that about 6 GB of data is written to disk in each iteration. The HDF5 file itself is only about 500 MB. What is happening?
This is the code I used to load data:
f = h5py.File(file_name, 'r')
images = f.get('key_name')
images = np.array(images)                  # copies the whole dataset into memory
images = images.astype(np.float32)
images = np.multiply(images, 1.0 / 100.0)  # scale raw values down by 100
I also tried to use the HDF5 dataset object directly in each iteration, instead of loading all the data into memory at once, but the problem remains:
f = h5py.File(file_name, 'r')
images = np.array(f.get('key_name')[:batch_size])
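For reference, here is a hedged sketch of slicing one batch at a time from the open HDF5 dataset, so that only the requested rows are read from disk; the 'key_name' dataset and the 1/100 scaling follow the code above:

import h5py
import numpy as np

f = h5py.File(file_name, 'r')
dset = f['key_name']  # an h5py Dataset: stays on disk, no full in-memory copy

def get_batch(i, batch_size=32):
    # slicing an h5py Dataset reads only the requested rows from the file
    batch = dset[i * batch_size:(i + 1) * batch_size].astype(np.float32)
    return batch * (1.0 / 100.0)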
CNN Structure
The CNN I built has two 3D convolution layers, one flatten layer, one fully connected layer, and one output layer. Here is the complete code for the structure:
num_channels = 1
filter_size_conv1 = 5
filter_size_conv2 = 5
num_filters_conv1 = 32
num_filters_conv2 = 64
fc_layer_size = 64
x = tf.placeholder(tf.float32, shape=[None, img_size, img_size, img_size, num_channels], name='x')
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, dimension=1)
def create_weights(shape):
return tf.Variable(tf.truncated_normal(shape, stddev=0.05))
def create_biases(size):
return tf.Variable(tf.constant(0.05,shape=[size]))
def create_convolutional_layer(input,
num_input_channels,
conv_filter_size,
num_filters):
weights = create_weights(shape=[
conv_filter_size, conv_filter_size, conv_filter_size, num_input_channels, num_filters])
biases = create_biases(num_filters)
layer = tf.nn.conv3d(input=input,
filter=weights,
strides=[1, 1, 1, 1, 1],
padding='SAME')
layer += biases
layer = tf.nn.max_pool3d(input=layer,
ksize=[1, 4, 4, 4, 1],
strides=[1, 4, 4, 4, 1],
padding='SAME')
layer = tf.nn.relu(layer)
return layer
def create_flatten_layer(layer):
layer_shape = layer.get_shape()
num_features = layer_shape[1:5].num_elements()
layer = tf.reshape(layer, [-1, num_features])
return layer
def create_fc_layer(input,
num_inputs,
num_outputs,
use_relu=True):
weights = create_weights(shape=[num_inputs, num_outputs])
biases = create_biases(num_outputs)
layer = tf.matmul(input, weights) + biases
if use_relu:
layer = tf.nn.relu(layer)
return layer
layer_conv1 = create_convolutional_layer(input=x,
num_input_channels=num_channels,
conv_filter_size=filter_size_conv1,
num_filters=num_filters_conv1)
layer_conv2 = create_convolutional_layer(input=layer_conv1,
num_input_channels=num_filters_conv1,
conv_filter_size=filter_size_conv2,
num_filters=num_filters_conv2)
layer_flat = create_flatten_layer(layer_conv2)
layer_fc1 = create_fc_layer(input=layer_flat,
num_inputs=layer_flat.get_shape()[1:5].num_elements(),
num_outputs=fc_layer_size,
use_relu=True)
layer_fc2 = create_fc_layer(input=layer_fc1,
num_inputs=fc_layer_size,
num_outputs=num_classes,
use_relu=False)
Any help would be appreciated. Thank you so much!
I actually found the same problem here. I'm using conv3d and it takes like forever to finish. Probably related to the optimization of conv3d
Related
I wrote code that makes a TurtleBot 2 follow me based on detecting my face, and I chose a velocity of 0.2 m/s.
My issue is the robot's behavior when my face suddenly disappears: the TurtleBot stops abruptly. I need it to decrease its velocity gradually, as in the figure I linked.
My experience with ROS time is not good.
I need it to start counting the seconds from zero every time it loses my face.
The issue in my code: once the code runs, the time increases continuously whether it has lost my face or not, in this line:
v = self.twist.linear.x = (-0.07 * t + 0.2)
My full code:
#!/usr/bin/env python
import rospy
from sensor_msgs.msg import Image
from geometry_msgs.msg import Twist
import cv2, cv_bridge
face_cascade = cv2.CascadeClassifier('/home/redhwan/1/run-webcam/Face-Detect-Demo-by-Ali-master/haarcascade_frontalface_default.xml' )
class Face_detection:
def __init__(self):
self.bridge = cv_bridge.CvBridge()
self.starting_time = rospy.get_rostime().to_sec()
self.save_time = True
self.image_sub = rospy.Subscriber('/usb_cam/image_raw',Image, self.image_callback)
self.cmd_vel_pub = rospy.Publisher('/cmd_vel_mux/input/teleop',Twist, queue_size=1)
self.twist = Twist()
def image_callback(self, msg):
image = self.bridge.imgmsg_to_cv2(msg,desired_encoding='bgr8')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30), flags=cv2.CASCADE_SCALE_IMAGE)
for (x, y, w, h) in faces:
cv2.rectangle(image, (x, y), (x+w, y+h), (0, 255, 0), 2)
self.twist.linear.x = 0.2
self.cmd_vel_pub.publish(self.twist)
cv2.imshow('face ', image)
cv2.waitKey(3)
        if type(faces) == tuple:  # detectMultiScale returns an empty tuple when no face is found
if(self.save_time == False):
# self.save_time = False #Condition only the first time
self.starting_time = rospy.get_rostime().to_sec() #save the current time
now = rospy.get_rostime().to_sec()
# self.save_time == False
t = (now - self.starting_time)
print ('t',t)
if t <2.9:
v = self.twist.linear.x = (-0.07 * t + 0.2)
print v
self.cmd_vel_pub.publish(self.twist)
if t >= 2.9:
v = self.twist.linear.x = 0
print v
self.cmd_vel_pub.publish(self.twist)
rospy.init_node('face_detection')
follower = Face_detection()
rospy.spin()
Please help me.
Thanks in advance.
If all you need is to make the TurtleBot's movements smoother, you might find that the velocity smoother package fulfills your needs.
You can install it by running:
sudo apt install ros-kinetic-yocs-velocity-smoother
The node takes raw velocity input and filters it based on acceleration parameters. So you can remap your cmd_vel_mux output to raw_cmd_vel, and remap the smoothed output smooth_cmd_vel to the input going to the turtlebot.
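In the question's node, that would mean publishing the raw command to the smoother's input topic instead of directly to the mux. A sketch (the exact topic name is an assumption; it depends on how the smoother node is launched and namespaced):

self.cmd_vel_pub = rospy.Publisher('/velocity_smoother/raw_cmd_vel', Twist, queue_size=1)

As a side note on the timer itself: a common pattern is to re-arm a flag while the face is visible and record the start time only once, on the first callback after the face is lost. A minimal sketch of that bookkeeping, reusing the attributes from the question's class:

if len(faces) > 0:
    self.save_time = True    # re-arm while the face is visible
    self.twist.linear.x = 0.2
else:
    if self.save_time:       # first callback after losing the face
        self.starting_time = rospy.get_rostime().to_sec()
        self.save_time = False
    t = rospy.get_rostime().to_sec() - self.starting_time
    self.twist.linear.x = max(0.0, -0.07 * t + 0.2)
self.cmd_vel_pub.publish(self.twist)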
I am currently trying to train this RNN, but I keep running into weird errors that I am not able to decode.
The input to my RNN is digitally sampled audio files. Since the audio files can have different lengths, the vectors of sampled audio also have different lengths.
The output, or target, of the neural network is to recreate a 14-dimensional vector containing certain information about the audio files. I already know the targets, having calculated them manually, but I need to make this work with a neural network.
I am currently using TensorFlow as the framework.
My network setup looks like this:
def last_relevant(output):
max_length = int(output.get_shape()[1])
relevant = tf.reduce_sum(tf.mul(output, tf.expand_dims(tf.one_hot(length, max_length), -1)), 1)
return relevant
def length(sequence): ## Zero-padding to fit the max length... question whether that is a good idea.
used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
length = tf.reduce_sum(used, reduction_indices=1)
length = tf.cast(length, tf.int32)
return length
def cost(output, target):
# Compute cross entropy for each frame.
cross_entropy = target * tf.log(output)
cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2)
mask = tf.sign(tf.reduce_max(tf.abs(target), reduction_indices=2))
cross_entropy *= mask
# Average over actual sequence lengths.
cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
cross_entropy /= tf.reduce_sum(mask, reduction_indices=1)
return tf.reduce_mean(cross_entropy)
#----------------------------------------------------------------------#
#----------------------------Main--------------------------------------#
### Tensorflow neural network setup
batch_size = None
sequence_length_max = max_length
input_dimension=1
data = tf.placeholder(tf.float32,[batch_size,sequence_length_max,input_dimension])
target = tf.placeholder(tf.float32,[None,14])
num_hidden = 24 ## Hidden layer
cell = tf.nn.rnn_cell.LSTMCell(num_hidden,state_is_tuple=True) ## Long short term memory
output, state = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32,sequence_length = length(data)) ## Creates the Rnn skeleton
last = last_relevant(output)  # tf.gather(val, int(val.get_shape()[0]) - 1) ## Appending as last
weight = tf.Variable(tf.truncated_normal([num_hidden, int(target.get_shape()[1])]))
bias = tf.Variable(tf.constant(0.1, shape=[target.get_shape()[1]]))
prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
cross_entropy = cost(output,target)# How far am I from correct value?
optimizer = tf.train.AdamOptimizer() ## TensorflowOptimizer
minimize = optimizer.minimize(cross_entropy)
mistakes = tf.not_equal(tf.argmax(target, 1), tf.argmax(prediction, 1))
error = tf.reduce_mean(tf.cast(mistakes, tf.float32))
## Training ##
init_op = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init_op)
batch_size = 1000
no_of_batches = int(len(train_data)/batch_size)
epoch = 5000
for i in range(epoch):
ptr = 0
for j in range(no_of_batches):
inp, out = train_data[ptr:ptr+batch_size], train_output[ptr:ptr+batch_size]
ptr+=batch_size
sess.run(minimize,{data: inp, target: out})
print "Epoch - ",str(i)
incorrect = sess.run(error,{data: test_data, target: test_output})
print('Epoch {:2d} error {:3.1f}%'.format(i + 1, 100 * incorrect))
sess.close()
The error seems to be in the usage of the function last_relevant, which should take the output and feed it back.
This is the error message:
TypeError: Expected binary or unicode string, got <function length at 0x7f846594dde8>
Any way to tell what could be wrong here?
I tried to build your code on my machine.
There is a fundamental mistake in the code: you call tf.one_hot, but what you pass doesn't fit what it expects.
Read documentation here:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.one_hot.md
tf.one_hot(indices, depth, on_value=None, off_value=None, axis=None, dtype=None, name=None)
However, you are passing a function pointer as the first parameter ("length" is a function in your code; I recommend naming your functions in a meaningful manner and refraining from using common keywords).
As a rough guide, you can put your indices as the first parameter (instead of my placeholder empty list) and it will be fixed:
relevant = tf.reduce_sum(
tf.mul(output, tf.expand_dims(tf.one_hot([], max_length), -1)), 1)
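Alternatively, a common pattern (a sketch, untested on your data) is to evaluate the sequence lengths once and pick the last relevant output with tf.gather, which avoids tf.one_hot entirely:

def last_relevant(output, lengths):
    # output: [batch, max_length, hidden]; lengths: [batch], int32
    batch_size = tf.shape(output)[0]
    max_length = int(output.get_shape()[1])
    output_size = int(output.get_shape()[2])
    index = tf.range(0, batch_size) * max_length + (lengths - 1)
    flat = tf.reshape(output, [-1, output_size])
    return tf.gather(flat, index)

seq_lengths = length(data)
output, state = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32, sequence_length=seq_lengths)
last = last_relevant(output, seq_lengths)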
I'm currently trying to train an image classification convolutional neural network. I'm using an architecture similar to that in the TensorFlow tutorial. After training, I can get quite high training accuracy and very low cross entropy, but the test accuracy is always only a little higher than random guessing. The network seems to suffer from overfitting. In the training process I applied stochastic gradient descent and dropout to try to avoid overfitting, but it just doesn't seem to work.
Here is part of my code.
batch_image = np.ndarray(shape=(100,9216), dtype='float')
batch_class = np.ndarray(shape=(100,10), dtype='float')
# first convolutional layer
w_conv1 = weight_variable([5, 5, 3, 64])
b_conv1 = bias_variable([64])
x_image = tf.reshape(x, [-1, 48, 64, 3])
h_conv1 = tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
norm1 = tf.nn.lrn(tf.to_float(h_pool1, name='ToFloat'), 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
# second convolutional layer
w_conv2 = weight_variable([5, 5, 64, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(norm1, w_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
norm2 = tf.nn.lrn(tf.to_float(h_pool2, name='ToFloat'), 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
# densely connected layer
w_fc1 = weight_variable([12*16*64, 512])
b_fc1 = bias_variable([512])
h_pool2_flat = tf.reshape(norm2, [-1, 12*16*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)
#densely connected layer
w_fc2 = weight_variable([512, 256])
b_fc2 = bias_variable([256])
h_fc2 = tf.nn.relu(tf.matmul(h_fc1, w_fc2) + b_fc2)
# dropout
keep_prob = tf.placeholder("float")
h_fc1_drop = tf.nn.dropout(h_fc2, keep_prob)
# readout layer
w_fc3 = weight_variable([256, 10])
b_fc3 = bias_variable([10])
y_prob = tf.nn.softmax(tf.matmul(h_fc1_drop, w_fc3) + b_fc3)
# train and evaluate the model
cross_entropy = -tf.reduce_sum(y_ * tf.log(y_prob + 0.000000001))
train_step = tf.train.GradientDescentOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_prob, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
sess.run(tf.initialize_all_variables())
for i in range(100):
rand_idx = np.random.randint(17778, size=(100))
k = 0
for j in rand_idx:
batch_image[k] = images[j]
batch_class[k] = np.zeros(shape=(10))
batch_class[k, classes[j, 0]] = 1.0
k+=1
train_step.run(feed_dict={x:batch_image, y_:batch_class, keep_prob:0.5})
train_accuracy = accuracy.eval(feed_dict={x:batch_image, y_:batch_class, keep_prob:1.0})
train_ce = cross_entropy.eval(feed_dict={x:batch_image, y_:batch_class, keep_prob:1.0})
I am wondering whether there is a mistake in my code or whether I have to apply other strategies to get better test accuracy.
Thank you!
You can try the strategies below to avoid overfitting:
Shuffle the input data.
Use early stopping on the loss with some patience level.
L1 & L2 regularization (see the sketch after this list).
Add dropout.
Batch normalization.
If pixels are not normalized, dividing the pixel values by 255 also helps.
Perform image data augmentation.
Maybe hyperparameter tuning via grid search.
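For example, here is a minimal sketch of adding L2 regularization (weight decay) to the loss defined in the question, reusing its fully connected weight variables; the 1e-4 coefficient is an arbitrary starting point to tune:

l2 = 1e-4 * (tf.nn.l2_loss(w_fc1) + tf.nn.l2_loss(w_fc2) + tf.nn.l2_loss(w_fc3))
total_loss = cross_entropy + l2
train_step = tf.train.GradientDescentOptimizer(1e-4).minimize(total_loss)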
Hope it helps! Happy Coding.
Thank You!
I'm fitting a fully convolutional network to some image data for semantic segmentation using Keras. However, I'm having some problems with overfitting. I don't have much data and I want to do data augmentation. However, since I need pixel-wise classification, I need any augmentations such as flips, rotations, and shifts to apply to both the feature images and the label images. Ideally I'd like to use the Keras ImageDataGenerator for on-the-fly transformations. However, as far as I can tell, you cannot apply equivalent transformations to both the feature and the label data.
Does anyone know if this is the case, and if not, does anyone have any ideas? Otherwise, I'll use other tools to create a larger dataset and just feed it in all at once.
Thanks!
Yes you can. Here's an example from the Keras docs: you zip together two generators seeded with the same seed and then call fit_generator on the combined generator.
https://keras.io/preprocessing/image/
# we create two instances with the same arguments
data_gen_args = dict(featurewise_center=True,
featurewise_std_normalization=True,
rotation_range=90.,
width_shift_range=0.1,
height_shift_range=0.1,
zoom_range=0.2)
image_datagen = ImageDataGenerator(**data_gen_args)
mask_datagen = ImageDataGenerator(**data_gen_args)
# Provide the same seed and keyword arguments to the fit and flow methods
seed = 1
image_datagen.fit(images, augment=True, seed=seed)
mask_datagen.fit(masks, augment=True, seed=seed)
image_generator = image_datagen.flow_from_directory(
'data/images',
class_mode=None,
seed=seed)
mask_generator = mask_datagen.flow_from_directory(
'data/masks',
class_mode=None,
seed=seed)
# combine generators into one which yields image and masks
train_generator = zip(image_generator, mask_generator)
model.fit_generator(
train_generator,
samples_per_epoch=2000,
nb_epoch=50)
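One caution with the snippet above if you are on Python 2 (which the surrounding code assumes): zip builds a full list, so zipping two never-ending generators will hang. itertools.izip pairs them lazily instead:

import itertools
train_generator = itertools.izip(image_generator, mask_generator)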
There are works on extending ImageDataGenerator to be more flexible for exactly these type of cases (see in this issue on Github for examples).
Additionally, as mentioned by Mikael Rousson in the comments, you can easily create your own version of ImageDataGenerator while leveraging many of its built-in functions to make it easier. Here is example code I've used for an image denoising problem, where I use random crops + additive noise to generate clean and noisy image pairs on the fly. You could easily modify this to add other types of augmentations. After that, you can use Model.fit_generator to train with these methods.
from keras.preprocessing.image import load_img, img_to_array, list_pictures
def random_crop(image, crop_size):
height, width = image.shape[1:]
dy, dx = crop_size
if width < dx or height < dy:
return None
x = np.random.randint(0, width - dx + 1)
y = np.random.randint(0, height - dy + 1)
return image[:, y:(y+dy), x:(x+dx)]
def image_generator(list_of_files, crop_size, to_grayscale=True, scale=1, shift=0):
while True:
filename = np.random.choice(list_of_files)
try:
img = img_to_array(load_img(filename, to_grayscale))
except:
return
cropped_img = random_crop(img, crop_size)
if cropped_img is None:
continue
yield scale * cropped_img - shift
def corrupted_training_pair(images, sigma):
for img in images:
target = img
if sigma > 0:
source = img + np.random.normal(0, sigma, img.shape)/255.0
else:
source = img
yield (source, target)
def group_by_batch(dataset, batch_size):
while True:
try:
sources, targets = zip(*[next(dataset) for i in xrange(batch_size)])
batch = (np.stack(sources), np.stack(targets))
yield batch
except:
return
def load_dataset(directory, crop_size, sigma, batch_size):
files = list_pictures(directory)
generator = image_generator(files, crop_size, scale=1/255.0, shift=0.5)
generator = corrupted_training_pair(generator, sigma)
generator = group_by_batch(generator, batch_size)
return generator
You can then use the above like so:
train_set = load_dataset('images/train', (patch_height, patch_width), noise_sigma, batch_size)
val_set = load_dataset('images/val', (patch_height, patch_width), noise_sigma, batch_size)
model.fit_generator(train_set, samples_per_epoch=batch_size * 1000, nb_epoch=nb_epoch, validation_data=val_set, nb_val_samples=1000)
I am trying to implement the famous Orange/Apple pyramid blending (cv2 Image Pyramids).
Note: both images have shape 307x307.
However, since the result image was blurred due to values being clipped in cv2.subtract and cv2.add (as stated in cv2 vs numpy Matrix Arithmetics), I used numpy arithmetic instead, as suggested in StackOverflow: Reconstructed Image after Laplacian Pyramid Not the same as original image.
I tested this by performing the pyramid operations on a single image; the image reconstructed from the pyramids has the same max, min, and average pixel values as the original, which is not the case with cv2 arithmetic.
However, at pyramid level 7 the result image picks up 'noise' in the form of a red dot, and at level 9 it gets a lot of green pixel noise. Images of levels 6, 7, 9 - Imgur Album.
Any idea why this happens? The green noise at level 9 I would attribute to the image shrinking below 1x1 in size, but what about the red dot at level 7?
EDIT: Code added
numberOfPyramids = 9
# generate Gaussian pyramids for A and B Images
GA = A.copy()
GB = B.copy()
gpA = [GA]
gpB = [GB]
for i in xrange(numberOfPyramids):
GA = cv2.pyrDown(GA)
GB = cv2.pyrDown(GB)
gpA.append(GA)
gpB.append(GB)
# generate Laplacian Pyramids for A and B Images
lpA = [gpA[numberOfPyramids - 1]]
lpB = [gpB[numberOfPyramids - 1]]
for i in xrange(numberOfPyramids - 1, 0, -1):
geA = cv2.pyrUp(gpA[i], dstsize = np.shape(gpA[i-1])[:2])
geB = cv2.pyrUp(gpB[i], dstsize = np.shape(gpB[i-1])[:2])
laplacianA = gpA[i - 1] - geA if i != 1 else cv2.subtract(gpA[i-1], geA)
laplacianB = gpB[i - 1] - geB if i != 1 else cv2.subtract(gpB[i-1], geB)
lpA.append(laplacianA)
lpB.append(laplacianB)
# Now add left and right halves of images in each level
LS = []
for la, lb in zip(lpA, lpB):
_, cols, _ = la.shape
ls = np.hstack((la[:, : cols / 2], lb[:, cols / 2 :]))
LS.append(ls)
# now reconstruct
ls_ = LS[0]
for i in xrange(1, numberOfPyramids):
ls_ = cv2.pyrUp(ls_, dstsize = np.shape(LS[i])[:2])
ls_ = ls_ + LS[i] if i != numberOfPyramids - 1 else cv2.add(ls_, LS[i])
cv2.imshow(namedWindowName, ls_)
cv2.waitKey()
After reading the original article about the Laplacian pyramid, I realize I had misunderstood the method: we can fully reconstruct the original image without blur, because the pyramid keeps the additional pixel information. And it is true that clipping values leads to blur. Well, now we come back to the beginning again :)
So the code you posted is still clipping values. I advise you to store the Laplacian pyramid as int16 and not use cv2.subtract. Hope it works.
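For example, a minimal sketch of that idea, following the variable names in the question's code and assuming 8-bit inputs (note also that cv2.pyrUp's dstsize is (width, height) while numpy shapes are (height, width); the two only coincide here because the images are square):

geA = cv2.pyrUp(gpA[i], dstsize=(gpA[i-1].shape[1], gpA[i-1].shape[0]))
laplacianA = gpA[i-1].astype(np.int16) - geA.astype(np.int16)  # negative values survive

Then at reconstruction, accumulate in int16 and clip back to uint8 only once at the end:

ls_ = cv2.pyrUp(ls_, dstsize=(LS[i].shape[1], LS[i].shape[0])).astype(np.int16) + LS[i]
result = np.clip(ls_, 0, 255).astype(np.uint8)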