tensorflow error: InvalidArgumentError: Shape mismatch in tuple component 1. Expected [1], got [5] - python-2.7

I am trying to construct batches of (wav_file, label) pairs.
The WAV file paths and labels are listed in dev.csv.
The code below is not working:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
threads = 1
batch_size = 5
global record_defaults
record_defaults = [['/Users/phoenix/workspace/dataset/data_thchs30/dev/A11_101.wav'], ['8.26'], ['七十 年代 末 我 外出 求学 母亲 叮咛 我 吃饭 要 细嚼慢咽 学习 要 深 钻 细 研']]

def read_record(filename_queue, num_records):
    reader = tf.TextLineReader()
    key, value = reader.read_up_to(filename_queue, num_records)
    wav_filename, duration, transcript = tf.decode_csv(value, record_defaults, field_delim=",")
    wav_reader = tf.WholeFileReader()
    wav_key, wav_value = wav_reader.read_up_to(tf.train.string_input_producer(wav_filename, shuffle=False, capacity=num_records), num_records)
    return [wav_key, transcript]  # throws the error
    # return [wav_key, wav_value]  # works
    # return [wav_filename, duration, transcript]  # works

data_queue = tf.train.string_input_producer(tf.train.match_filenames_once('dev.csv'), shuffle=False)
batch_data = [read_record(data_queue, batch_size) for _ in range(threads)]
capacity = threads * batch_size
batch_values = tf.train.batch_join(batch_data, batch_size=batch_size, capacity=capacity, enqueue_many=True)

init = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init)
    sess.run(tf.initialize_local_variables())
    coord = tf.train.Coordinator()
    print(coord)
    threads = tf.train.start_queue_runners(coord=coord)
    print("threads num: " + str(threads))
    try:
        step = 0
        while not coord.should_stop():
            step += 1
            feat = sess.run([batch_values])
            print("line:", step, feat)
    except tf.errors.OutOfRangeError:
        print('training for 1 epoch, %d steps' % step)
    finally:
        coord.request_stop()
        coord.join(threads)
It throws the InvalidArgumentError from the title. How can I fix it?
The dev.csv content is as follows:
/Users/phoenix/workspace/dataset/data_thchs30/dev/A11_101.wav,8.26,qi shi nian dai mo wo wai chu qiu xue
/Users/phoenix/workspace/dataset/data_thchs30/dev/A11_119.wav,6.9,chen yun tong shi yao qiu gan bu men ren zhen xue xi

I tried rewriting your code like this.
Here are my observations.
The error is no longer thrown, and the values are returned.
An obvious discrepancy is that the batch size for transcript is double the one specified, so it is 4 instead of 2. It doubles for some reason; there is no such problem for the audio binary.
shapes=[tf.TensorShape(()),tf.TensorShape(batch_size,)] is based on an error I saw which said I have to specify this using TensorShape. I didn't find the documentation much help, but it is mentioned there:
shapes: (Optional.) A list of fully-defined TensorShape objects with the same length as dtypes, or None.
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.DEBUG)

FLAGS = tf.app.flags.FLAGS
threads = 1
batch_size = 2
record_defaults = [['D:/male.wav'], ['8.26'], ['七十 年代 末 我 外出 求学 母亲 叮咛 我 吃饭 要 细嚼慢咽 学习 要 深 钻 细 研']]

def readbatch(data_queue):
    reader = tf.TextLineReader()
    _, rows = reader.read_up_to(data_queue, batch_size)
    wav_filename, duration, transcript = tf.decode_csv(rows, record_defaults, field_delim=",")
    audioreader = tf.WholeFileReader()
    _, audio = audioreader.read(tf.train.string_input_producer(wav_filename))
    return [audio, transcript]

data_queue = tf.train.string_input_producer(
    tf.train.match_filenames_once('D:/Book1.csv'), shuffle=False)
batch_data = [readbatch(data_queue) for _ in range(threads)]
capacity = threads * batch_size
batch_values = tf.train.batch_join(batch_data, shapes=[tf.TensorShape(()), tf.TensorShape(batch_size,)], capacity=capacity, batch_size=batch_size, enqueue_many=False)

init = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init)
    sess.run(tf.initialize_local_variables())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    try:
        step = 0
        while not coord.should_stop():
            step += 1
            feat = sess.run([batch_values])
            audio = feat[0][0]
            print('Size of audio is ' + str(audio.size))
            script = feat[0][1]
            print('Size of script is ' + str(script.size))
    except tf.errors.OutOfRangeError:
        print('training for 1 epoch, %d steps' % step)
    finally:
        coord.request_stop()
        coord.join(threads)
A sample dataset shows that there is an extra pair:
[[array([b'Text2', b'Text1'], dtype=object),
  array([[b'Translation-1', b'Translation-2'],
         [b'Translation-1', b'Translation-2']], dtype=object)]]
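If it helps, the doubling is consistent with how enqueue_many works: with enqueue_many=False each enqueued element is the whole [batch_size] transcript vector, so batching batch_size of them stacks the vectors into a [batch_size, batch_size] tensor, while the audio binary is unaffected because read() returns a scalar per file. A minimal sketch of the two modes (toy constants, TF 1.x queue API; only static shapes are inspected, so no session is needed):

import tensorflow as tf

# transcript comes out of decode_csv with shape [batch_size] = [2]
transcript = tf.constant(['Translation-1', 'Translation-2'])

# enqueue_many=False treats the whole [2] vector as ONE example,
# so batching 2 of them yields shape [2, 2] -- the extra pair above.
stacked = tf.train.batch([transcript], batch_size=2, enqueue_many=False)
print(stacked[0].get_shape())  # (2, 2)

# enqueue_many=True treats each row as its own example, keeping [2].
flat = tf.train.batch([transcript], batch_size=2, enqueue_many=True)
print(flat[0].get_shape())     # (2,)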

Related

Is this method of calculating the top-5 accuracy in pytorch correct?

I am trying to validate the findings of a paper by testing it on the same model architecture and the same dataset reported by the paper. I have been using the ImageNet script provided in the examples section of the official PyTorch repository to do this.
class AverageMeter(object):
    """Computes and stores the average and current value
    Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

top1 = AverageMeter()
top5 = AverageMeter()

# switch to evaluate mode
model.eval()

with torch.no_grad():
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        # measure data loading time
        print(f"Processing {batch_idx+1}/{len(test_loader)}")
        inputs, targets = inputs.cuda(), targets.cuda()
        inputs, targets = torch.autograd.Variable(inputs, volatile=True), torch.autograd.Variable(targets)
        # compute output
        outputs = model(inputs)
        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
        print(prec1, prec5)
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))
print(top1)
print(top5)
However, the top-5 error I am getting by using this script does not match the one reported in the paper. Can anyone tell me what is wrong in this particular snippet?
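One way to sanity-check the snippet is to run accuracy() on a toy batch where the expected numbers are known by hand. The values below are made up for illustration and assume the accuracy() defined above:

import torch

# 4 samples, 3 classes; logits chosen so the expected precisions are obvious
output = torch.tensor([[0.1, 0.8, 0.3],   # top-1 = class 1 (correct)
                       [0.7, 0.2, 0.1],   # top-1 = class 0 (correct)
                       [0.3, 0.2, 0.5],   # top-1 = class 2, but top-2 contains 0
                       [0.6, 0.3, 0.1]])  # neither top-1 nor top-2 is class 2
target = torch.tensor([1, 0, 0, 2])

prec1, prec2 = accuracy(output, target, topk=(1, 2))
print(prec1.item(), prec2.item())  # expected: 50.0 75.0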

No GPU usage apparent in Google Cloud VM with PyTorch already installed and CUDA 10

I have been using a network on my machine, nothing really special. I wanted it to run faster, so I moved to Google Cloud. But I noticed something weird: my machine with a GTX 1050 Ti was faster than a V100 GPU. This didn't add up, so I checked the usage, and even though I put some stress on it by creating a big network and passing a lot of data to it, with a simple .cuda() on both the model and the data, there was no usage shown by the nvidia-smi command, as shown in the image.
you can check my code here:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("The device is:", device, torch.cuda.get_device_name(0), "and how many are they", torch.cuda.device_count())

# We load the training data
Samples, Ocupancy, num_samples, Samples_per_slice = common.load_samples(args.samples_filename)
Samples = Samples * args.scaling_todo
print(Samples_per_slice)

# Divide into slices
Organize_Positions, Orginezed_Ocupancy, batch_size = common.organize_sample_data(Samples, Ocupancy, num_samples, Samples_per_slice, args.num_batches)

phi = common.MLP(3, 1).cuda()

x_test = torch.from_numpy(Organize_Positions.astype(np.float32)).cuda()
y_test = torch.from_numpy(Orginezed_Ocupancy.astype(np.float32)).cuda()

all_data = common.CustomDataset(x_test, y_test)

# Divide the data into slices
Slice_data = DataLoader(dataset=all_data, batch_size=batch_size, shuffle=False)  # only take batch_size = n/b TODO Don't shuffle
#Chunky_data = DataLoader(dataset=Slice_data, batch_size=chunch_size, shuffle=False)

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(phi.parameters(), lr=0.0001)
epoch = args.num_epochs

fit_start_time = time.time()
phi.train()
for epoch in range(epoch):
    curr_epoch_loss = 0
    batch = 0
    for x_batch, y_batch in Slice_data:
        optimizer.zero_grad()
        x_train = x_batch
        #print(x_train, batch_size)
        y_train = y_batch
        y_pred = phi(x_train)
        #print(y_pred, x_train)
        loss = criterion(y_pred.squeeze(), y_train.squeeze())
        curr_epoch_loss += loss
        print('Batch {}: train loss: {}'.format(batch, loss.item()))
        # Backward pass
        loss.backward()
        optimizer.step()  # Optimizes only phi parameters
        batch += 1
    print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
fit_end_time = time.time()
print("Total time = %f" % (fit_end_time - fit_start_time))

# Save model
torch.save({'state_dict': phi.state_dict()}, args.model_filename)
and the model here:
class MLP(nn.Module):
    def __init__(self, in_dim: int, out_dim: int):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.fc1 = nn.Linear(in_dim, 128)
        self.fc1_bn = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 256)
        self.fc2_bn = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 512)
        self.fc3_bn = nn.BatchNorm1d(512)
        self.fc4 = nn.Linear(512, 512)
        self.fc4_bn = nn.BatchNorm1d(512)
        self.fc5 = nn.Linear(512, out_dim, bias=False)
        self.relu = nn.LeakyReLU()

    def forward(self, x):
        x = self.relu(self.fc1_bn(self.fc1(x)))
        x = self.relu(self.fc2_bn(self.fc2(x)))  # leaky
        x = self.relu(self.fc3_bn(self.fc3(x)))
        x = self.relu(self.fc4_bn(self.fc4(x)))
        x = self.fc5(x)
        return x

class CustomDataset(Dataset):
    def __init__(self, x_tensor, y_tensor):
        self.x = x_tensor
        self.y = y_tensor

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.x)
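For reference, a minimal check (assuming phi and x_test from the code above) that the model and data actually live on the GPU:

import torch

# Both should report a cuda device if the .cuda() calls took effect.
print(next(phi.parameters()).device)  # expected: cuda:0
print(x_test.device)                  # expected: cuda:0
# Non-zero allocated memory is another sign tensors are resident on the GPU.
print(torch.cuda.memory_allocated())  # bytes currently allocated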

Input query for python code

So I have created this code for my research, but I want to use it with plenty of data files, and I do not want to do it manually, which means retyping some lines in my code for each desired file. How can I use the input command in Python (I work with Python 2.7 on Windows) to make this faster, just by typing the name of the desired data file? My code so far:
import iodata as io
import matplotlib.pyplot as plt
import numpy as np
import time
from scipy.signal import welch
from scipy import signal

testInstance = io.InputConverter()
start = time.time()
conversionError = io.ConversionError()
#data = testInstance.convert(r"S:\Doktorat\Python\", 1", conversionError)
data = testInstance.convert(r"/Users/PycharmProjects/Hugo/20160401", "201604010000", conversionError)
end = time.time()
print("time elapsed " + str(end - start))

if conversionError.conversionSucces:
    print("Conversion succesful")
if conversionError.conversionSucces == False:
    print("Conversion failed: " + conversionError.conversionErrorLog)
print "Done!"

# Create a new subplot for two canals 1 & 3
a = np.amin(data.data)
Bx = data.data[0,]
By = data.data[1,]
dt = float(300)/266350
Fs = 1/dt
t = np.arange(0, 300, dt*1e3)
N = len(Bx)
M = len(By)
time = np.linspace(0, 300, N)
time2 = np.linspace(0, 300, M)

filename = 'C:/Users/PycharmProjects/Hugo/20160401/201604010000.dat'
d = open(filename, 'rb')
degree = u"\u00b0"
headersize = 64
header = d.read(headersize)

ax1 = plt.subplot(211)
ax1.set_title(header[:16] + ', ' +  # station name
              'Canals: ' + header[32:33] + ' and ' + header[34:35] + ', '  # canals
              + 'Temp' + header[38:43] + degree + 'C'  # temperature
              + ', ' + 'Time:' + header[26:32] + ', ' + 'Date' + ' ' + header[16:26])  # date
plt.ylabel('Pico Tesla [pT]')
plt.xlabel('Time [ms]')
plt.grid()
plt.plot(time[51:-14], Bx[51:-14], label='Canal 1', color='r', linewidth=0.1, linestyle="-")
plt.plot(time2[1:-14], By[1:-14], label='Canal 3', color='b', linewidth=0.1, linestyle="-")
plt.legend(loc='upper right', frameon=False)

# Create a new subplot for FFT
plt.subplot(212)
plt.title('Fast Fourier Transform')
plt.ylabel('Power [a.u.]')
plt.xlabel('Frequency [Hz]')
xaxis2 = np.arange(0, 470, 10)
plt.xticks(xaxis2)
fft1 = (Bx[51:-14])
fft2 = (By[1:-14])
plt.grid()

# Loop for FFT data
for dataset in [fft1]:
    dataset = np.asarray(dataset)
    freqs, psd = welch(dataset, fs=266336/300, window='hamming', nperseg=8192)
    plt.semilogy(freqs, psd/dataset.size**0, color='r')
for dataset2 in [fft2]:
    dataset2 = np.asarray(dataset2)
    freqs2, psd2 = welch(dataset2, fs=266336/300, window='hamming', nperseg=8192)
    plt.semilogy(freqs2, psd2/dataset2.size**0, color='b')
plt.show()
As you can see, there are some places where it would be better to use input, so that when I run the code I can type the file names into Python instead of creating a separate Python file with the specific info hardcoded for every dataset.
Btw, I use PyCharm for my Python work.
If all you are trying to do is get rid of the hardcoded pathname, you should be able to format your filename string with input variables:
name = raw_input("Name: ")
measurement = raw_input("Measurement: ")
filename = "C:/Users/PycharmProjects/{0}/{1}".format(name, measurement)
see raw_input and string formatting
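For instance, the prompted values could replace both hardcoded paths in the script above (the folder and measurement names below are illustrative, not real data):

folder = raw_input("Data folder (e.g. 20160401): ")
measurement = raw_input("Measurement (e.g. 201604010000): ")

# the two hardcoded paths in the script then become:
data = testInstance.convert(r"/Users/PycharmProjects/Hugo/" + folder, measurement, conversionError)
filename = 'C:/Users/PycharmProjects/Hugo/{0}/{1}.dat'.format(folder, measurement)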

ValueError: Tensor Tensor("Const:0", shape=(), dtype=float32) may not be fed with tf.placeholder

I'm trying to make a speech recognition system with TensorFlow.
The input data is a NumPy array of size 50000 x 1.
The output (mapping) data is a NumPy array of size 400 x 1.
Input and mapping data are passed in batches of 2 in a list.
I've used this tutorial to design the neural network. Following are the code snippets:
For RNN:
input_data = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_DATA, sound_constants.MAX_COLUMN_SIZE_IN_DATA], name="train_input")
target = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT], name="train_output")
fwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
# creating one backward cell
bkwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
# creating bidirectional RNN
val, _, _ = tf.nn.static_bidirectional_rnn(fwd_cell, bkwd_cell, tf.unstack(input_data), dtype=tf.float32)
For feeding data:
feed = {g['input_data'] : trb[0], g['target'] : trb[1], g['dropout'] : 0.6}
accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
accuracy += accuracy_
When I ran the code, I got this error:
Traceback (most recent call last):
  File "/home/wolborg/PycharmProjects/speech-to-text-rnn/src/rnn_train_1.py", line 205, in <module>
    tr_losses, te_losses = train_network(g)
  File "/home/wolborg/PycharmProjects/speech-to-text-rnn/src/rnn_train_1.py", line 177, in train_network
    accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
  File "/home/wolborg/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 895, in run
    run_metadata_ptr)
  File "/home/wolborg/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1102, in _run
    raise ValueError('Tensor %s may not be fed.' % subfeed_t)
ValueError: Tensor Tensor("Const:0", shape=(), dtype=float32) may not be fed.
Process finished with exit code 1
Earlier, I was facing this issue with tf.sparse_placeholder; after some browsing, I changed the input type to tf.placeholder and made the related changes. Now I'm clueless about where I'm making the error.
Please suggest how I should feed the data.
Entire code:
import tensorflow as tf
# for taking MFCC and label input
import numpy as np
import rnn_input_data_1
import sound_constants

# input constants
# Training Parameters
num_input = 10  # mfcc data input
training_data_size = 8  # determines number of files in training and testing module
testing_data_size = num_input - training_data_size

# Network Parameters
learning_rate = 0.0001  # for large training set, it can be set 0.001
num_hidden = 200  # number of hidden layers
num_classes = 28  # total alphabet classes (a-z) + extra symbols (', ' ')
epoch = 1  # number of iterations
batch_size = 2  # number of batches

mfcc_coeffs, text_data = rnn_input_data_1.mfcc_and_text_encoding()

class DataGenerator:
    def __init__(self, data_size):
        self.ptr = 0
        self.epochs = 0
        self.data_size = data_size

    def next_batch(self):
        self.ptr += batch_size
        if self.ptr > self.data_size:
            self.epochs += 1
            self.ptr = 0
        return mfcc_coeffs[self.ptr - batch_size : self.ptr], text_data[self.ptr - batch_size : self.ptr]

def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()

def struct_network():
    print('Inside struct network !!')
    reset_graph()
    input_data = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_DATA, sound_constants.MAX_COLUMN_SIZE_IN_DATA], name="train_input")
    target = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT], name="train_output")
    keep_prob = tf.constant(1.0)
    fwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
    # creating one backward cell
    bkwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
    # creating bidirectional RNN
    val, _, _ = tf.nn.static_bidirectional_rnn(fwd_cell, bkwd_cell, tf.unstack(input_data), dtype=tf.float32)
    # adding dropouts
    val = tf.nn.dropout(val, keep_prob)
    val = tf.transpose(val, [1, 0, 2])
    last = tf.gather(val, int(val.get_shape()[0]) - 1)
    print('BiRNN created !!')
    print('Last Size: ', last.get_shape())
    weight = tf.Variable(tf.truncated_normal([num_hidden * 2, sound_constants.MAX_ROW_SIZE_IN_TXT]))
    bias = tf.Variable(tf.constant(0.1, shape=[sound_constants.MAX_ROW_SIZE_IN_TXT]))
    # mapping to 28 output classes
    logits = tf.matmul(last, weight) + bias
    prediction = tf.nn.softmax(logits)
    prediction = tf.reshape(prediction, shape=[batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT])
    # getting probability distribution
    mat1 = tf.cast(tf.argmax(prediction, 1), tf.float32)
    correct = tf.equal(prediction, target)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    logits = tf.reshape(logits, shape=[batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT])
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=target))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)
    # returning components as dictionary elements
    return {'input_data': input_data,
            'target': target,
            'dropout': keep_prob,
            'loss': loss,
            'ts': train_step,
            'preds': prediction,
            'accuracy': accuracy
            }

def train_network(graph):
    # initialize tensorflow session and all variables
    # tf_gpu_config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = True)
    # tf_gpu_config.gpu_options.allow_growth = True
    # with tf.Session(config = tf_gpu_config) as sess:
    with tf.Session() as sess:
        train_instance = DataGenerator(training_data_size)
        test_instance = DataGenerator(testing_data_size)
        print('Training data size: ', train_instance.data_size)
        print('Testing data size: ', test_instance.data_size)
        sess.run(tf.global_variables_initializer())
        print('Starting session...')
        step, accuracy = 0, 0
        tr_losses, te_losses = [], []
        current_epoch = 0
        while current_epoch < epoch:
            step += 1
            trb = train_instance.next_batch()
            feed = {g['input_data']: trb[0], g['target']: trb[1], g['dropout']: 0.6}
            accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
            accuracy += accuracy_
            if train_instance.epochs > current_epoch:
                current_epoch += 1
                tr_losses.append(accuracy / step)
                step, accuracy = 0, 0
                # eval test set
                te_epoch = test_instance.epochs
                while test_instance.epochs == te_epoch:
                    step += 1
                    print('Testing round ', step)
                    trc = test_instance.next_batch()
                    feed = {g['input_data']: trc[0], g['target']: trc[1]}
                    accuracy_ = sess.run([g['accuracy']], feed_dict=feed)[0]
                    accuracy += accuracy_
                te_losses.append(accuracy / step)
                step, accuracy = 0, 0
                print("Accuracy after epoch", current_epoch, " - tr:", tr_losses[-1], "- te:", te_losses[-1])
    return tr_losses, te_losses

g = struct_network()
tr_losses, te_losses = train_network(g)
You defined keep_prob as a tf.constant, but are then trying to feed a value into it. Replace keep_prob = tf.constant(1.0) with keep_prob = tf.placeholder(tf.float32, []) or keep_prob = tf.placeholder_with_default(1.0, []).
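A minimal sketch of the difference (TF 1.x): unlike a constant, a placeholder_with_default uses its default when nothing is fed but still accepts a feed:

import tensorflow as tf

keep_prob = tf.placeholder_with_default(1.0, shape=[])
doubled = keep_prob * 2.0

with tf.Session() as sess:
    print(sess.run(doubled))                              # 2.0, default used
    print(sess.run(doubled, feed_dict={keep_prob: 0.6}))  # 1.2, fed value used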

Plot in TensorBoard always closes on itself, like a circle

I was trying to plot a loss curve, but it is always abnormal (it just looks like a circle; I really don't know how to describe it properly in English). I found many topics about questions like this but still can't solve it. My TensorFlow version is 0.10.0.
import tensorflow as tf
from tensorflow.core.util.event_pb2 import SessionLog
import os

# initialize variables/model parameters
# define the training loop operations

def inputs():
    # read/generate input training data X and expected outputs Y
    weight_age = [[84,46],[73,20],[65,52],[70,30],[76,57],[69,25],[63,28],[72,36],[79,57],[75,44],[27,24],
                  [89,31],[65,52],[57,23],[59,60],[69,48],[60,34],[79,51],[75,50],[82,34],[59,46],[67,23],
                  [85,37],[55,40],[63,30]]
    blodd_fat_content = [354,190,405,263,451,302,288,385,402,365,209,290,346,
                         254,395,434,220,374,308,220,311,181,274,303,244]
    return tf.to_float(weight_age), tf.to_float(blodd_fat_content)

def inference(X):
    # compute inference model over data X and return the result
    return tf.matmul(X, W) + b

def loss(X, Y):
    # compute loss over training data X and expected outputs Y
    Y_predicted = inference(X)
    return tf.reduce_sum(tf.squared_difference(Y, Y_predicted))

def train(total_loss):
    # train / adjust model parameters according to computed total loss
    learning_rate = 1e-7
    return tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

def evaluate(sess, X, Y):
    # evaluate the resulting trained model
    print(sess.run(inference([[80., 25.]])))
    print(sess.run(inference([[60., 25.]])))

g1 = tf.Graph()

with tf.Session(graph=g1) as sess:
    W = tf.Variable(tf.zeros([2,1]), name="weights")
    b = tf.Variable(0., name="bias")
    tf.initialize_all_variables().run()
    X, Y = inputs()
    print(sess.run(W))
    total_loss = loss(X, Y)
    train_op = train(total_loss)
    tf.scalar_summary("loss", total_loss)
    summaries = tf.merge_all_summaries()
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    summary_writer = tf.train.SummaryWriter('linear', g1)
    summary_writer.add_session_log(session_log=SessionLog(status=SessionLog.START), global_step=1)
    # actual training loop
    training_steps = 100
    tolerance = 100
    total_loss_last = 0
    initial_step = 0
    # Create a saver.
    saver = tf.train.Saver()
    # verify if we don't have a checkpoint saved already
    ckpt = tf.train.get_checkpoint_state(os.path.dirname('my_model'))
    if ckpt and ckpt.model_checkpoint_path:
        # Restores from checkpoint
        saver.restore(sess, ckpt.model_checkpoint_path)
        initial_step = int(ckpt.model_checkpoint_path.rsplit('-', 1)[1])
        # summary_writer.add_session_log(SessionLog(status=SessionLog.START), global_step=initial_step)
    for step in range(initial_step, training_steps):
        sess.run([train_op])
        if step % 20 == 0:
            saver.save(sess, 'my-model', global_step=step)
        gap = abs(sess.run(total_loss) - total_loss_last)
        total_loss_last = sess.run(total_loss)
        summary_writer.add_summary(sess.run(summaries), step)
        # for debugging and learning purposes, see how the loss gets decremented thru training steps
        if step % 10 == 0:
            print("loss: ", sess.run([total_loss]))
            print("step: ", step)
        if gap < tolerance:
            break
    # evaluation...
    evaluate(sess, X, Y)
    coord.request_stop()
    coord.join(threads)
    saver.save(sess, 'my-model', global_step=training_steps)
    summary_writer.flush()
    sess.close()
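For what it's worth, a curve that loops back on itself usually means the event file contains summaries whose global step goes backwards, e.g. after restoring from a checkpoint while the previous run's points are still in the log. The commented-out add_session_log line above hints at the usual remedy; a sketch, using the same TF 0.10 API as the code:

# After restoring, emit a START marker at the restored step so TensorBoard
# discards stale summaries the previous run wrote after that step.
if ckpt and ckpt.model_checkpoint_path:
    saver.restore(sess, ckpt.model_checkpoint_path)
    initial_step = int(ckpt.model_checkpoint_path.rsplit('-', 1)[1])
    summary_writer.add_session_log(SessionLog(status=SessionLog.START),
                                   global_step=initial_step)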