No GPU Usage apparent in Google Cloud Vm with pytorch already installed and Cuda10

No GPU Usage apparent in Google Cloud Vm with pytorch already installed and Cuda10 - google-cloud-platform

I have been using in my machine a network, that is nothing really special. I wanted to do it faster so I started using google cloud. But I notice something weird that my machine with a GTX 1050 ti was faster than a V100 GPU. This didn't add up so I checked the usage and it seems that even though I put some stress by creating a big network and passing a lot of data to it the gpu by using a simple .cuda() in both the model and the data: there wasn't ussage shown in nvidia-smi command as shown in the image
you can check my code here:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("The device is:",device,torch.cuda.get_device_name(0),"and how many are they",torch.cuda.device_count())
# # We load the training data
Samples , Ocupancy, num_samples, Samples_per_slice = common.load_samples(args.samples_filename)
Samples = Samples * args.scaling_todo
print(Samples_per_slice)
# Divide into Slices
Organize_Positions,Orginezed_Ocupancy, batch_size = common.organize_sample_data(Samples,Ocupancy,num_samples,Samples_per_slice,args.num_batches)
phi = common.MLP(3, 1).cuda()
x_test = torch.from_numpy(Organize_Positions.astype(np.float32)).cuda()
y_test = torch.from_numpy(Orginezed_Ocupancy.astype(np.float32)).cuda()
all_data = common.CustomDataset(x_test, y_test)
#Dive into Slices the data
Slice_data = DataLoader(dataset=all_data, batch_size = batch_size, shuffle=False) # only take batch_size = n/b TODO Don't shuffle
#Chunky_data = DataLoader(dataset=Slice_data, batch_size = chunch_size, shuffle=False)
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(phi.parameters(), lr = 0.0001)
epoch = args.num_epochs
fit_start_time = time.time()
phi.train()
for epoch in range(epoch):
curr_epoch_loss = 0
batch = 0
for x_batch, y_batch in Slice_data:
optimizer.zero_grad()
x_train = x_batch
#print(x_train,batch_size)
y_train = y_batch
y_pred = phi(x_train)
#print(y_pred,x_train)
loss = criterion(y_pred.squeeze(), y_train.squeeze())
curr_epoch_loss += loss
print('Batch {}: train loss: {}'.format(batch, loss.item())) # Backward pass
loss.backward()
optimizer.step() # Optimizes only phi parameters
batch+=1
print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
fit_end_time = time.time()
print("Total time = %f" % (fit_end_time - fit_start_time))
# Save Model
torch.save({'state_dict': phi.state_dict()}, args.model_filename)
and the model here:
class MLP(nn.Module):
def __init__(self, in_dim: int, out_dim: int):
super().__init__()
self.in_dim = in_dim
self.out_dim = out_dim
self.fc1 = nn.Linear(in_dim, 128)
self.fc1_bn = nn.BatchNorm1d(128)
self.fc2 = nn.Linear(128, 256)
self.fc2_bn = nn.BatchNorm1d(256)
self.fc3 = nn.Linear(256, 512)
self.fc3_bn = nn.BatchNorm1d(512)
self.fc4 = nn.Linear(512, 512)
self.fc4_bn = nn.BatchNorm1d(512)
self.fc5 = nn.Linear(512, out_dim,bias=False)
self.relu = nn.LeakyReLU()
def forward(self, x):
x = self.relu(self.fc1_bn(self.fc1(x)))
x = self.relu(self.fc2_bn(self.fc2(x)))# leaky
x = self.relu(self.fc3_bn(self.fc3(x)))
x = self.relu(self.fc4_bn(self.fc4(x)))
x = self.fc5(x)
return x
class CustomDataset(Dataset):
def __init__(self, x_tensor, y_tensor):
self.x = x_tensor
self.y = y_tensor
def __getitem__(self, index):
return (self.x[index], self.y[index])
def __len__(self):
return len(self.x)

Related

How to return calculations to a template for all objects in a class model in Django

I am trying to build a simple form which calculates which machine would run a film width the quickest, the parameters and capabilities of each machine are held in a django model.
The width of the film and how much of it will be entered in the form and the quantity needed. The function should work out which machine(s) can run it, what the max speed is and the average speed over the machines that are capable.
I want to return the values of the calculation and maybe run a for loop and display the values for each machine in a results.html template in a table. I also want to display average times across machines capable of running the widths of film.
I had some success with lists but would like to use a class that I can use in the template and do away with the lists.
Any help with this would be much appreciated as I am getting pretty lost in it!
I have only started on the 'Layflat Tubing' function in the hope that I can get it right and just copy down to the other functions.
from django.views.generic.base import TemplateView
from django.shortcuts import render
import math, datetime
from settings.models import Extruder
class Result:
def __init__(self, ext_no, width, speed=0, ):
self.ext_no = ext_no
self.width = width
self.speed = speed
def __str__(self):
return self.ext_no
extruders = Extruder.objects.all()
class FilmSpeedView(TemplateView):
template_name = 'calculations/film-speed.html'
class BagWeightView(TemplateView):
template_name = 'calculations/bag-weight.html'
class CalculatorsView(TemplateView):
template_name = 'calculations/calculators.html'
def result(request):
film_type=''
film_width=''
measure=''
speeds = [0]
quantity = 0
max_speed = 0
ave_speed = 0
ave_time = 0
max_time = 0
a=[]
b=[]
c=[]
d=[]
e=[]
if request.method=="GET":
film_type = str(request.GET["film_type"])
film_width = int(request.GET["film_width"])
edge_trim = int(request.GET["edge_trim"])
quantity =int(request.GET["quantity"])
measure = str(request.GET["measure"])
if measure == "metric":
film_width = int(film_width)
else:
film_width = film_width * 25.4
if edge_trim is None:
edge_trim = 0
else:
edge_trim = int(edge_trim)
if str(film_type) == 'Layflat Tubing':
film_type = "LFT"
for extruder in extruders:
bur = film_width / extruder.die_size
if film_width < extruder.min_width:
b.append(extruder.name + ' : Film too narrow')
extruder = Result(ext_no = extruder.ext_no, width = 'too narrow')
elif film_width > extruder.max_width:
b.append(extruder.name + ' : Film too wide')
extruder = Result(ext_no = extruder.ext_no, width = 'too wide')
else:
percentage = film_width / extruder.max_width
speed = extruder.max_kgs_hr * percentage
extruder = Result(ext_no = extruder.ext_no, speed = round(extruder.max_kgs_hr * percentage, 2), width = 'ok')
speeds.append(speed)
max_speed = max(speeds)
ave_speed = sum(speeds) / len(speeds)
ave_time = float(quantity) / ave_speed * 60.0
max_time = float(quantity) / max_speed * 60.0
else:
film_type = "Invalid Film Type"
m = a
n = b
o = c
g = str(round(ave_speed, 2)) + 'kg\'s/h'
h = str(datetime.timedelta(minutes=ave_time))
i = str(datetime.timedelta(minutes=30))
j = str(round(max_speed, 2)) + 'kg\'s/h'
k = str(datetime.timedelta(minutes=max_time))
return render(request, 'calculations/result.html', {'a':a, 'b':b, 'c':c, 'd':d, 'e':e, 'g':g, 'h':h, 'i':i, 'j':j, 'k':k, 'm':m, 'n':n, 'o':o, 'bur':bur,})

Is this method of calculating the top-5 accuracy in pytorch correct?

I am trying to validate the findings of a paper by testing it on the same model architecture as well as the same dataset reported by the paper. I have been using the imagenet script provided in the official pytorch repository's examples section to do the same.
class AverageMeter(object):
"""Computes and stores the average and current value
Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
"""
def init(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def accuracy(output, target, topk=(1,)):
"""Computes the precision#k for the specified values of k"""
maxk = max(topk)
batchsize = target.size(0)
, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].reshape(-1).float().sum(0)
res.append(correctk.mul(100.0 / batch_size))
return res
top1 = AverageMeter()
top5 = AverageMeter()
# switch to evaluate mode
model.eval()
with torch.no_grad():
for batch_idx, (inputs, targets) in enumerate(test_loader):
# measure data loading time
print(f"Processing {batch_idx+1}/{len(test_loader)}")
inputs, targets = inputs.cuda(), targets.cuda()
inputs, targets = torch.autograd.Variable(inputs, volatile=True), torch.autograd.Variable(targets)
# compute output
outputs = model(inputs)
# measure accuracy and record loss
prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
print(prec1,prec5)
top1.update(prec1.item(), inputs.size(0))
top5.update(prec5.item(), inputs.size(0))
print(top1)
print(top5)
However the top 5 error which I am getting by using this script is not matching with the one in the paper. Can anyone tell me what is wrong in this particular snippet?

My neural network takes too much time to train one epoch

I am training a neural network which tries to classify a traffic signs, but it takes too much time to train only one epoch, maybe 30+ mins for just one epoch, I have set the batch size to 64 and the learning rate to be 0.002, the input is 20x20 pixels with 3 channels, and the model summary shows that it is training 173,931 parameters, is that too much or good?
Here is the network architecture
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
class Network(nn.Module):
def __init__(self):
super(Network,self).__init__()
#Convolutional Layers
self.conv1 = nn.Conv2d(3,16,3,padding=1)
self.conv2 = nn.Conv2d(16,32,3,padding=1)
#Max Pooling Layers
self.pool = nn.MaxPool2d(2,2)
#Linear Fully connected layers
self.fc1 = nn.Linear(32*5*5,200)
self.fc2 = nn.Linear(200,43)
#Dropout
self.dropout = nn.Dropout(p=0.25)
def forward(self,x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(-1,32*5*5)
x = self.dropout(x)
x = F.relu(self.fc1(x))
x = self.dropout(x)
x = self.fc2(x)
return x
Here is the optimizer instance
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optim = optim.SGD(model.parameters(),lr = 0.002)
Here is the training code
epochs = 20
valid_loss_min = np.Inf
print("Training the network")
for epoch in range (1,epochs+1):
train_loss = 0
valid_loss = 0
model.train()
for data,target in train_data:
if gpu_available:
data,target = data.cuda(),target.cuda()
optim.zero_grad()
output = model(data)
loss = criterion(output,target)
loss.backward()
optim.step()
train_loss += loss.item()*data.size(0)
#########################
###### Validate #########
model.eval()
for data,target in valid_data:
if gpu_available:
data,target = data.cuda(),target.cuda()
output = model(data)
loss = criterion(output,target)
valid_loss += loss.item()*data.size(0)
train_loss = train_loss/len(train_data.dataset)
valid_loss = train/len(valid_data.dataset)
print("Epoch {}.....Train Loss = {:.6f}....Valid Loss = {:.6f}".format(epoch,train_loss,valid_loss))
if valid_loss < valid_loss_min:
torch.save(model.state_dict(), 'model_traffic.pt')
print("Valid Loss min {:.6f} >>> {:.6f}".format(valid_loss_min, valid_loss))
I am using GPU through google colab

Custom Loss Function becomes zero when backpropagated

I am trying to write my own custom loss function that is based on the false positive and negative rates. I made a dummy code so you can check the first 2 defenitions as well. I added the rest, so you can see how it is implemented. However, still somewhere the gradient turns out to be zero. What is now the step where the gradient turns zero, or how can I check this? Please I would like to know how I can fix this :).
I tried providing you with more information so you can play around as well, but if you miss anything please do let me know!
The gradient stays True during every step. However, still during the training of the model the loss is not updated, therefore the NN does not train.
y = Variable(torch.tensor((0, 0, 0, 1, 1,1), dtype=torch.float), requires_grad = True)
y_pred = Variable(torch.tensor((0.333, 0.2, 0.01, 0.99, 0.49, 0.51), dtype=torch.float), requires_grad = True)
x = Variable(torch.tensor((0, 0, 0, 1, 1,1), dtype=torch.float), requires_grad = True)
x_pred = Variable(torch.tensor((0.55, 0.25, 0.01, 0.99, 0.65, 0.51), dtype=torch.float), requires_grad = True)
def binary_y_pred(y_pred):
y_pred.register_hook(lambda grad: print(grad))
y_pred = y_pred+torch.tensor(0.5, requires_grad=True, dtype=torch.float)
y_pred = y_pred.pow(5) # this is my way working around using torch.where()
y_pred = y_pred.pow(10)
y_pred = y_pred.pow(15)
m = nn.Sigmoid()
y_pred = m(y_pred)
y_pred = y_pred-torch.tensor(0.5, requires_grad=True, dtype=torch.float)
y_pred = y_pred*2
y_pred.register_hook(lambda grad: print(grad))
return y_pred
def confusion_matrix(y_pred, y):
TP = torch.sum(y*y_pred)
TN = torch.sum((1-y)*(1-y_pred))
FP = torch.sum((1-y)*y_pred)
FN = torch.sum(y*(1-y_pred))
k_eps = torch.tensor(1e-12, requires_grad=True, dtype=torch.float)
FN_rate = FN/(TP + FN + k_eps)
FP_rate = FP/(TN + FP + k_eps)
return FN_rate, FP_rate
def dif_rate(FN_rate_y, FN_rate_x):
dif = (FN_rate_y - FN_rate_x).pow(2)
return dif
def custom_loss_function(y_pred, y, x_pred, x):
y_pred = binary_y_pred(y_pred)
FN_rate_y, FP_rate_y = confusion_matrix(y_pred, y)
x_pred= binary_y_pred(x_pred)
FN_rate_x, FP_rate_x = confusion_matrix(x_pred, x)
FN_dif = dif_rate(FN_rate_y, FN_rate_x)
FP_dif = dif_rate(FP_rate_y, FP_rate_x)
cost = FN_dif+FP_dif
return cost
# I added the rest so you can see how it is implemented, but this peace does not fully run well! If you want this part to run as well, I can add more code.
class FeedforwardNeuralNetModel(nn.Module):
def __init__(self, input_dim, hidden_dim, output_dim):
super(FeedforwardNeuralNetModel, self).__init__()
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.relu1 = nn.ReLU()
self.fc2 = nn.Linear(hidden_dim, output_dim)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
out = self.fc1(x)
out = self.relu1(out)
out = self.fc2(out)
out = self.sigmoid(out)
return out
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=[0.9, 0.99], amsgrad=True)
criterion = torch.nn.BCELoss(weight=None, size_average=None, reduce=None, reduction='mean')
for epoch in range(num_epochs):
train_err = 0
for i, (samples, truths) in enumerate(train_loader):
samples = Variable(samples)
truths = Variable(truths)
optimizer.zero_grad() # Reset gradients
outputs = model(samples) # Do the forward pass
loss2 = criterion(outputs, truths) # Calculate loss
samples_y = Variable(samples_y)
samples_x = Variable(samples_x)
y_pred = model(samples_y)
y = Variable(y, requires_grad=True)
x_pred = model(samples_x)
x= Variable(x, requires_grad=True)
cost = custom_loss_function(y_pred, y, x_pred, x)
loss = loss2*0+cost #checking only if cost works.
loss.backward()
optimizer.step()
train_err += loss.item()
train_loss.append(train_err)
I expect the model to update during training. There is no error message.

With your definitions:TP+FN=y and TN+FP=1-y. Then you'll get FN_rate=1-y_pred and FP_rate=y_pred. Your cost is then FN_rate+FP_rate=1, the gradient of which is 0.
You can check this by hand or using a library for symbolic mathematics (e.g., SymPy):
from sympy import symbols
y, y_pred = symbols("y y_pred")
TP = y * y_pred
TN = (1-y)*(1-y_pred)
FP = (1-y)*y_pred
FN = y*(1-y_pred)
# let's ignore the eps for now
FN_rate = FN/(TP + FN)
FP_rate = FP/(TN + FP)
cost = FN_rate + FP_rate
from sympy import simplify
print(simplify(cost))
# output: 1

Plot in tensorboard is always closes and like a circle

I was trying to plot a loss curve, but is always abnormal (just like a circle, I really don't know how to describe it in English properly), I had found many topics about question like this and just can't solve, my tensorflow version is 0.10.0.
import tensorflow as tf
from tensorflow.core.util.event_pb2 import SessionLog
import os
# initialize variables/model parameters
# define the training loop operations
def inputs():
# read/generate input training data X and expected outputs Y
weight_age = [[84,46],[73,20],[65,52],[70,30],[76,57],[69,25],[63,28],[72,36],[79,57],[75,44],[27,24]
,[89,31],[65,52],[57,23],[59,60],[69,48],[60,34],[79,51],[75,50],[82,34],[59,46],[67,23],
[85,37],[55,40],[63,30]]
blodd_fat_content = [354,190,405,263,451,302,288,385,402,365,209,290,346,
254,395,434,220,374,308,220,311,181,274,303,244]
return tf.to_float(weight_age), tf.to_float(blodd_fat_content)
def inference(X):
# compute inference model over data X and return the result
return tf.matmul(X, W) + b
def loss(X, Y):
# compute loss over training data X and expected outputs Y
Y_predicted = inference(X)
return tf.reduce_sum(tf.squared_difference(Y, Y_predicted))
def train(total_loss):
# train / adjust model parameters according to computed total loss
learning_rate = 1e-7
return tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)
def evaluate(sess, X, Y):
# evaluate the resulting trained model
print (sess.run(inference([[80., 25.]])))
print (sess.run(inference([[60., 25.]])))
g1 = tf.Graph()
with tf.Session(graph=g1) as sess:
W = tf.Variable(tf.zeros([2,1]), name="weights")
b = tf.Variable(0., name="bias")
tf.initialize_all_variables().run()
X, Y = inputs()
print (sess.run(W))
total_loss = loss(X, Y)
train_op = train(total_loss)
tf.scalar_summary("loss", total_loss)
summaries = tf.merge_all_summaries()
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
summary_writer = tf.train.SummaryWriter('linear', g1)
summary_writer.add_session_log(session_log= SessionLog(status=SessionLog.START), global_step=1)
# actual training loop
training_steps = 100
tolerance = 100
total_loss_last = 0
initial_step = 0
# Create a saver.
saver = tf.train.Saver()
# verify if we don't have a checkpoint saved already
ckpt = tf.train.get_checkpoint_state(os.path.dirname('my_model'))
if ckpt and ckpt.model_checkpoint_path:
# Restores from checkpoint
saver.restore(sess, ckpt.model_checkpoint_path)
initial_step = int(ckpt.model_checkpoint_path.rsplit('-', 1)[1])
# summary_writer.add_session_log(SessionLog(status=SessionLog.START), global_step=initial_step)
for step in range(initial_step, training_steps):
sess.run([train_op])
if step%20 == 0:
saver.save(sess, 'my-model', global_step=step)
gap = abs(sess.run(total_loss) - total_loss_last)
total_loss_last = sess.run(total_loss)
summary_writer.add_summary(sess.run(summaries), step)
# for debugging and learning purposes, see how the loss gets decremented thru training steps
if step % 10 == 0:
print ("loss: ", sess.run([total_loss]))
print("step: ", step)
if gap < tolerance:
break
# evaluation...
evaluate(sess, X, Y)
coord.request_stop()
coord.join(threads)
saver.save(sess, 'my-model', global_step=training_steps)
summary_writer.flush()
sess.close()

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

No GPU Usage apparent in Google Cloud Vm with pytorch already installed and Cuda10 - google-cloud-platform

Related

How to return calculations to a template for all objects in a class model in Django

Is this method of calculating the top-5 accuracy in pytorch correct?

My neural network takes too much time to train one epoch

Custom Loss Function becomes zero when backpropagated

Plot in tensorboard is always closes and like a circle

Categories

Resources