In this example, I am trying to create a 10x8 array using values from a 10x9 array. It looks like I am accessing memory incorrectly, but I am not sure where my error is.
The code in C++ would be something like this:
for (int h = 0; h < height; h++){
for (int i = 0; i < (width-2); i++)
dd[h*(width-2)+i] = hi[h*(width-1)+i] + hi[h*(width-1)+i+1];
}
This is what I am trying in CUDA:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <iostream>
#define TILE_WIDTH 4
using namespace std;
__global__ void cudaOffsetArray(int height, int width, float *HI, float *DD){
int x = blockIdx.x * blockDim.x + threadIdx.x; // Col // width
int y = blockIdx.y * blockDim.y + threadIdx.y; // Row // height
int grid_width = gridDim.x * blockDim.x;
//int index = y * grid_width + x;
if ((x < (width - 2)) && (y < (height)))
DD[y * (grid_width - 2) + x] = (HI[y * (grid_width - 1) + x] + HI[y * (grid_width - 1) + x + 1]);
}
int main(){
int height = 10;
int width = 10;
float *HI = new float [height * (width - 1)];
for (int i = 0; i < height; i++){
for (int j = 0; j < (width - 1); j++)
HI[i * (width - 1) + j] = 1;
}
float *gpu_HI;
float *gpu_DD;
cudaMalloc((void **)&gpu_HI, (height * (width - 1) * sizeof(float)));
cudaMalloc((void **)&gpu_DD, (height * (width - 2) * sizeof(float)));
cudaMemcpy(gpu_HI, HI, (height * (width - 1) * sizeof(float)), cudaMemcpyHostToDevice);
dim3 dimGrid((width * 2 - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
cudaOffsetArray<<<dimGrid,dimBlock>>>(height, width, gpu_HI, gpu_DD);
float *result = new float[height * (width - 2)];
cudaMemcpy(result, gpu_DD, (height * (width - 2) * sizeof(float)), cudaMemcpyDeviceToHost);
for (int i = 0; i < height; i++){
for (int j = 0; j < (width - 2); j++)
cout << result[i * (width - 2) + j] << " ";
cout << endl;
}
cudaFree(gpu_HI);
cudaFree(gpu_DD);
delete[] result;
delete[] HI;
system("pause");
}
I've also tried this in the global function:
if ((x < (width - 2)) && (y < (height)))
DD[y * (grid_width - 2) + (blockIdx.x - 2) * blockDim.x + threadIdx.x] =
(HI[y * (grid_width - 1) + (blockIdx.x - 1) * blockDim.x + threadIdx.x] +
HI[y * (grid_width - 1) + (blockIdx.x - 1) * blockDim.x + threadIdx.x + 1]);
To "fix" your code, change each use of grid_width to width in this line in your kernel:
DD[y * (grid_width - 2) + x] = (HI[y * (grid_width - 1) + x] + HI[y * (grid_width - 1) + x + 1]);
Like this:
DD[y * (width - 2) + x] = (HI[y * (width - 1) + x] + HI[y * (width - 1) + x + 1]);
Explanation:
Your grid_width:
dim3 dimGrid((width * 2 - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
doesn't actually correspond to your array size (10x10, or 10x9, or 10x8). I'm not sure why you're launching 2*width threads in the x dimension, but this means that your thread array is considerably larger than your data array.
So when you use grid_width in the kernel:
DD[y * (grid_width - 2) + x] = (HI[y * (grid_width - 1) + x] + HI[y * (grid_width - 1) + x + 1]);
the indexing will be a problem. If you instead change each instance of grid_width above to just width (which corresponds to the actual width of your data array) you'll get better indexing, I think. Normally it's not an issue to launch "extra threads" because you have a thread check line in your kernel:
if ((x < (width - 2)) && (y < (height)))
but when you launch extra threads, it is making your grid larger, and so you can't use grid dimensions to index properly into your data array.
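Putting it together, here is a minimal sketch of the kernel with the indexing based on the data width rather than the grid width (same launch configuration and variable names as in your code, only the index arithmetic changed):
__global__ void cudaOffsetArray(int height, int width, float *HI, float *DD){
    int x = blockIdx.x * blockDim.x + threadIdx.x; // column in the data array
    int y = blockIdx.y * blockDim.y + threadIdx.y; // row in the data array
    // HI is height x (width - 1) and DD is height x (width - 2), so index with those widths
    if ((x < (width - 2)) && (y < height))
        DD[y * (width - 2) + x] = HI[y * (width - 1) + x] + HI[y * (width - 1) + x + 1];
}
Extra threads beyond the data dimensions simply fail the bounds check and do nothing.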
Based on my study, there are two different strategies to implement a tiled version of convolution with CUDA. I want to know more about this: how the two compare with each other, what the advantages and disadvantages of each strategy are, and how to choose between them. Below are the implementations of the two strategies.
Strategy 1: the tile size matches the output size, and multiple loading steps are needed to fill the input tile.
#define MASK_WIDTH 3
#define MASK_RADIUS 1
#define TILE_WIDTH 8
#define SHAREDMEM_DIM (TILE_WIDTH + (MASK_RADIUS * 2))
__constant__ float deviceMask[MASK_WIDTH * MASK_WIDTH * MASK_WIDTH];
__global__ void conv3d(float *inputArray,
float *outputArray,
const int z_size,
const int y_size,
const int x_size) {
__shared__ float subTile[SHAREDMEM_DIM][SHAREDMEM_DIM][SHAREDMEM_DIM];
int bx = blockIdx.x, tx = threadIdx.x;
int by = blockIdx.y, ty = threadIdx.y;
int bz = blockIdx.z, tz = threadIdx.z;
int destination = (tz * TILE_WIDTH * TILE_WIDTH) + (ty * TILE_WIDTH) + tx;
int destTmp = destination;
int dX = destTmp % SHAREDMEM_DIM;
destTmp = destTmp / SHAREDMEM_DIM;
int dY = destTmp % SHAREDMEM_DIM;
destTmp = destTmp / SHAREDMEM_DIM;
int dZ = destTmp;
int inputZ = dZ + (bz * TILE_WIDTH) - MASK_RADIUS;
int inputY = dY + (by * TILE_WIDTH) - MASK_RADIUS;
int inputX = dX + (bx * TILE_WIDTH) - MASK_RADIUS;
int input = (inputZ * y_size * x_size) + (inputY * x_size) + inputX;
if( inputZ >= 0 && inputZ < z_size
&& inputY >= 0 && inputY < y_size
&& inputX >= 0 && inputX < x_size){
subTile[dZ][dY][dX] = inputArray[input];
}
else{
subTile[dZ][dY][dX] = 0;
}
destination = TILE_WIDTH * TILE_WIDTH * TILE_WIDTH
+ (tz * TILE_WIDTH * TILE_WIDTH) + (ty * TILE_WIDTH) + tx;
destTmp = destination;
dX = destTmp % SHAREDMEM_DIM;
destTmp = destTmp / SHAREDMEM_DIM;
dY = destTmp % SHAREDMEM_DIM;
destTmp = destTmp / SHAREDMEM_DIM;
dZ = destTmp;
inputZ = dZ + (bz * TILE_WIDTH) - MASK_RADIUS;
inputY = dY + (by * TILE_WIDTH) - MASK_RADIUS;
inputX = dX + (bx * TILE_WIDTH) - MASK_RADIUS;
input = (inputZ * y_size * x_size) + (inputY * x_size) + inputX;
if(dZ < SHAREDMEM_DIM){
if( inputZ >= 0 && inputZ < z_size
&& inputY >= 0 && inputY < y_size
&& inputX >= 0 && inputX < x_size ) {
subTile[dZ][dY][dX] = inputArray[input];
}
else{
subTile[dZ][dY][dX] = 0;
}
}
__syncthreads();
float sum = 0;
int z, y, x;
for(z = 0; z < MASK_WIDTH; z++){
for(y = 0; y < MASK_WIDTH; y++){
for(x = 0; x < MASK_WIDTH; x++){
sum += subTile[tz + z][ty + y][tx + x]
* deviceMask[x + (y * MASK_WIDTH) + (z * MASK_WIDTH * MASK_WIDTH)];
}
}
}
z = tz + (bz * TILE_WIDTH);
y = ty + (by * TILE_WIDTH);
x = tx + (bx * TILE_WIDTH);
if(z < z_size && y < y_size && x < x_size){
outputArray[x + (y * x_size) + (z * y_size * x_size)] = sum;
}
__syncthreads();
}
The second strategy is to set the block size to match the input tile. When calculating the output, some of the threads are turned off.
#define TILE_X 14
#define TILE_Y 6
#define TILE_Z 6
#define MASK_WIDTH 3
#define MASK_SIZE (MASK_WIDTH * MASK_WIDTH * MASK_WIDTH)
__constant__ float mask[MASK_WIDTH][MASK_WIDTH][MASK_WIDTH];
__global__ void conv3d(float *input, float *output, const int z_size, const int y_size, const int x_size) {
__shared__ float inputTile [TILE_Z+MASK_WIDTH-1][TILE_Y+MASK_WIDTH-1][TILE_X+MASK_WIDTH-1];
int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z;
int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z;
int x_o = bx * TILE_X + tx;
int y_o = by * TILE_Y + ty;
int z_o = bz * TILE_Z + tz;
int x_i = x_o - MASK_WIDTH/2;
int y_i = y_o - MASK_WIDTH/2;
int z_i = z_o - MASK_WIDTH/2;
if (x_i >= 0 && y_i >= 0 && z_i >= 0 && x_i < x_size && y_i < y_size && z_i < z_size)
inputTile[tz][ty][tx] = input[(z_i * y_size + y_i) * x_size + x_i];
else
inputTile[tz][ty][tx] = 0.0;
__syncthreads();
float acc = 0.0;
if(tz < TILE_Z && ty < TILE_Y && tx < TILE_X) {
for(int z_mask = 0; z_mask < MASK_WIDTH; z_mask++) {
for(int y_mask = 0; y_mask < MASK_WIDTH; y_mask++) {
for(int x_mask = 0; x_mask < MASK_WIDTH; x_mask++) {
acc += mask[z_mask][y_mask][x_mask] *
inputTile[tz+z_mask][ty+y_mask][tx+x_mask];
}
}
}
if(z_o < z_size && y_o < y_size && x_o < x_size)
output[(z_o * y_size + y_o) * x_size + x_o] = acc;
}
}
Any ideas about how to choose between these? In addition, which version is used more often in practice, for example in deep learning? Also, if you have any comments on the code, please let me know!
The general answer whenever it comes to the question of "which is faster?" is always: measure how fast each approach runs in your application scenario to find out. In this case, I would say the first approach seems preferable most of the time (if you had to pick one of the two options for some reason). Unless you have very tiny convolution kernels, the second approach will have lots of threads sitting idle during the part that does most of the actual work. Be sure to avoid bank conflicts within your tiles, and think about the memory access patterns your warps produce when moving data to and from global memory.
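To make that difference concrete, here is my assumption of how each kernel above would naturally be launched (the host code was not shown, and the two kernels would need different names or separate files since both are called conv3d): strategy 1 uses a block the size of the output tile, strategy 2 a block the size of the input tile.
// Strategy 1: the block covers the output tile (8 x 8 x 8 = 512 threads);
// each thread loads up to two input elements into shared memory.
dim3 block1(TILE_WIDTH, TILE_WIDTH, TILE_WIDTH);
dim3 grid1((x_size + TILE_WIDTH - 1) / TILE_WIDTH,
           (y_size + TILE_WIDTH - 1) / TILE_WIDTH,
           (z_size + TILE_WIDTH - 1) / TILE_WIDTH);
conv3d<<<grid1, block1>>>(deviceInput, deviceOutput, z_size, y_size, x_size);
// Strategy 2: the block covers the input tile (16 x 8 x 8 = 1024 threads);
// each thread loads one element, and the halo threads sit idle during the computation.
dim3 block2(TILE_X + MASK_WIDTH - 1, TILE_Y + MASK_WIDTH - 1, TILE_Z + MASK_WIDTH - 1);
dim3 grid2((x_size + TILE_X - 1) / TILE_X,
           (y_size + TILE_Y - 1) / TILE_Y,
           (z_size + TILE_Z - 1) / TILE_Z);
conv3d<<<grid2, block2>>>(deviceInput, deviceOutput, z_size, y_size, x_size);
Here deviceInput and deviceOutput are assumed device pointers allocated and filled elsewhere. With strategy 2, only TILE_X * TILE_Y * TILE_Z = 504 of the 1024 threads per block participate in the actual convolution sum.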
In the end, convolution is basically just computing sums over all possible combinations of kernel coefficients and input elements. Since the workload is essentially just repeatedly fetching these values in some order, convolution is almost necessarily going to be bandwidth-limited. Thus, doing convolution efficiently comes down to optimizing memory access and reducing the amount of memory traffic as much as possible.
[…] which version is used more often in practice, like in deep learning?
Neither. The naïve approach of throwing nested loops at it to brute-force convolution in the spatial domain is almost never an efficient way of computing convolutions. Convolution is such a fundamental operation for so many things that it has been studied extensively. There are literally hundreds, if not thousands, of papers and books you could read on the subject. In deep learning, the problem of convolution has commonly been formulated in terms of general matrix multiplications (GEMMs), since this approach leads to rather nice memory access patterns and many efficient GEMM implementations are available for the GPU. FFT-based approaches as well as other algorithms are also increasingly used, depending on the application.
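To make the GEMM formulation concrete, here is a minimal CPU sketch of the usual im2col lowering for a single-channel 2D case (my own illustration, not taken from any particular library): each output position becomes one row of a patch matrix, and the convolution reduces to a matrix product with the flattened filter.
#include <vector>

// Valid (no padding) convolution via im2col; like deep-learning frameworks this does
// not flip the kernel, i.e. it computes cross-correlation.
std::vector<float> conv2d_as_gemm(const std::vector<float> &in, int H, int W,
                                  const std::vector<float> &kernel, int K)
{
    int outH = H - K + 1, outW = W - K + 1;
    // Build the im2col matrix: (outH*outW) rows, (K*K) columns, one input patch per row.
    std::vector<float> cols((size_t)outH * outW * K * K);
    for (int oy = 0; oy < outH; oy++)
        for (int ox = 0; ox < outW; ox++)
            for (int ky = 0; ky < K; ky++)
                for (int kx = 0; kx < K; kx++)
                    cols[((oy * outW + ox) * K + ky) * K + kx] = in[(oy + ky) * W + (ox + kx)];
    // "GEMM" step: with a single filter this is just a matrix-vector product.
    std::vector<float> out((size_t)outH * outW, 0.0f);
    for (int r = 0; r < outH * outW; r++)
        for (int c = 0; c < K * K; c++)
            out[r] += cols[(size_t)r * K * K + c] * kernel[c];
    return out;
}
With C input channels and F filters, the patch matrix becomes (outH*outW) x (C*K*K) and the filter matrix (C*K*K) x F, which is exactly the shape a GEMM library expects.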
(This question was closed as a duplicate of: CUDA-Kernel supposed to be dynamic crashes depending upon block size.)
I want to implement a simple matrix multiplication in CUDA. The dimensions of the matrices are determined at runtime, and I also want to use shared memory in order to gain a performance boost. I have implemented such a function, but every time I run it, I get this error:
mulKernel launch failed: an illegal memory access was encountered
I am also not sure if I can use malloc to allocate shared memory. However, if I want to use something like this
__shared__ float matrM_sm[tile_width][tile_width];
the compiler complains that tile_width has to be known at compile time...
I have tried everything I can think of, as well as various suggestions, but none of them worked. This is the function (the full working file can be found HERE):
__global__ void mulKernelSM(float *matrR, const float *matrM, const float *matrN,
const int m_x, const int m_y, const int n_x, const int n_y, const int tile_width)
{
int i, j;
extern __shared__ float shared[];
float *matrM_sm = shared;
float *matrN_sm = &shared[tile_width * tile_width];
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row = by * tile_width + ty;
int col = bx * tile_width + tx;
float tmp;
int limit = ceil(m_y / (float) tile_width);
for (i = 0; i < limit; i++)
{
tmp = 0.0;
if (i * tile_width + tx < m_y && row < m_x)
matrM_sm[ty * tile_width + tx] = matrM[row * m_y + (i * tile_width + tx)];
else
matrM_sm[ty * tile_width + tx] = 0.0;
if (i * tile_width + ty < n_x && col < n_y)
matrN_sm[ty * tile_width + tx] = matrN[col + (i * tile_width + ty) * n_y];
else
matrN_sm[ty * tile_width + tx] = 0.0;
__syncthreads();
for (j = 0; j < tile_width; j++)
tmp += matrM_sm[ty * tile_width + j] * matrN_sm[j * tile_width + tx];
__syncthreads();
}
if (row < m_x && col < n_y)
matrR[row * n_y + col] = tmp;
}
The basic layout should work as I have also implemented a version without shared memory which works just fine. The function without shared memory is listed below:
__global__ void mulKernel(float *matrR, const float *matrM, const float *matrN,
const int m_x, const int m_y, const int n_x, const int n_y)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int i;
if ((row < m_x) && (col < n_y))
{
float tmp = 0.0;
for (i = 0; i < m_y; i++)
{
tmp += matrM[row * m_y + i] * matrN[col + n_y * i];
}
matrR[row * n_y + col] = tmp;
}
}
If there is any information missing, I will provide it immediately.
You swapped row and col. Furthermore, I believe that to get the global thread index you should rather do this: int x_global = threadIdx.x + blockDim.x * threadIdx.y
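As a side note on the shared-memory part of the question: with extern __shared__ the byte count has to be passed as the third launch configuration parameter, and blockDim.x and blockDim.y have to equal tile_width, otherwise the kernel reads and writes past the allocation, which is one common cause of the "illegal memory access" error. A minimal launch sketch (the host-side variable names are mine, not from the posted file):
int tile_width = 16;                                    // must match blockDim.x and blockDim.y
dim3 block(tile_width, tile_width);
dim3 grid((n_y + tile_width - 1) / tile_width,          // result columns
          (m_x + tile_width - 1) / tile_width);         // result rows
size_t shmemBytes = 2 * tile_width * tile_width * sizeof(float);   // matrM_sm + matrN_sm
mulKernelSM<<<grid, block, shmemBytes>>>(d_matrR, d_matrM, d_matrN, m_x, m_y, n_x, n_y, tile_width);
Plain malloc cannot allocate shared memory; it has to come either from this dynamic launch parameter or from a statically sized __shared__ array (for example with tile_width as a template parameter or a #define).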
I am trying to generate Perlin noise for a math essay for school, and I have some difficulties figuring out the math behind it. This is my Perlin class. The Perlin noise function generates (should generate) a number between 0 and 1, which I then multiply by 255 to apply a color to every pixel on the screen. Please help!
#include "perlinnoise.h"
perlinnoise::perlinnoise()
{
srand(time(NULL));
double random = rand() % 1000;
for (int i = 0; i < (651 * 2); i = i + 2)
{
random = (rand() % 1000);
vecGrad[i] = random / 1000;
vecGrad[i + 1] = vecGrad[i];
vecGrad[i] = cos(vecGrad[i] * 2 * 3.1416);
vecGrad[i + 1] = sin(vecGrad[i + 1] * 2 * 3.1416);
}
}
int perlinnoise::perlinNoise(int x, int y)
{
//20 pixel in each case
//30 boxes in width and 20 boxes in height
//651 vectors to create
sf::Vector2i boxXY;
boxXY.x = ((x / 20));
boxXY.y = ((y / 20));
sf::Vector2i displacement1; displacement1.x = x - boxXY.x * 20; displacement1.y = y - boxXY.y * 20;
sf::Vector2i displacement2; displacement2.x = x - (boxXY.x * 20 + 20); displacement2.y = y - boxXY.y * 20;
sf::Vector2i displacement3; displacement3.x = x - boxXY.x * 20; displacement3.y = y - (boxXY.y * 20 + 20);
sf::Vector2i displacement4; displacement4.x = x - (boxXY.x * 20 + 20); displacement4.y = y - (boxXY.y * 20 + 20);
/*std::cout << displacement1.x << std::endl; std::cout << displacement1.y << std::endl;
std::cout << displacement2.x << std::endl; std::cout << displacement2.y << std::endl;
std::cout << displacement3.x << std::endl; std::cout << displacement3.y << std::endl;
std::cout << displacement4.x << std::endl; std::cout << displacement4.y << std::endl;*/
double dotP1 = (vecGrad[((boxXY.y * 30) + boxXY.x)] * displacement1.x) + (vecGrad[(boxXY.y * 30) + boxXY.x + 1] * displacement1.y);
double dotP2 = (vecGrad[((boxXY.y * 30) + boxXY.x + 3)] * displacement2.x) + (vecGrad[(boxXY.y * 30) + boxXY.x + 4] * displacement2.y);
double dotP3 = (vecGrad[((boxXY.y * 30 + 1) + boxXY.x)] * displacement3.x) + (vecGrad[(boxXY.y * 30) + boxXY.x + 1] * displacement3.y);
double dotP4 = (vecGrad[((boxXY.y * 30 + 1) + boxXY.x + 3)] * displacement4.x) + (vecGrad[(boxXY.y * 30) + boxXY.x + 4] * displacement4.y);
This is where I have some troubles (I think):
int intensity = 0;
double Sx = (3 * (x - boxXY.x * 20) * (x - boxXY.x * 20)) - (2 * (x - boxXY.x * 20) * (x - boxXY.x * 20) * (x - boxXY.x * 20));
double Sy = (3 * (y - boxXY.y * 20) * (y - boxXY.y * 20)) - (2 * (y - boxXY.y * 20) * (y - boxXY.y * 20) * (y - boxXY.y * 20));
double a = dotP1 + (Sx * (dotP2 - dotP1));
double b = dotP3 + (Sx * (dotP4 - dotP3));
double aa = dotP1 + (Sy * (dotP2 - dotP1));
double bb = dotP3 + (Sy * (dotP4 - dotP3));
intensity = (a+b+aa+bb)/4;
//Should generate number between 0 and 1, but doesn't :/
return intensity;
}
perlinnoise::~perlinnoise()
{
}
I've been reading lots of articles, and they are all very unclear about the math used. I ended up generating a grid with 20x20 pixels in each cell, with each intersection in the grid having a randomly generated gradient vector. I then calculate the displacement vectors and do the dot product at the four corners with the displacement and gradient vectors. This first part is a bit messy as I am not very experienced, but the last part is a bit more straightforward. I use a smoothing function on the x and y axes and use that number to generate a, b, aa and bb, and I then take the average of that. This is what I thought I understood from the articles I read, but apparently it's wrong :/ Any help please?
Thanks in advance!
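For reference, the step that usually goes here is: take the fractional position inside the cell (scaled to the range 0..1, not measured in pixels), apply the fade curve 3t^2 - 2t^3 to it, interpolate the two top-corner dot products and the two bottom-corner dot products along x, and then interpolate those two results along y. A minimal sketch of that step (my own illustration, not the poster's code):
// tx, ty: fractional position inside the cell, already divided by the cell size (20),
// so both are in [0, 1]. d1..d4: dot products at top-left, top-right, bottom-left,
// bottom-right. The result is roughly in [-1, 1]; remap it before scaling to 255.
double fade(double t) { return t * t * (3.0 - 2.0 * t); } // 3t^2 - 2t^3
double lerp(double a, double b, double t) { return a + t * (b - a); }

double perlinInterpolate(double d1, double d2, double d3, double d4, double tx, double ty)
{
    double sx = fade(tx), sy = fade(ty);
    double top    = lerp(d1, d2, sx);      // along x on the top edge
    double bottom = lerp(d3, d4, sx);      // along x on the bottom edge
    double value  = lerp(top, bottom, sy); // then along y
    return 0.5 * (value + 1.0);            // remap roughly to [0, 1]
}
Note that in the posted code Sx and Sy are computed from pixel offsets (0..20) rather than fractions in [0, 1], and the four values are averaged rather than interpolated twice; both differ from the usual formulation.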
When attempting to push back into a vector of UINT, the program crashes with Critical error detected c0000374. Below is the initial code:
void Terrain::CreateIndexList(UINT Width, UINT Height){
UINT sz_iList = (Width - 1)*(Height - 1) * 6;
UINT *iList = new UINT[sz_iList];
for (int i = 0; i < Width; i++){
for (int j = 0; j < Height; j++){
iList[(i + j * (Width - 1)) * 6] = ((UINT)(2 * i));
iList[(i + j * (Width - 1)) * 6 + 1] = (UINT)(2 * i + 1);
iList[(i + j * (Width - 1)) * 6 + 2] = (UINT)(2 * i + 2);
iList[(i + j * (Width - 1)) * 6 + 3] = (UINT)(2 * i + 2);
iList[(i + j * (Width - 1)) * 6 + 4] = (UINT)(2 * i + 1);
iList[(i + j * (Width - 1)) * 6 + 5] = (UINT)(2 * i + 3);
}
}
for (int i = 0; i < sz_iList; i++){
Geometry.IndexVertexData.push_back(iList[i]);
}
delete[] iList;
}
The goal is to take the generated indices from the iList array and fill the Geometry.IndexVertexData vector. While debugging this, I've tried several other implementations:
//After creating the iList array:
Geometry.IndexVertexData.resize(sz_iList); //Fails with "Vector subscript out of range?"
UINT in = 0;
for (int i = 0; i < Width; i++){
for (int j = 0; j < Height; j++){
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6] = iList[in];
in++;
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6 + 1] = iList[in];
in++;
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6 + 2] = iList[in];
in++;
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6 + 3] = iList[in];
in++;
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6 + 4] = iList[in];
in++;
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6 + 5] = iList[in];
in++;
}
}
And a final, direct-to-vector implementation:
Geometry.IndexVertexData.reserve(sz_iList);
for (int index = 0; index < sz_iList; index+=6) {
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6] = ((UINT)(2 * i));
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6 + 1] = (UINT)(2 * i + 1);
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6 + 2] = (UINT)(2 * i + 2);
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6 + 3] = (UINT)(2 * i + 2);
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6 + 4] = (UINT)(2 * i + 1);
Geometry.IndexVertexData[(i + j*(Width - 1)) * 6 + 5] = (UINT)(2 * i + 3);
}
sz_iList has a final value of 2166, resulting from a grid of 20x20 (400 total points), and is used to initialize the sizes. In all cases, the vector would not fill completely, crashing with Critical error detected c0000374. Am I doing something wrong?
Your sz_iList doesn't appear to be big enough. Let's use a simple example of Width = Height = 2; then sz_iList = (2 - 1) * (2 - 1) * 6 = 6, right? But in your nested loops, the last iteration occurs when i = j = 1 (i is one less than Width and j is one less than Height), where, in the last line of your loop, you try to access element (i + j * (Width - 1)) * 6 + 5 = (1 + 1 * (2 - 1)) * 6 + 5 = (1 + 1 * 1) * 6 + 5 = 2 * 6 + 5 = 17, which is bigger than the size of your array. This results in undefined behavior.
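For comparison, here is a minimal sketch of building a grid index list where the allocation size and the subscripts agree; it assumes a plain row-major grid of Width x Height vertices with two triangles per cell, which may not match the 2 * i style vertex ordering in the posted code:
#include <vector>
typedef unsigned int UINT; // as in the question (the Windows UINT typedef)

void BuildGridIndices(UINT Width, UINT Height, std::vector<UINT> &indices)
{
    indices.clear();
    indices.reserve((Width - 1) * (Height - 1) * 6); // 6 indices per cell
    for (UINT j = 0; j < Height - 1; j++){           // cells, not vertices, hence the - 1
        for (UINT i = 0; i < Width - 1; i++){
            UINT topLeft     = j * Width + i;
            UINT topRight    = topLeft + 1;
            UINT bottomLeft  = (j + 1) * Width + i;
            UINT bottomRight = bottomLeft + 1;
            indices.push_back(topLeft);  indices.push_back(bottomLeft);  indices.push_back(topRight);
            indices.push_back(topRight); indices.push_back(bottomLeft);  indices.push_back(bottomRight);
        }
    }
}
Also note that reserve only sets the capacity and leaves the size at zero, so writing through operator[] after reserve (as in the last snippet) is still out-of-bounds access; use resize or push_back instead.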
I'm looking for some help figuring out how to remove some low-quality pixel noise from a video that I'm obtaining from an Xbox Kinect via openFrameworks. I'm running logic against the "moving" parts of an image to determine which color is moving the most, and using those regions to also detect the depth at which those pixels are moving. I'm attaching a photo to try to better explain my issue.
http://imago.bryanmoyles.com/xxw80
Of course I know code will be asked for, so I'll post what I have so far, but what I'm looking for, more than anything else, is a good algorithm for smoothing out pixelated regions in a photo using C++.
for(int y = 0; y < kinect.height; y += grid_size) {
for(int x = 0; x < kinect.width * 3; x += 3 * grid_size) {
unsigned int total_r = 0, total_b = 0, total_g = 0;
for(int r = 0; r < grid_size; r++) {
for(int c = 0; c < grid_size; c++) {
total_r += color_pixels[(y * kinect.width * 3 + r * kinect.width * 3) + (c * 3 + x + 0)];
total_b += color_pixels[(y * kinect.width * 3 + r * kinect.width * 3) + (c * 3 + x + 1)];
total_g += color_pixels[(y * kinect.width * 3 + r * kinect.width * 3) + (c * 3 + x + 2)];
}
}
unsigned char average_r = total_r / (grid_size * grid_size),
average_b = total_b / (grid_size * grid_size),
average_g = total_g / (grid_size * grid_size);
for(int r = 0; r < grid_size; r++) {
for(int c = 0; c < grid_size; c++) {
color_pixels[(y * kinect.width * 3 + r * kinect.width * 3) + (c * 3 + x + 0)] = average_r;
color_pixels[(y * kinect.width * 3 + r * kinect.width * 3) + (c * 3 + x + 1)] = average_b;
color_pixels[(y * kinect.width * 3 + r * kinect.width * 3) + (c * 3 + x + 2)] = average_g;
}
}
}
}
for(int y = 0; y < kinect.height; y++) {
for (int x = 0; x < kinect.width * 3; x += 3) {
int total_difference = abs(color_pixels[y * kinect.width * 3 + x + 0] - rgb[0])
+ abs(color_pixels[y * kinect.width * 3 + x + 1] - rgb[1])
+ abs(color_pixels[y * kinect.width * 3 + x + 2] - rgb[2]);
unsigned char defined_color;
if(total_difference < 40) {
defined_color = (unsigned char) 255;
} else {
defined_color = (unsigned char) 0;
}
color_pixels[y * kinect.width * 3 + x + 0] = defined_color;
color_pixels[y * kinect.width * 3 + x + 1] = defined_color;
color_pixels[y * kinect.width * 3 + x + 2] = defined_color;
}
}
Again, I'd like to reiterate that my code is not the problem; I'm simply posting it here so that you understand I'm not just asking blindly. What I really need is some direction on how to smooth out pixelated images, so that my averages don't get thrown off frame by frame by poor quality.
You can process your image from the camera with some methods from the ofxOpenCv addon. There you will have methods like blur, undistort, erode, etc. It's easy to set up, because it's already an addon. Have a look at the openCvExample, which should come packaged with your openFrameworks download. For more information on the mentioned methods, take a look here. If I understand your problem correctly, a little blur on the image could already fix your problem.
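If you want to smooth the frame yourself before the averaging and threshold passes, even a simple 3x3 box blur over the raw buffer helps; a minimal sketch, assuming the same interleaved kinect.width * 3 RGB layout as in the question:
#include <vector>

// 3x3 box blur over an interleaved 8-bit RGB buffer (width * height * 3 bytes).
// Border pixels are left untouched to keep the sketch short.
void boxBlurRGB(unsigned char *pixels, int width, int height)
{
    std::vector<unsigned char> src(pixels, pixels + (size_t)width * height * 3); // copy of the frame
    for (int y = 1; y < height - 1; y++) {
        for (int x = 1; x < width - 1; x++) {
            for (int c = 0; c < 3; c++) {
                int sum = 0;
                for (int dy = -1; dy <= 1; dy++)
                    for (int dx = -1; dx <= 1; dx++)
                        sum += src[((y + dy) * width + (x + dx)) * 3 + c];
                pixels[(y * width + x) * 3 + c] = (unsigned char)(sum / 9); // average of the 3x3 neighborhood
            }
        }
    }
}
Calling something like boxBlurRGB(color_pixels, kinect.width, kinect.height) once per frame before the grid-averaging loop should already take a lot of the speckle out; the blur methods in the addon mentioned above do the same kind of thing with less code.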