atomicAdd half-precision floating-point (FP16) on CUDA Compute Capability 5.2 - c++

I am trying to atomically add a float value to a __half in CUDA 5.2. This architecture does support the __half data type and its conversion functions, but it does not include any arithmetic and atomic operations for halves, like atomicAdd().
I created the following atomicAdd() function wrapper with a special case for when half-precision arithmetic is unsupported. full example code
__device__ void atomic_add(__half* a, float b) {
#if __CUDA_ARCH__ >= 700 // CUDA 7.0 supports fp16 atomic add
atomicAdd(a, __float2half(b));
#else
atomicAdd(&__half2float(a), b); // Error: expression must be an lvalue
#endif
}
atomicAdd(&__half2float(a), b); does not work, because __half2float(a) is not an lvalue. I could make a an lvalue by creating a copy:
float a_tmp = __half2float(a);
atomicAdd(&a_tmp , b);
a = __float2half(a_tmp);
But now the atomic function doesn't serve any purpose because I'm working on a copy of the value I actually want to modify atomically.
Is there another way that I haven't thought of in which I could perform this operation?

As it happens, compute capability 5.2 devices basically don't support 16-bit atomics of any type. There is some evidence of this is in the programming guide, and furthermore if you try to use 16-bit (unsigned short) atomicCAS on an architecture less than cc7.0, you will get a compile error - its not supported, although that's not obvious from the programming guide. (Yes, I have already filed an internal bug 3845962 at NVIDIA to have the documentation improved in this respect.)
The programming guide does illustrate the general formula to do atomicCAS based custom atomics, and we will use that recipe. However the other "ingredient" is that we are going to have to realize this with a 32-bit atomic. Generally speaking, it is possible to use a larger atomic on a smaller data type - you just don't modify anything outside of the data region of interest.
But one of the requirements that comes out of this approach is that you must make sure that the atomic access will be legal. This means that you must allocate in units of 32-bits (for the 32-bit atomic) even though the type of interest is __half i.e. 16-bits.
With that proviso the general methodology is the same as is already covered in the programming guide and other SO questions.
The following is one possible approach:
$ cat t2173.cu
#include <cuda_fp16.h>
#include <iostream>
#include <cstdio>
// this requires a full 32-bit allocation at the atomic address
__device__ float my_float_half_atomicAdd(__half *a, float b){
bool uplo = ((unsigned long long)a)&2; // check if the atomic is for the upper or lower 16-bit quantity in the aligned 32-bit item
unsigned *addr = reinterpret_cast<unsigned *>(((unsigned long long)a)&0xFFFFFFFFFFFFFFFCULL); // get the 32-bit aligned address
unsigned old = *addr;
unsigned val;
do {
val = old;
float newval = __half2float(__ushort_as_half(uplo?((unsigned short)(val>>16)):((unsigned short)(val))))+b;
unsigned short newval_s = __half_as_ushort(__float2half(newval));
unsigned newval_u = val&(uplo?(0x0FFFFU):(0xFFFF0000U));
newval_u |= uplo?(((unsigned)newval_s)<<16):(newval_s);
old = atomicCAS(addr, old, newval_u);}
while (old != val);
return __half2float(__ushort_as_half(uplo?(old>>16):(old)));
}
__device__ float f_h_atomic_add(__half* a, float b) {
#if __CUDA_ARCH__ >= 700 // CUDA 7.0 supports fp16 atomic add
return __half2float(atomicAdd(a, __float2half(b)));
#else
return my_float_half_atomicAdd(a, b);
#endif
}
__global__ void k(__half *a, float b){
printf("%f\n", f_h_atomic_add(a, b));
}
int main(){
__half *a;
cudaMallocManaged(&a, 4); // must allocate 32-bit quantities
memset(a, 0, 4);
k<<<2,64>>>(a, 1.0f);
cudaDeviceSynchronize();
float val = __half2float(a[0]);
std::cout << val << std::endl;
}
$ nvcc -arch=sm_35 -o t2173 t2173.cu
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
$ CUDA_VISIBLE_DEVICES="1" cuda-memcheck ./t2173
========= CUDA-MEMCHECK
0.000000
1.000000
2.000000
3.000000
8.000000
9.000000
10.000000
11.000000
16.000000
17.000000
18.000000
19.000000
24.000000
25.000000
26.000000
27.000000
4.000000
5.000000
6.000000
7.000000
12.000000
13.000000
14.000000
15.000000
20.000000
21.000000
22.000000
23.000000
28.000000
29.000000
30.000000
31.000000
32.000000
33.000000
34.000000
35.000000
40.000000
41.000000
42.000000
43.000000
48.000000
49.000000
50.000000
51.000000
57.000000
58.000000
59.000000
60.000000
36.000000
37.000000
38.000000
39.000000
44.000000
45.000000
46.000000
47.000000
52.000000
53.000000
54.000000
56.000000
61.000000
62.000000
63.000000
64.000000
89.000000
90.000000
91.000000
55.000000
65.000000
66.000000
67.000000
68.000000
73.000000
74.000000
75.000000
76.000000
81.000000
82.000000
83.000000
84.000000
92.000000
93.000000
94.000000
95.000000
69.000000
70.000000
71.000000
72.000000
77.000000
78.000000
79.000000
80.000000
85.000000
86.000000
87.000000
88.000000
123.000000
124.000000
125.000000
126.000000
99.000000
100.000000
101.000000
102.000000
107.000000
108.000000
109.000000
110.000000
115.000000
116.000000
117.000000
118.000000
127.000000
96.000000
97.000000
98.000000
103.000000
104.000000
105.000000
106.000000
111.000000
112.000000
113.000000
114.000000
119.000000
120.000000
121.000000
122.000000
128
========= ERROR SUMMARY: 0 errors
$
(With CUDA 11.4 at least, this methodology can work on devices all the way back to cc3.5, which is what is demonstrated above.)
FP16 has fairly limited range compared to FP32, so that is something to keep in mind when adding float quantities to __half values.

Related

MKL Rectangular Matrix inplace transpose: not using multiple cores?

I want an in place memory transpose of very large matrix. I am using mkl_simatcopy. But I am observing some performance issue while transposing inplace. I am currently using Intel(R) Xeon(R) CPU E7-8867 v4 # 2.40GHz having 72 physical cores and RedHat OS.
My observation is that, when I perform transpose operation, only single core is used and it is not using all cores. I have tried all environment variables like MK_NUM_THREADS, MKL_DYNAMIC="FALSE" etc. My compilation script is as follows :
gcc -std=c99 -m64 -I $MKLROOT/include transpose.c
${MKLROOT}/lib/intel64/libmkl_scalapack_ilp64.a -Wl,--start-group
${MKLROOT}/lib/intel64/libmkl_cdft_core.a
${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a
${MKLROOT}/lib/intel64/libmkl_tbb_thread.a
${MKLROOT}/lib/intel64/libmkl_core.a
${MKLROOT}/lib/intel64/libmkl_blacs_openmpi_ilp64.a -Wl,--end-group -lstdc++ -lpthread -lm -ldl -o transpose.out
Timings obtained are as follows
Sno. Rows Cols Time(in sec)
1 16384 8192 16
2 16384 32768 68
3 32768 65536 233
Data Type is float. Please let me know , if there is an efficient way to transpose inplace or how can we port it to multiple cores.
int main(int argc, char *argv[])
{
unsigned long noOfScan = atol(argv[1]);
unsigned long noOfPix = atol(argv[2]);
size_t nEle = noOfScan * noOfPix;
float *data = (float *)calloc(nEle, sizeof(float));
initalizeData(data, noOfScan, noOfPix);
//printdata(data,noOfScan,noOfPix);
//writeDataFile((char *)data,"BeforeTranspose.img",nEle*sizeof(float));
printf("After transpose \n\n");
long nt = mkl_get_max_threads();
printf("No Of threads are = %d \n", nt);
//mkl_set_num_threads_local(nt);
//mkl_set_num_threads(nt);
double time1 = cpuSecond();
mkl_simatcopy('R', 'T', noOfScan, noOfPix, 1, data, noOfPix, noOfScan);
printf("Time elapsed is %lf \n", cpuSecond() - time1);
memset(data, 0, nEle * sizeof(float));
free(data);
}
The answer from Intel's forum: mkl_simatcopy doesn't support multithreading.
Yes, this routine is not threaded. In the case, if you really need to have this routine threaded, please submit the feature requests to the intel online service center - https://supporttickets.intel.com/

why tanh has different results in OpenCL and C++ function

here is my OpenCL code.
#include <iostream>
#include <cmath>
#include <CL/cl.hpp>
int main(){
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
cl::Platform default_platform=all_platforms[0];
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
cl::Device default_device=all_devices[0];
std::cout<< "Using device: "<<default_device.getInfo<CL_DEVICE_NAME>()<<"\n";
cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(default_platform)(), 0};
cl::Context context = cl::Context(CL_DEVICE_TYPE_ALL, properties);
cl::Program::Sources sources;
std::string kernel_code=
" void __kernel simple_tanh(__global const float *A, __global float *B){ "
" B[get_global_id(0)]=tanh(A[get_global_id(0)]); "
" } ";
sources.push_back({kernel_code.c_str(),kernel_code.length()});
cl::Program program(context,sources);
if(program.build({default_device})!=CL_SUCCESS){
std::cout<<" Error building: "<<program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)<<"\n";
exit(1);
}
cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(float));
cl::Buffer buffer_B(context,CL_MEM_READ_WRITE,sizeof(float));
float A[1]; A[0] = 0.0595172755420207977294921875000000000000f;
cl::CommandQueue queue(context,default_device);
queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,sizeof(float),A);
queue.finish();
cl::Kernel kernel=cl::Kernel(program,"simple_tanh");
kernel.setArg(0,buffer_A);
kernel.setArg(1,buffer_B);
queue.enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(1),cl::NullRange);
queue.finish();
float B[1];
queue.enqueueReadBuffer(buffer_B,CL_TRUE,0,sizeof(float),B);
printf("result: %.40f %.40f\n", tanh(A[0]), B[0]);
return 0;
}
after I compile with this cmd: g++ -std=c++0x hello.cc -lOpenCL -o hello, and run it. I got different results of tanh function.
Using device: Tahiti
result: 0.0594470988394579374913817559900053311139 0.0594470985233783721923828125000000000000
the first is the cpu result, and the second is the OpenCL function. which one should I trust?
When a kernel is unable to be vectorized by compiler(opencl), generated instructions could be scalar types. Then, x87 FPU computes 80 bit. SSE has precision more comparable to GPU, you need float4 or float8 in your kernel such that compiler can produce SSE/AVX which has closer precision to GPU.
Generally Intel's opencl compiler vectorizes better(for some old CPUs at least). Which implementation are you using? There can be differences even between GPUs but they all obey the rule of not crossing ULP limit. If you need more precision with GPU(and SSE/AVX), why not write you own series expansion function then? But it would make learning very slow but faster than a single FPU at least.
What is your CPU? What opencl platform are you using? Did you check generated codes for kernel with some profiler software or some kernel analyzer?
Above all, you shouldn't do this:
cl::NDRange(1)
unless it's learning purpose. This will have %99 kernel launch overhead, %1 data copy overhead and close to zero compute latency. Maybe thats why it's using 80-bit FPU instead of SSE(on CPU). Try computing for multiple-of-8 ndrange values or use float8 types in kernel to let compiler use vectorized instructions.
When global ndrange value is millions, it will have significant effect on learning time, not leraning iterations needed. If CPU can finish learning in 1 day with 1M iterations, maybe GPU can finish it in 1-hour even if it needs 10M iterations. Transcandental functions have high compute to data ratio so speed up ratio versus CPU is higher if you use more of them.
If you derive your own series expansion function to achieve more precision, it would still be much faster than single CPU core in this embarrassingly parallel kernel code.
If the neural network has only a few neurons, then maybe you can do N networks training at the same time and pick the best learner(if learning has any randomization)? So it picks even better results than CPU?

Process unaligned part of a double array, vectorize the rest

I am generating sse/avx instructions and currently i have to use unaligned load and stores. I operate on a float/double array and i will never know whether it will be aligned or not. So before vectorizing it, i would like to have a pre and possibly a post loop, which takes care about the unaligned part. The main vectorized loop operates then on the aligned part.
But how do i determine when an array is aligned? Can i check the pointer value? When should the pre-loop stop and the post-loop start?
Here is my simple code example:
void func(double * in, double * out, unsigned int size){
for( as long as in unaligned part ){
out[i] = do_something_with_array(in[i])
}
for( as long as aligned ){
awesome avx code that loads operates and stores 4 doubles
}
for( remaining part of array ){
out[i] = do_something_with_array(in[i])
}
}
Edit:
I have been thinking about it. Theoretically the pointer to the i'th element should be dividable (something like &a[i]%16==0) by 2,4,16,32 (depending whether it is double and whether it is sse or avx). So the first loop should cover up the elements, which are not dividable.
Practically i will try the compiler pragmas and flags out, to see what does the compiler produce. If no one gives a good answer i will then post my solution (if any) on weekend.
Here is some example C code that does what you want
#include <stdio.h>
#include <x86intrin.h>
#include <inttypes.h>
#define ALIGN 32
#define SIMD_WIDTH (ALIGN/sizeof(double))
int main(void) {
int n = 17;
int c = 1;
double* p = _mm_malloc((n+c) * sizeof *p, ALIGN);
double* p1 = p+c;
for(int i=0; i<n; i++) p1[i] = 1.0*i;
double* p2 = (double*)((uintptr_t)(p1+SIMD_WIDTH-1)&-ALIGN);
double* p3 = (double*)((uintptr_t)(p1+n)&-ALIGN);
if(p2>p3) p2 = p3;
printf("%p %p %p %p\n", p1, p2, p3, p1+n);
double *t;
for(t=p1; t<p2; t+=1) {
printf("a %p %f\n", t, *t);
}
puts("");
for(;t<p3; t+=SIMD_WIDTH) {
printf("b %p ", t);
for(int i=0; i<SIMD_WIDTH; i++) printf("%f ", *(t+i));
puts("");
}
puts("");
for(;t<p1+n; t+=1) {
printf("c %p %f\n", t, *t);
}
}
This generates a 32-byte aligned buffer but then offsets it by one double in size so it's no longer 32-byte aligned. It loops over scalar values until reaching 32-btye alignment, loops over the 32-byte aligned values, and then lastly finishes with another scalar loop for any remaining values which are not a multiple of the SIMD width.
I would argue that this kind of optimization only really makes a lot of sense for Intel x86 processors before Nehalem. Since Nehalem the latency and throughput of unaligned loads and stores are the same as for the aligned loads and stores. Additionally, since Nehalem the costs of the cache line splits is small.
There is one subtle point with SSE since Nehalem in that unaligned loads and stores cannot fold with other operations. Therefore, aligned loads and stores are not obsolete with SSE since Nehalem. So in principle this optimization could make a difference even with Nehalem but in practice I think there are few cases where it will.
However, with AVX unaligned loads and stores can fold so the aligned loads and store instructions are obsolete.
I looked into this with GCC, MSVC, and Clang. GCC if it cannot assume a pointer is aligned to e.g. 16 bytes with SSE then it will generate code similar to the code above to reach 16 byte alignment to avoid the cache line splits when vectorizing.
Clang and MSVC don't do this so they would suffer from the cache-line splits. However, the cost of the additional code to do this makes up for cost of the cache-line splits which probably explains why Clang and MSVC don't worry about it.
The only exception is before Nahalem. In this case GCC is much faster than Clang and MSVC when the pointer is not aligned. If the pointer is aligned and Clang knows it then it will use aligned loads and stores and be fast like GCC. MSVC vectorization still uses unaligned stores and loads and is therefore slow pre-Nahalem even when a pointer is 16-byte aligned.
Here is a version which I think is a bit clearer using pointer differences
#include <stdio.h>
#include <x86intrin.h>
#include <inttypes.h>
#define ALIGN 32
#define SIMD_WIDTH (ALIGN/sizeof(double))
int main(void) {
int n = 17, c =1;
double* p = _mm_malloc((n+c) * sizeof *p, ALIGN);
double* p1 = p+c;
for(int i=0; i<n; i++) p1[i] = 1.0*i;
double* p2 = (double*)((uintptr_t)(p1+SIMD_WIDTH-1)&-ALIGN);
double* p3 = (double*)((uintptr_t)(p1+n)&-ALIGN);
int n1 = p2-p1, n2 = p3-p2;
if(n1>n2) n1=n2;
printf("%d %d %d\n", n1, n2, n);
int i;
for(i=0; i<n1; i++) {
printf("a %p %f\n", &p1[i], p1[i]);
}
puts("");
for(;i<n2; i+=SIMD_WIDTH) {
printf("b %p ", &p1[i]);
for(int j=0; j<SIMD_WIDTH; j++) printf("%f ", p1[i+j]);
puts("");
}
puts("");
for(;i<n; i++) {
printf("c %p %f\n", &p1[i], p1[i]);
}
}

CUDA: Hitting the maximum possible value of blockIdx.x for compute capability 3.0

First I should say I'm quite new to programming in C++ (let alone CUDA), though it is what I first learned with about 184 years ago. I'd say I'm a bit out of touch with memory allocation, and datatype sizes, though I'm learning. Anyway here goes:
I have a GPU with compute capability 3.0 (It's a Geforce 660 GTX w/ 2GB of DRAM).
Going by ./deviceQuery found in the CUDA samples (and by other charts I've found online), my maximum grid size is listed:
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
At 2,147,483,647 (2^31-1) that x dimension is huge and kind of nice… YET, when I run my code, pushing beyond 65535 in the x dimension, things get... weird.
I used an example from an Udacity course, and modified it to test the extremes. I've kept the kernel code fairly simple to prove the point:
__global__ void referr(long int *d_out, long int *d_in){
long int idx = blockIdx.x;
d_out[idx] = idx;
}
Please note below the ARRAY_SIZE being the size of the grid, but also being the size of the array of integers on which to do operations. I am leaving the size of the blocks at 1x1x1. JUST for the sake of understanding the limitations, I KNOW that having this many operations with blocks of only 1 thread makes no sense, but I want to understand what's going on with the grid size limitations.
int main(int argc, char ** argv) {
const long int ARRAY_SIZE = 522744;
const long int ARRAY_BYTES = ARRAY_SIZE * sizeof(long int);
// generate the input array on the host
long int h_in[ARRAY_SIZE];
for (long int i = 0; i < ARRAY_SIZE; i++) {
h_in[i] = i;
}
long int h_out[ARRAY_SIZE];
// declare GPU memory pointers
long int *d_in;
long int *d_out;
// allocate GPU memory
cudaMalloc((void**) &d_in, ARRAY_BYTES);
cudaMalloc((void**) &d_out, ARRAY_BYTES);
// transfer the array to the GPU
cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
// launch the kernel with ARRAY_SIZE blocks in the x dimension, with 1 thread each.
referr<<<ARRAY_SIZE, 1>>>(d_out, d_in);
// copy back the result array to the CPU
cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
// print out the resulting array
for (long int i =0; i < ARRAY_SIZE; i++) {
printf("%li", h_out[i]);
printf(((i % 4) != 3) ? "\t" : "\n");
}
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
This works as expected with an ARRAY_SIZE at MOST of 65535. The last few lines of the output below
65516 65517 65518 65519
65520 65521 65522 65523
65524 65525 65526 65527
65528 65529 65530 65531
65532 65533 65534
If I push the ARRAY_SIZE beyond this the output gets really unpredictable and eventually if the number gets too high I get a Segmentation fault (core dumped) message… whatever that even means. Ie. with an ARRAY_SIZE of 65536:
65520 65521 65522 65523
65524 65525 65526 65527
65528 65529 65530 65531
65532 65533 65534 131071
Why is it now stating that the blockIdx.x for this last one is 131071?? That is 65535+65535+1. Weird.
Even weirder, when I set the ARRAY_SIZE to 65537 (65535+2) I get some seriously strange results for the last lines of the output.
65520 65521 65522 65523
65524 65525 65526 65527
65528 65529 65530 65531
65532 65533 65534 131071
131072 131073 131074 131075
131076 131077 131078 131079
131080 131081 131082 131083
131084 131085 131086 131087
131088 131089 131090 131091
131092 131093 131094 131095
131096 131097 131098 131099
131100 131101 131102 131103
131104 131105 131106 131107
131108 131109 131110 131111
131112 131113 131114 131115
131116 131117 131118 131119
131120 131121 131122 131123
131124 131125 131126 131127
131128 131129 131130 131131
131132 131133 131134 131135
131136 131137 131138 131139
131140 131141 131142 131143
131144 131145 131146 131147
131148 131149 131150 131151
131152 131153 131154 131155
131156 131157 131158 131159
131160 131161 131162 131163
131164 131165 131166 131167
131168 131169 131170 131171
131172 131173 131174 131175
131176 131177 131178 131179
131180 131181 131182 131183
131184 131185 131186 131187
131188 131189 131190 131191
131192 131193 131194 131195
131196 131197 131198 131199
131200
Isn't 65535 the limit for older GPUs? Why is my GPU "messing up" when I push past the 65535 barrier for the x grid dimension? Or is this by design? What in the world is going on?
Wow, sorry for the long question.
Any help to understand this would be greatly appreciated! Thanks!
You should be using proper CUDA error checking . And you should be compiling for a compute 3.0 architecture by specifying -arch=sm_30 when you compile with nvcc.

How to speed up floating-point to integer number conversion? [duplicate]

This question already has answers here:
What is the fastest way to convert float to int on x86
(10 answers)
Closed 8 years ago.
We're doing a great deal of floating-point to integer number conversions in our project. Basically, something like this
for(int i = 0; i < HUGE_NUMBER; i++)
int_array[i] = float_array[i];
The default C function which performs the conversion turns out to be quite time consuming.
Is there any work around (maybe a hand tuned function) which can speed up the process a little bit? We don't care much about a precision.
Most of the other answers here just try to eliminate loop overhead.
Only deft_code's answer gets to the heart of what is likely the real problem -- that converting floating point to integers is shockingly expensive on an x86 processor. deft_code's solution is correct, though he gives no citation or explanation.
Here is the source of the trick, with some explanation and also versions specific to whether you want to round up, down, or toward zero: Know your FPU
Sorry to provide a link, but really anything written here, short of reproducing that excellent article, is not going to make things clear.
inline int float2int( double d )
{
union Cast
{
double d;
long l;
};
volatile Cast c;
c.d = d + 6755399441055744.0;
return c.l;
}
// this is the same thing but it's
// not always optimizer safe
inline int float2int( double d )
{
d += 6755399441055744.0;
return reinterpret_cast<int&>(d);
}
for(int i = 0; i < HUGE_NUMBER; i++)
int_array[i] = float2int(float_array[i]);
The double parameter is not a mistake! There is way to do this trick with floats directly but it gets ugly trying to cover all the corner cases. In its current form this function will round the float the nearest whole number if you want truncation instead use 6755399441055743.5 (0.5 less).
I ran some tests on different ways of doing float-to-int conversion. The short answer is to assume your customer has SSE2-capable CPUs and set the /arch:SSE2 compiler flag. This will allow the compiler to use the SSE scalar instructions which are twice as fast as even the magic-number technique.
Otherwise, if you have long strings of floats to grind, use the SSE2 packed ops.
There's an FISTTP instruction in the SSE3 instruction set which does what you want, but as to whether or not it could be utilized and produce faster results than libc, I have no idea.
Is the time large enough that it outweighs the cost of starting a couple of threads?
Assuming you have a multi-core processor or multiple processors on your box that you could take advantage of, this would be a trivial task to parallelize across multiple threads.
The key is to avoid the _ftol() function, which is needlessly slow. Your best bet for long lists of data like this is to use the SSE2 instruction cvtps2dq to convert two packed floats to two packed int64s. Do this twice (getting four int64s across two SSE registers) and you can shuffle them together to get four int32s (losing the top 32 bits of each conversion result). You don't need assembly to do this; MSVC exposes compiler intrinsics to the relevant instructions -- _mm_cvtpd_epi32() if my memory serves me correctly.
If you do this it is very important that your float and int arrays be 16-byte aligned so that the SSE2 load/store intrinsics can work at maximum efficiency. Also, I recommend you software pipeline a little and process sixteen floats at once in each loop, eg (assuming that the "functions" here are actually calls to compiler intrinsics):
for(int i = 0; i < HUGE_NUMBER; i+=16)
{
//int_array[i] = float_array[i];
__m128 a = sse_load4(float_array+i+0);
__m128 b = sse_load4(float_array+i+4);
__m128 c = sse_load4(float_array+i+8);
__m128 d = sse_load4(float_array+i+12);
a = sse_convert4(a);
b = sse_convert4(b);
c = sse_convert4(c);
d = sse_convert4(d);
sse_write4(int_array+i+0, a);
sse_write4(int_array+i+4, b);
sse_write4(int_array+i+8, c);
sse_write4(int_array+i+12, d);
}
The reason for this is that the SSE instructions have a long latency, so if you follow a load into xmm0 immediately with a dependent operation on xmm0 then you will have a stall. Having multiple registers "in flight" at once hides the latency a little. (Theoretically a magic all-knowing compiler could alias its way around this problem but in practice it doesn't.)
Failing this SSE juju you can supply the /QIfist option to MSVC which will cause it to issue the single opcode fist instead of a call to _ftol; this means it will simply use whichever rounding mode happens to be set in the CPU without making sure it is ANSI C's specific truncate op. The Microsoft docs say /QIfist is deprecated because their floating point code is fast now, but a disassembler will show you that this is unjustifiedly optimistic. Even /fp:fast simply results to a call to _ftol_sse2, which though faster than the egregious _ftol is still a function call followed by a latent SSE op, and thus unnecessarily slow.
I'm assuming you're on x86 arch, by the way -- if you're on PPC there are equivalent VMX operations, or you can use the magic-number-multiply trick mentioned above followed by a vsel (to mask out the non-mantissa bits) and an aligned store.
You might be able to load all of the integers into the SSE module of your processor using some magic assembly code, then do the equivalent code to set the values to ints, then read them as floats. I'm not sure this would be any faster though. I'm not a SSE guru, so I don't know how to do this. Maybe someone else can chime in.
In Visual C++ 2008, the compiler generates SSE2 calls by itself, if you do a release build with maxed out optimization options, and look at a disassembly (though some conditions have to be met, play around with your code).
See this Intel article for speeding up integer conversions:
http://software.intel.com/en-us/articles/latency-of-floating-point-to-integer-conversions/
According to Microsoft, the /QIfist compiler option is deprecated in VS 2005 because integer conversion has been sped up. They neglect to say how it has been sped up, but looking at the disassembly listing might give a clue.
http://msdn.microsoft.com/en-us/library/z8dh4h17(vs.80).aspx
most c compilers generate calls to _ftol or something for every float to int conversion. putting a reduced floating point conformance switch (like fp:fast) might help - IF you understand AND accept the other effects of this switch. other than that, put the thing in a tight assembly or sse intrinsic loop, IF you are ok AND understand the different rounding behavior.
for large loops like your example you should write a function that sets up floating point control words once and then does the bulk rounding with only fistp instructions and then resets the control word - IF you are ok with an x86 only code path, but at least you will not change the rounding.
read up on the fld and fistp fpu instructions and the fpu control word.
What compiler are you using? In Microsoft's more recent C/C++ compilers, there is an option under C/C++ -> Code Generation -> Floating point model, which has options: fast, precise, strict. I think precise is the default, and works by emulating FP operations to some extent. If you are using a MS compiler, how is this option set? Does it help to set it to "fast"? In any case, what does the disassembly look like?
As thirtyseven said above, the CPU can convert float<->int in essentially one instruction, and it doesn't get any faster than that (short of a SIMD operation).
Also note that modern CPUs use the same FP unit for both single (32 bit) and double (64 bit) FP numbers, so unless you are trying to save memory storing a lot of floats, there's really no reason to favor float over double.
On Intel your best bet is inline SSE2 calls.
I'm surprised by your result. What compiler are you using? Are you compiling with optimization turned all the way up? Have you confirmed using valgrind and Kcachegrind that this is where the bottleneck is? What processor are you using? What does the assembly code look like?
The conversion itself should be compiled to a single instruction. A good optimizing compiler should unroll the loop so that several conversions are done per test-and-branch. If that's not happening, you can unroll the loop by hand:
for(int i = 0; i < HUGE_NUMBER-3; i += 4) {
int_array[i] = float_array[i];
int_array[i+1] = float_array[i+1];
int_array[i+2] = float_array[i+2];
int_array[i+3] = float_array[i+3];
}
for(; i < HUGE_NUMBER; i++)
int_array[i] = float_array[i];
If your compiler is really pathetic, you might need to help it with the common subexpressions, e.g.,
int *ip = int_array+i;
float *fp = float_array+i;
ip[0] = fp[0];
ip[1] = fp[1];
ip[2] = fp[2];
ip[3] = fp[3];
Do report back with more info!
If you do not care very much about the rounding semantics, you can use the lrint() function. This allows for more freedom in rounding and it can be much faster.
Technically, it's a C99 function, but your compiler probably exposes it in C++. A good compiler will also inline it to one instruction (a modern G++ will).
lrint documentation
rounding only
excellent trick, only the use 6755399441055743.5 (0.5 less) to do rounding won't work.
6755399441055744 = 2^52 + 2^51 overflowing decimals off the end of the mantissa leaving the integer that you want in bits 51 - 0 of the fpu register.
In IEEE 754
6755399441055744.0 =
sign exponent mantissa
0 10000110011 1000000000000000000000000000000000000000000000000000
6755399441055743.5
will also however compile to
0100001100111000000000000000000000000000000000000000000000000000
the 0.5 overflows off the end (rounding up) which is why this works in the first place.
to do truncation you would have to add 0.5 to your double then do this
the guard digits should take care of rounding to the correct result done this way.
also watch out for 64 bit gcc linux where long rather annoyingly means a 64 bit integer.
If you have very large arrays (bigger than a few MB--the size of the CPU cache), time your code and see what the throughput is. You're probably saturating the memory bus, not the FP unit. Look up the maximum theoretical bandwidth for your CPU and see how close to it you are.
If you're being limited by the memory bus, extra threads will just make it worse. You need better hardware (e.g. faster memory, different CPU, different motherboard).
In response to Larry Gritz's comment...
You are correct: the FPU is a major bottleneck (and using the xs_CRoundToInt trick allows one to come very close to saturating the memory bus).
Here are some test results for a Core 2 (Q6600) processor. The theoretical main-memory bandwidth for this machine is 3.2 GB/s (L1 and L2 bandwidths are much higher). The code was compiled with Visual Studio 2008. Similar results for 32-bit and 64-bit, and with /O2 or /Ox optimizations.
WRITING ONLY...
1866359 ticks with 33554432 array elements (33554432 touched). Bandwidth: 1.91793 GB/s
154749 ticks with 262144 array elements (33554432 touched). Bandwidth: 23.1313 GB/s
108816 ticks with 8192 array elements (33554432 touched). Bandwidth: 32.8954 GB/s
USING CASTING...
5236122 ticks with 33554432 array elements (33554432 touched). Bandwidth: 0.683625 GB/s
2014309 ticks with 262144 array elements (33554432 touched). Bandwidth: 1.77706 GB/s
1967345 ticks with 8192 array elements (33554432 touched). Bandwidth: 1.81948 GB/s
USING xs_CRoundToInt...
1490583 ticks with 33554432 array elements (33554432 touched). Bandwidth: 2.40144 GB/s
1079530 ticks with 262144 array elements (33554432 touched). Bandwidth: 3.31584 GB/s
1008407 ticks with 8192 array elements (33554432 touched). Bandwidth: 3.5497 GB/s
(Windows) source code:
// floatToIntTime.cpp : Defines the entry point for the console application.
//
#include <windows.h>
#include <iostream>
using namespace std;
double const _xs_doublemagic = double(6755399441055744.0);
inline int xs_CRoundToInt(double val, double dmr=_xs_doublemagic) {
val = val + dmr;
return ((int*)&val)[0];
}
static size_t const N = 256*1024*1024/sizeof(double);
int I[N];
double F[N];
static size_t const L1CACHE = 128*1024/sizeof(double);
static size_t const L2CACHE = 4*1024*1024/sizeof(double);
static size_t const Sz[] = {N, L2CACHE/2, L1CACHE/2};
static size_t const NIter[] = {1, N/(L2CACHE/2), N/(L1CACHE/2)};
int main(int argc, char *argv[])
{
__int64 freq;
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
cout << "WRITING ONLY..." << endl;
for (int t=0; t<3; t++) {
__int64 t0,t1;
QueryPerformanceCounter((LARGE_INTEGER*)&t0);
size_t const niter = NIter[t];
size_t const sz = Sz[t];
for (size_t i=0; i<niter; i++) {
for (size_t n=0; n<sz; n++) {
I[n] = 13;
}
}
QueryPerformanceCounter((LARGE_INTEGER*)&t1);
double bandwidth = 8*niter*sz / (((double)(t1-t0))/freq) / 1024/1024/1024;
cout << " " << (t1-t0) << " ticks with " << sz
<< " array elements (" << niter*sz << " touched). "
<< "Bandwidth: " << bandwidth << " GB/s" << endl;
}
cout << "USING CASTING..." << endl;
for (int t=0; t<3; t++) {
__int64 t0,t1;
QueryPerformanceCounter((LARGE_INTEGER*)&t0);
size_t const niter = NIter[t];
size_t const sz = Sz[t];
for (size_t i=0; i<niter; i++) {
for (size_t n=0; n<sz; n++) {
I[n] = (int)F[n];
}
}
QueryPerformanceCounter((LARGE_INTEGER*)&t1);
double bandwidth = 8*niter*sz / (((double)(t1-t0))/freq) / 1024/1024/1024;
cout << " " << (t1-t0) << " ticks with " << sz
<< " array elements (" << niter*sz << " touched). "
<< "Bandwidth: " << bandwidth << " GB/s" << endl;
}
cout << "USING xs_CRoundToInt..." << endl;
for (int t=0; t<3; t++) {
__int64 t0,t1;
QueryPerformanceCounter((LARGE_INTEGER*)&t0);
size_t const niter = NIter[t];
size_t const sz = Sz[t];
for (size_t i=0; i<niter; i++) {
for (size_t n=0; n<sz; n++) {
I[n] = xs_CRoundToInt(F[n]);
}
}
QueryPerformanceCounter((LARGE_INTEGER*)&t1);
double bandwidth = 8*niter*sz / (((double)(t1-t0))/freq) / 1024/1024/1024;
cout << " " << (t1-t0) << " ticks with " << sz
<< " array elements (" << niter*sz << " touched). "
<< "Bandwidth: " << bandwidth << " GB/s" << endl;
}
return 0;
}