MVAPICH hangs on MPI_Send for message larger than eager threshold

MVAPICH hangs on MPI_Send for message larger than eager threshold - c++

There is a simple program in C++ / MPI (MVAPICH) which sends an array of type float. When I use MPI_Send, MPI_Ssend or MPI_Rsend and the size of the data is larger than the eager threshold (64 KB in my program), the program hangs during the send call. If the array is smaller than the threshold, the program works fine. The source code is below:
#include "mpi.h"
#include <unistd.h>
#include <stdio.h>
// Minimal two-rank reproducer: rank 0 ready-sends 20000 floats (tag 5) to
// rank 1, and rank 1 prints the payload size and receives them.
int main(int argc,char *argv[]) {
// mype receives this process's rank; size receives the communicator size
// (size is not used afterwards).
int mype=0,size=1;
MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD,&mype);
MPI_Comm_size(MPI_COMM_WORLD,&size);
// Buffer of 2048*2048 floats; only the first 20000 are transferred. The
// contents are never initialised and the buffer is never delete[]d.
int num = 2048*2048;
float* h_pos = new float[num];
MPI_Status stat;
if(mype == 0)
{
// Ready-mode send. Nothing in this program synchronises the ranks first, so
// rank 1's MPI_Recv is not guaranteed to be posted when this call starts.
MPI_Rsend(h_pos, 20000, MPI_FLOAT, 1, 5, MPI_COMM_WORLD);
}
if(mype == 1)
{
// Prints the message size: 20000 * sizeof(float) bytes divided by 1024.
printf("%fkb\n", 20000.0f*sizeof(float)/1024);
MPI_Recv(h_pos, 20000, MPI_FLOAT, 0, 5, MPI_COMM_WORLD, &stat);
}
MPI_Finalize();
return 0;
}
I think my settings may be wrong. The parameters are below:
MVAPICH2 All Parameters
MV2_COMM_WORLD_LOCAL_RANK : 0
PMI_ID : 0
MPIRUN_RSH_LAUNCH : 0
MPISPAWN_GLOBAL_NPROCS : 2
MPISPAWN_MPIRUN_HOST : g718a
MPISPAWN_MPIRUN_ID : 10800
MPISPAWN_NNODES : 1
MPISPAWN_WORKING_DIR : /home/g718a/new_workspace/mpi_test
USE_LINEAR_SSH : 1
PMI_PORT : g718a:42714
MV2_3DTORUS_SUPPORT : 0
MV2_NUM_SA_QUERY_RETRIES : 20
MV2_NUM_SLS : 8
MV2_DEFAULT_SERVICE_LEVEL : 0
MV2_PATH_SL_QUERY : 0
MV2_USE_QOS : 0
MV2_ALLGATHER_BRUCK_THRESHOLD : 524288
MV2_ALLGATHER_RD_THRESHOLD : 81920
MV2_ALLGATHER_REVERSE_RANKING : 1
MV2_ALLGATHERV_RD_THRESHOLD : 0
MV2_ALLREDUCE_2LEVEL_MSG : 262144
MV2_ALLREDUCE_SHORT_MSG : 2048
MV2_ALLTOALL_MEDIUM_MSG : 16384
MV2_ALLTOALL_SMALL_MSG : 2048
MV2_ALLTOALL_THROTTLE_FACTOR : 4
MV2_BCAST_TWO_LEVEL_SYSTEM_SIZE : 64
MV2_GATHER_SWITCH_PT : 0
MV2_INTRA_SHMEM_REDUCE_MSG : 2048
MV2_KNOMIAL_2LEVEL_BCAST_MESSAGE_SIZE_THRESHOLD : 2048
MV2_KNOMIAL_2LEVEL_BCAST_SYSTEM_SIZE_THRESHOLD : 64
MV2_KNOMIAL_INTER_LEADER_THRESHOLD : 65536
MV2_KNOMIAL_INTER_NODE_FACTOR : 4
MV2_KNOMIAL_INTRA_NODE_FACTOR : 4
MV2_KNOMIAL_INTRA_NODE_THRESHOLD : 131072
MV2_RED_SCAT_LARGE_MSG : 524288
MV2_RED_SCAT_SHORT_MSG : 64
MV2_REDUCE_2LEVEL_MSG : 16384
MV2_REDUCE_SHORT_MSG : 8192
MV2_SCATTER_MEDIUM_MSG : 0
MV2_SCATTER_SMALL_MSG : 0
MV2_SHMEM_ALLREDUCE_MSG : 32768
MV2_SHMEM_COLL_MAX_MSG_SIZE : 131072
MV2_SHMEM_COLL_NUM_COMM : 8
MV2_SHMEM_COLL_NUM_PROCS : 2
MV2_SHMEM_COLL_SPIN_COUNT : 5
MV2_SHMEM_REDUCE_MSG : 4096
MV2_USE_BCAST_SHORT_MSG : 16384
MV2_USE_DIRECT_GATHER : 1
MV2_USE_DIRECT_GATHER_SYSTEM_SIZE_MEDIUM : 1024
MV2_USE_DIRECT_GATHER_SYSTEM_SIZE_SMALL : 384
MV2_USE_DIRECT_SCATTER : 1
MV2_USE_OSU_COLLECTIVES : 1
MV2_USE_OSU_NB_COLLECTIVES : 1
MV2_USE_KNOMIAL_2LEVEL_BCAST : 1
MV2_USE_KNOMIAL_INTER_LEADER_BCAST : 1
MV2_USE_SCATTER_RD_INTER_LEADER_BCAST : 1
MV2_USE_SCATTER_RING_INTER_LEADER_BCAST : 1
MV2_USE_SHMEM_ALLREDUCE : 1
MV2_USE_SHMEM_BARRIER : 1
MV2_USE_SHMEM_BCAST : 1
MV2_USE_SHMEM_COLL : 1
MV2_USE_SHMEM_REDUCE : 1
MV2_USE_TWO_LEVEL_GATHER : 1
MV2_USE_TWO_LEVEL_SCATTER : 1
MV2_USE_XOR_ALLTOALL : 1
MV2_DEFAULT_SRC_PATH_BITS : 0
MV2_DEFAULT_STATIC_RATE : 0
MV2_DEFAULT_TIME_OUT : 67374100
MV2_DEFAULT_MTU : 0
MV2_DEFAULT_PKEY : 0
MV2_DEFAULT_PORT : -1
MV2_DEFAULT_GID_INDEX : 0
MV2_DEFAULT_PSN : 0
MV2_DEFAULT_MAX_RECV_WQE : 128
MV2_DEFAULT_MAX_SEND_WQE : 64
MV2_DEFAULT_MAX_SG_LIST : 1
MV2_DEFAULT_MIN_RNR_TIMER : 12
MV2_DEFAULT_QP_OUS_RD_ATOM : 257
MV2_DEFAULT_RETRY_COUNT : 67900423
MV2_DEFAULT_RNR_RETRY : 202639111
MV2_DEFAULT_MAX_CQ_SIZE : 40000
MV2_DEFAULT_MAX_RDMA_DST_OPS : 4
MV2_INITIAL_PREPOST_DEPTH : 10
MV2_IWARP_MULTIPLE_CQ_THRESHOLD : 32
MV2_NUM_HCAS : 1
MV2_NUM_NODES_IN_JOB : 1
MV2_NUM_PORTS : 1
MV2_NUM_QP_PER_PORT : 1
MV2_MAX_RDMA_CONNECT_ATTEMPTS : 10
MV2_ON_DEMAND_UD_INFO_EXCHANGE : 1
MV2_PREPOST_DEPTH : 64
MV2_HOMOGENEOUS_CLUSTER : 0
MV2_COALESCE_THRESHOLD : 6
MV2_DREG_CACHE_LIMIT : 0
MV2_IBA_EAGER_THRESHOLD : 0
MV2_MAX_INLINE_SIZE : 0
MV2_MAX_R3_PENDING_DATA : 524288
MV2_MED_MSG_RAIL_SHARING_POLICY : 0
MV2_NDREG_ENTRIES : 0
MV2_NUM_RDMA_BUFFER : 0
MV2_NUM_SPINS_BEFORE_LOCK : 2000
MV2_POLLING_LEVEL : 1
MV2_POLLING_SET_LIMIT : -1
MV2_POLLING_SET_THRESHOLD : 256
MV2_R3_NOCACHE_THRESHOLD : 32768
MV2_R3_THRESHOLD : 4096
MV2_RAIL_SHARING_LARGE_MSG_THRESHOLD : 16384
MV2_RAIL_SHARING_MED_MSG_THRESHOLD : 2048
MV2_RAIL_SHARING_POLICY : 4
MV2_RDMA_EAGER_LIMIT : 32
MV2_RDMA_FAST_PATH_BUF_SIZE : 4096
MV2_RDMA_NUM_EXTRA_POLLS : 1
MV2_RNDV_EXT_SENDQ_SIZE : 5
MV2_RNDV_PROTOCOL : 3
MV2_SMALL_MSG_RAIL_SHARING_POLICY : 0
MV2_SPIN_COUNT : 5000
MV2_SRQ_LIMIT : 30
MV2_SRQ_MAX_SIZE : 4096
MV2_SRQ_SIZE : 256
MV2_STRIPING_THRESHOLD : 8192
MV2_USE_COALESCE : 0
MV2_USE_XRC : 0
MV2_VBUF_MAX : -1
MV2_VBUF_POOL_SIZE : 512
MV2_VBUF_SECONDARY_POOL_SIZE : 256
MV2_VBUF_TOTAL_SIZE : 0
MV2_USE_HWLOC_CPU_BINDING : 1
MV2_ENABLE_AFFINITY : 1
MV2_ENABLE_LEASTLOAD : 0
MV2_SMP_BATCH_SIZE : 8
MV2_SMP_EAGERSIZE : 65537
MV2_SMPI_LENGTH_QUEUE : 262144
MV2_SMP_NUM_SEND_BUFFER : 256
MV2_SMP_SEND_BUF_SIZE : 131072
MV2_USE_SHARED_MEM : 1
MV2_CUDA_BLOCK_SIZE : 0
MV2_CUDA_NUM_RNDV_BLOCKS : 8
MV2_CUDA_VECTOR_OPT : 1
MV2_CUDA_KERNEL_OPT : 1
MV2_EAGER_CUDAHOST_REG : 0
MV2_USE_CUDA : 1
MV2_CUDA_NUM_EVENTS : 64
MV2_CUDA_IPC : 1
MV2_CUDA_IPC_THRESHOLD : 0
MV2_CUDA_ENABLE_IPC_CACHE : 0
MV2_CUDA_IPC_MAX_CACHE_ENTRIES : 1
MV2_CUDA_IPC_NUM_STAGE_BUFFERS : 2
MV2_CUDA_IPC_STAGE_BUF_SIZE : 524288
MV2_CUDA_IPC_BUFFERED : 1
MV2_CUDA_IPC_BUFFERED_LIMIT : 33554432
MV2_CUDA_IPC_SYNC_LIMIT : 16384
MV2_CUDA_USE_NAIVE : 1
MV2_CUDA_REGISTER_NAIVE_BUF : 524288
MV2_CUDA_GATHER_NAIVE_LIMIT : 32768
MV2_CUDA_SCATTER_NAIVE_LIMIT : 2048
MV2_CUDA_ALLGATHER_NAIVE_LIMIT : 1048576
MV2_CUDA_ALLGATHERV_NAIVE_LIMIT : 524288
MV2_CUDA_ALLTOALL_NAIVE_LIMIT : 262144
MV2_CUDA_ALLTOALLV_NAIVE_LIMIT : 262144
MV2_CUDA_BCAST_NAIVE_LIMIT : 2097152
MV2_CUDA_GATHERV_NAIVE_LIMIT : 0
MV2_CUDA_SCATTERV_NAIVE_LIMIT : 16384
MV2_CUDA_ALLTOALL_DYNAMIC : 1
MV2_CUDA_ALLGATHER_RD_LIMIT : 1024
MV2_CUDA_ALLGATHER_FGP : 1
MV2_SMP_CUDA_PIPELINE : 1
MV2_CUDA_INIT_CONTEXT : 1
MV2_SHOW_ENV_INFO : 2
MV2_DEFAULT_PUT_GET_LIST_SIZE : 200
MV2_EAGERSIZE_1SC : 0
MV2_GET_FALLBACK_THRESHOLD : 0
MV2_PIN_POOL_SIZE : 2097152
MV2_PUT_FALLBACK_THRESHOLD : 0
MV2_ASYNC_THREAD_STACK_SIZE : 1048576
MV2_THREAD_YIELD_SPIN_THRESHOLD : 5
MV2_USE_HUGEPAGES : 1
and Configurations:
mpiname -a
MVAPICH2 2.0 Fri Jun 20 20:00:00 EDT 2014 ch3:mrail
Compilation
CC: gcc -DNDEBUG -DNVALGRIND -O2
CXX: g++ -DNDEBUG -DNVALGRIND
F77: no -L/lib -L/lib
FC: no
Configuration
-with-device=ch3:mrail --with-rdma=gen2 --enable-cuda --disable-f77 --disable-fc --disable-mcast
The program runs on 2 processes:
mpirun_rsh -hostfile hosts -n 2 MV2_USE_CUDA=1 MV2_SHOW_ENV_INFO=2 ./myTest
Any ideas?

The MPI Standard specifies that
A send that uses the ready communication mode may be started only if the matching receive is already posted. Otherwise, the operation is erroneous and its outcome is undefined.
In this program there is no guarantee that the Recv will be posted before the Rsend, so the operation may fail or hang.

I have run this on my laptop with 781.2 KiB without any deadlock. Ran it on a Blue Gene/Q with 781.2 KiB without any deadlock. So, thanks for the short test case, but I'm sorry I cannot reproduce your issue. Maybe it's specific to infiniband?
The general solution in this case is to post non-blocking sends and receives. I can provide code, but you're asking about ready-send and the eager threshold, so I'm pretty sure you know about those already and must have a good reason not to use them...

I just ran your test case using MVAPICH2-2.0 on an InfiniBand system, and I was not able to reproduce the hang. Would you be able to post a debug trace of the process which is hanging?
$ gdb attach <PID>
gdb> thread apply all bt

Related

How does flow classify example in DPDK works?

I want to test the flow classify example in DPDK 20.08 and I'm trying to modify the given ACL rules file to match all the TCP packets.
#file format:
#src_ip/masklen dst_ip/masklen src_port : mask dst_port : mask proto/mask priority
#
2.2.2.3/24 2.2.2.7/24 32 : 0xffff 33 : 0xffff 17/0xff 0
9.9.9.3/24 9.9.9.7/24 32 : 0xffff 33 : 0xffff 17/0xff 1
9.9.9.3/24 9.9.9.7/24 32 : 0xffff 33 : 0xffff 6/0xff 2
9.9.8.3/24 9.9.8.7/24 32 : 0xffff 33 : 0xffff 6/0xff 3
6.7.8.9/24 2.3.4.5/24 32 : 0x0000 33 : 0x0000 132/0xff 4
6.7.8.9/32 192.168.0.36/32 10 : 0xffff 11 : 0xffff 6/0xfe 5
6.7.8.9/24 192.168.0.36/24 10 : 0xffff 11 : 0xffff 6/0xfe 6
6.7.8.9/16 192.168.0.36/16 10 : 0xffff 11 : 0xffff 6/0xfe 7
6.7.8.9/8 192.168.0.36/8 10 : 0xffff 11 : 0xffff 6/0xfe 8
#error rules
#9.8.7.6/8 192.168.0.36/8 10 : 0xffff 11 : 0xffff 6/0xfe 9
Should I add a 0.0.0.0/0 0.0.0.0/0 0 : 0x0000 0 : 0x0000 6/0xff 0 rule? I tried, but there are still no packets matching.
ps:
This is the file I'm using.
#file format:
#src_ip/masklen dst_ip/masklen src_port : mask dst_port : mask proto/mask priority
#
2.2.2.3/24 2.2.2.7/24 32 : 0xffff 33 : 0xffff 17/0xff 0
9.9.9.3/24 9.9.9.7/24 32 : 0xffff 33 : 0xffff 17/0xff 1
9.9.9.3/24 9.9.9.7/24 32 : 0xffff 33 : 0xffff 6/0xff 2
9.9.8.3/24 9.9.8.7/24 32 : 0xffff 33 : 0xffff 6/0xff 3
6.7.8.9/24 2.3.4.5/24 32 : 0x0000 33 : 0x0000 132/0xff 4
6.7.8.9/32 192.168.0.36/32 10 : 0xffff 11 : 0xffff 6/0xfe 5
6.7.8.9/24 192.168.0.36/24 10 : 0xffff 11 : 0xffff 6/0xfe 6
6.7.8.9/16 192.168.0.36/16 10 : 0xffff 11 : 0xffff 6/0xfe 7
#6.7.8.9/8 192.168.0.36/8 10 : 0xffff 11 : 0xffff 6/0xfe 8
0.0.0.0/0 0.0.0.0/0 0 : 0x0000 0 : 0x0000 6/0xff 8
#error rules
#9.8.7.6/8 192.168.0.36/8 10 : 0xffff 11 : 0xffff 6/0xfe 9
I ran again, and it goes like:
rule [0] query failed ret [-22]
rule [1] query failed ret [-22]
rule [2] query failed ret [-22]
rule [3] query failed ret [-22]
rule [4] query failed ret [-22]
rule [5] query failed ret [-22]
rule [6] query failed ret [-22]
rule [7] query failed ret [-22]
rule[8] count=2
proto = 6
Segmentation fault
I don't know what is causing the Segmentation fault.
The command is sudo ./build/flow_classify -l 101 --log-level=pmd,8 -- --rule_ipv4="./ipv4_rules_file_pass.txt" > ~/flow_classify_log and I didn't change the source code.
I'm using a two port 82599 NIC. I'm putting the log file down below which contains the output before it shows Segmentation fault
flow_classify log
Sometimes it can process normally in the first iteration, and sometimes it can't.
update 1-3:
I modified the code to stop the packet forwarding and free every single packet received to check if it is the forwarding procedure that is causing the problem.
in main function:
/* if (nb_ports < 2 || (nb_ports & 1))
rte_exit(EXIT_FAILURE, "Error: number of ports must be even\n"); */
if (nb_ports < 1)
rte_exit(EXIT_FAILURE, "Error: no port avaliable\n");
in lcore_main function:
//in lcore_main function
/* Send burst of TX packets, to second port of pair. */
/* const uint16_t nb_tx = rte_eth_tx_burst(port ^ 1, 0,
bufs, nb_rx); */
const uint16_t nb_tx = 0;
/* Free any unsent packets. */
if (unlikely(nb_tx < nb_rx)) {
uint16_t buf;
for (buf = nb_tx; buf < nb_rx; buf++)
rte_pktmbuf_free(bufs[buf]);
}
And this is the new log, but I don't think there is any difference. I'm using only one of the two ports on a single 82599ES NIC. Maybe it's the false classification rule I added that is causing the problem, because it ran okay with the default rule settings.

Flow classify requires:
a minimum of 2 ports, and always an even number of ports.
Flow entries have to be populated in a valid format.
Entry in rules:
2.2.2.3/0 2.2.2.7/0 32 : 0xffff 33 : 0xffff 17/0xff 0
2.2.2.3/0 2.2.2.7/0 32 : 0x0 33 : 0x0 6/0xff 1
Packet send: ipv4-TCP
Log from flow-classify:
rule [0] query failed ret [-22] -- UDP lookup failed
rule[1] count=32 -- TCP lookup success
proto = 6
Virtual NIC: ./build/flow_classify -c 8 --no-pci --vdev=net_tap0 --vdev=net_tap1 -- --rule_ipv4="ipv4_rules_file.txt"
Physical NIC: ./build/flow_classify -c 8 -- --rule_ipv4="ipv4_rules_file.txt"
Hence the issue you are facing is caused by:
an incorrect configuration
using only 1 port

(C++) Division of a negative double with a positive integer gives me positive result

I was assigned to do a small project in a programming class using a NUCLEO-F401RE development board. I chose to program using Mbed Studio, as I think it is easier than programming in STM32CubeIDE. However, I encountered a problem: sometimes, when I divide a negative floating-point number by a positive integer, the result turns out to be positive. I decided to write a program to test this division.
#include "mbed.h"
// main() runs in its own thread in the OS
//
// Prints -21.837046 divided by 1, 2, 3, ... forever, one quotient per line.
int main()
{
    // Negative numerator, so every quotient is expected to be negative.
    const double numerator = -21.837046;
    int i = 1;
    while (true) {
        // The divisor is read twice in this call, so it must not be modified
        // in the same argument list: mixing `i` and `i++` there is undefined
        // before C++17 and of unspecified order since. Increment afterwards.
        printf("Divided by %d : %f\n", i, numerator / i);
        ++i;
    }
}
Here is the result for the first 40 iteration
Divided by 1 : -21.837047
Divided by 2 : -10.918524
Divided by 3 : -7.279016
Divided by 4 : -5.459262
Divided by 5 : -4.367410
Divided by 6 : -3.639508
Divided by 7 : -3.119579
Divided by 8 : -2.729631
Divided by 9 : -2.426339
Divided by 10 : -2.183705
Divided by 11 : -1.985187
Divided by 12 : -1.819754
Divided by 13 : -1.679773
Divided by 14 : -1.559790
Divided by 15 : -1.455804
Divided by 16 : -1.364816
Divided by 17 : -1.284533
Divided by 18 : -1.213170
Divided by 19 : -1.149319
Divided by 20 : -1.091853
Divided by 21 : -1.039860
Divided by 22 : 0.992594
Divided by 23 : 0.949437
Divided by 24 : 0.909877
Divided by 25 : 0.873482
Divided by 26 : 0.839887
Divided by 27 : 0.808780
Divided by 28 : 0.779895
Divided by 29 : 0.753002
Divided by 30 : 0.727902
Divided by 31 : 0.704421
Divided by 32 : 0.682408
Divided by 33 : 0.661729
Divided by 34 : 0.642267
Divided by 35 : 0.623916
Divided by 36 : 0.606585
Divided by 37 : 0.590191
Divided by 38 : 0.574660
Divided by 39 : 0.559925
Divided by 40 : 0.545927
Notice that from i = 1 to i = 21 the results were negative as expected, but for i >= 22 the results were positive numbers. Interestingly, when I changed the numerator to a different number such as -30, the results switched from negative to positive at i >= 31. Different numerators have different values of i at which the results switch sign.
Divided by 1 : -30.000000
Divided by 2 : -15.000000
Divided by 3 : -10.000000
Divided by 4 : -7.500001
Divided by 5 : -6.000000
Divided by 6 : -5.000000
Divided by 7 : -4.285715
Divided by 8 : -3.750001
Divided by 9 : -3.333334
Divided by 10 : -3.000000
Divided by 11 : -2.727273
Divided by 12 : -2.500001
Divided by 13 : -2.307693
Divided by 14 : -2.142858
Divided by 15 : -2.000000
Divided by 16 : -1.875001
Divided by 17 : -1.764706
Divided by 18 : -1.666667
Divided by 19 : -1.578948
Divided by 20 : -1.500001
Divided by 21 : -1.428572
Divided by 22 : -1.363637
Divided by 23 : -1.304348
Divided by 24 : -1.250001
Divided by 25 : -1.200000
Divided by 26 : -1.153847
Divided by 27 : -1.111112
Divided by 28 : -1.071429
Divided by 29 : -1.034483
Divided by 30 : -1.000000
Divided by 31 : 0.967742
Divided by 32 : 0.937501
Divided by 33 : 0.909091
Divided by 34 : 0.882353
Divided by 35 : 0.857143
Divided by 36 : 0.833334
Divided by 37 : 0.810811
Divided by 38 : 0.789474
Divided by 39 : 0.769231
Divided by 40 : 0.750001

C++ GetProcessTimes() does not change value over time

I'm writing a C++ function to get the CPU usage of a specific process in Windows.
Many references (like this) suggest using the function GetProcessTimes for the implementation.
However, I tried it with a sample program, but the values of KernelTime and UserTime do not change over time. Below is my code:
#include <iostream>
#include <Windows.h>
// Samples process times ten times, 250 ms apart, and prints them as
// "kernel : user". Returns -1 if OpenProcess or GetProcessTimes fails.
int main()
{
// Hard-coded ID of the process that is meant to be inspected.
int processID = 14532;
// Handle to the target process; it is never closed in this program.
HANDLE processHandle = OpenProcess(PROCESS_QUERY_INFORMATION, FALSE, processID);
if (processHandle == NULL) {
return -1;
}
FILETIME ftProcCreation, ftProcExit, ftProcKernel, ftProcUser;
for (int i = 0; i < 10; i++) {
// NOTE(review): the handle passed here is GetCurrentProcess(), not the
// processHandle opened above, so the times read belong to this program.
if (!GetProcessTimes(GetCurrentProcess(), &ftProcCreation,
&ftProcExit, &ftProcKernel, &ftProcUser)) {
return -1;
}
// Combine the two 32-bit halves of each FILETIME into one 64-bit value.
LARGE_INTEGER lKernel, lUser;
lKernel.LowPart = ftProcKernel.dwLowDateTime;
lKernel.HighPart = ftProcKernel.dwHighDateTime;
lUser.LowPart = ftProcUser.dwLowDateTime;
lUser.HighPart = ftProcUser.dwHighDateTime;
printf("%lld : %lld\n", lKernel.QuadPart, lUser.QuadPart);
Sleep(250);
}
}
The process that I inspect is a running Virtualbox process that always take about 20% of CPU.
However, when I run the sample code, the result is as below:
0 : 0
0 : 0
0 : 0
0 : 0
0 : 0
0 : 0
0 : 0
0 : 0
0 : 0
0 : 0
Sometimes, it might give following result:
312500 : 0
312500 : 0
312500 : 0
312500 : 0
312500 : 0
312500 : 0
312500 : 0
312500 : 0
312500 : 0
312500 : 0
Again, it might give following result:
156250 : 0
156250 : 0
156250 : 0
156250 : 0
156250 : 0
156250 : 0
156250 : 0
156250 : 0
156250 : 0
156250 : 0
Or might be:
0 : 156250
0 : 156250
0 : 156250
0 : 156250
0 : 156250
0 : 156250
0 : 156250
0 : 156250
0 : 156250
0 : 156250
And so on, but the value "0 : 0" is the most frequent.
Is there anything wrong with my code, so that it cannot get the Kernel/Idle time of the process? And why does the value not change over time?
=======

You're querying the time of the current process (GetCurrentProcess()), not the target process (processHandle). Since GetProcessTimes() returns the CPU time consumed by the process, but your process isn't consuming much time as it is mostly sleeping, the result changes very slowly.
So, pass processHandle instead of GetCurrentProcess() to GetProcessTimes().

Why is LCOV ignoring lines?

I tried running code coverage to see how good my test cases are. Then I found out that some lines are ignored by LCOV, like lines 6, 7, 10 and 13 below. Why does LCOV not take these into account, when lines like line 6 (class domath {) clearly ran during the test?
1 : #include "ros/ros.h"
2 : #include "gtest/gtest.h"
3 :
4 :
5 :
6 : class domath {
7 : public:
8 1 : int multiply(int a, int b) {return a * b;}
9 :
10 : };
11 :
12 5 : TEST(TestSuite, multiply_test_case1){
13 : domath math1;
14 1 : int a = 3;
15 1 : int b = 4;
16 1 : int c = math1.multiply(a,b);
17 1 : EXPECT_EQ(c,12) << "value should be 12";
18 :
19 1 : }
20 :
21 :
22 1 : int main(int argc, char **argv){
23 :
24 1 : testing::InitGoogleTest(&argc , argv);
25 1 : return RUN_ALL_TESTS();
26 :
27 3 : }
28 :

H264 getting frame height and width from sequence parameter set (SPS) NAL unit

I've been trying to find out how to calculate the width and height from an SPS NAL unit. I have an H264 video which has these parameters
h264 (High), yuvj420p(pc), 1280x720 [SAR 1:1 DAR 16:9], 20 fps, 20 tbr, 1200k tbn, 40 tbc
I've been searching for a formula which would calculate the width (1280) and height (720), but haven't found any that helps. Right now I'm using this formula; it works for most H264 streams, but in this case the computed width and height are 80x48
if(frame_cropping_flag) {
width = ((pic_width_in_mbs_minus1 +1)*16) - frame_crop_left_offset*2 - frame_crop_right_offset*2;
height= ((2 - frame_mbs_only_flag)* (pic_height_in_map_units_minus1 +1) * 16) - (frame_crop_top_offset * 2) - (frame_crop_bottom_offset * 2);
}
else {
width = ((pic_width_in_mbs_minus1 +1)*16);
height= ((2 - frame_mbs_only_flag)* (pic_height_in_map_units_minus1 +1) * 16);
}
here is SPS as base64
Z2QAKa2EBUViuKxUdCAqKxXFYqOhAVFYrisVHQgKisVxWKjoQFRWK4rFR0ICorFcVio6ECSFITk8nyfk/k/J8nm5s00IEkKQnJ5Pk/J/J+T5PNzZprQCgC3YCqQAAAMB4AAASwGBAAH0AAADAjKAve+F4RCNQA==
here is SPS that I've parsed:
======= SPS =======
profile_idc : 100
constraint_set0_flag : 0
constraint_set1_flag : 0
constraint_set2_flag : 0
constraint_set3_flag : 0
constraint_set4_flag : 0
constraint_set5_flag : 0
reserved_zero_2bits : 0
level_idc : 41
seq_parameter_set_id : 0
chroma_format_idc : 1
separate_colour_plane_flag : 0
bit_depth_luma_minus8 : 0
bit_depth_chroma_minus8 : 0
qpprime_y_zero_transform_bypass_flag : 0
seq_scaling_matrix_present_flag : 1
log2_max_frame_num_minus4 : 41
pic_order_cnt_type : 4
log2_max_pic_order_cnt_lsb_minus4 : 0
delta_pic_order_always_zero_flag : 0
offset_for_non_ref_pic : 0
offset_for_top_to_bottom_field : 0
num_ref_frames_in_pic_order_cnt_cycle : 0
num_ref_frames : 2
gaps_in_frame_num_value_allowed_flag : 0
pic_width_in_mbs_minus1 : 4
pic_height_in_map_units_minus1 : 2
frame_mbs_only_flag : 1
mb_adaptive_frame_field_flag : 0
direct_8x8_inference_flag : 0
frame_cropping_flag : 0
frame_crop_left_offset : 0
frame_crop_right_offset : 0
frame_crop_top_offset : 0
frame_crop_bottom_offset : 0
vui_parameters_present_flag : 0
=== VUI ===
aspect_ratio_info_present_flag : 0
aspect_ratio_idc : 0
sar_width : 0
sar_height : 0
overscan_info_present_flag : 0
overscan_appropriate_flag : 0
video_signal_type_present_flag : 0
video_format : 0
video_full_range_flag : 0
colour_description_present_flag : 0
colour_primaries : 0
transfer_characteristics : 0
matrix_coefficients : 0
chroma_loc_info_present_flag : 0
chroma_sample_loc_type_top_field : 0
chroma_sample_loc_type_bottom_field : 0
timing_info_present_flag : 0
num_units_in_tick : 0
time_scale : 0
fixed_frame_rate_flag : 0
nal_hrd_parameters_present_flag : 0
vcl_hrd_parameters_present_flag : 0
low_delay_hrd_flag : 0
pic_struct_present_flag : 0
bitstream_restriction_flag : 0
motion_vectors_over_pic_boundaries_flag : 0
max_bytes_per_pic_denom : 0
max_bits_per_mb_denom : 0
log2_max_mv_length_horizontal : 0
log2_max_mv_length_vertical : 0
num_reorder_frames : 0
max_dec_frame_buffering : 0
=== HRD ===
cpb_cnt_minus1 : 0
bit_rate_scale : 0
cpb_size_scale : 0
bit_rate_value_minus1[0] : 0
cpb_size_value_minus1[0] : 0
cbr_flag[0] : 0
initial_cpb_removal_delay_length_minus1 : 0
cpb_removal_delay_length_minus1 : 0
dpb_output_delay_length_minus1 : 0
time_offset_length : 0
I guess it has something to do with the luma and chroma macroblock sizes. I've been able to calculate SubWidthC/SubHeightC and MbWidthC/MbHeightC, but I'm still confused about what to do next.

Hello, first of all, you are parsing the SPS incorrectly, so you need to fix that. If you parse it correctly, you will have
pic_width_in_mbs_minus1 : 79
pic_height_in_map_units_minus1 : 44
frame_mbs_only_flag : 1
frame_cropping_flag : 0
If you calculate the width and height using your formula, you will actually get 1280x720.
Anyway, you should calculate the height and width using SubWidthC and SubHeightC as follows:
// Derive the cropped luma width and height from a parsed SPS.
//
// SubWidthC / SubHeightC are the chroma subsampling factors. They default to
// 1 so that they are never read uninitialised and so that monochrome and
// separate-colour-plane streams crop in units of one luma sample.
int SubWidthC = 1;
int SubHeightC = 1;
// ChromaArrayType is 0 when the colour planes are coded separately,
// otherwise it equals chroma_format_idc.
int ChromaArrayType = sps->separate_colour_plane_flag ? 0 : sps->chroma_format_idc;
if (ChromaArrayType == 1) { //4:2:0
    SubWidthC = SubHeightC = 2;
}
else if (ChromaArrayType == 2) { //4:2:2
    SubWidthC = 2;
    SubHeightC = 1;
}
// ChromaArrayType 3 (4:4:4) and 0 (monochrome / separate planes) keep 1 x 1.

int PicWidthInMbs = sps->pic_width_in_mbs_minus1 + 1;
int PicHeightInMapUnits = sps->pic_height_in_map_units_minus1 + 1;
// A map unit covers two macroblock rows when fields may be coded.
int FrameHeightInMbs = (2 - sps->frame_mbs_only_flag) * PicHeightInMapUnits;

// The crop offsets are only meaningful when frame_cropping_flag is set.
int crop_left = 0;
int crop_right = 0;
int crop_top = 0;
int crop_bottom = 0;
if (sps->frame_cropping_flag) {
    crop_left = sps->frame_crop_left_offset;
    crop_right = sps->frame_crop_right_offset;
    crop_top = sps->frame_crop_top_offset;
    crop_bottom = sps->frame_crop_bottom_offset;
}

// Crop offsets are expressed in crop units, not luma samples.
int CropUnitX = SubWidthC;
int CropUnitY = SubHeightC * (2 - sps->frame_mbs_only_flag);
int width = PicWidthInMbs * 16 - CropUnitX * (crop_left + crop_right);
int height = FrameHeightInMbs * 16 - CropUnitY * (crop_top + crop_bottom);

we now have an H.264 SPS parser in librem:
https://github.com/creytiv/rem/blob/master/include/rem_h264.h#L52
it can be used like this, to extract the resolution:
struct h264_sps sps;
struct vidsz vidsz;
h264_sps_decode(&sps, buf, len);
h264_sps_resolution(&sps, vidsz);
printf("resolution: %u x %u\n", vidsz.w, vidsz.h);

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

MVAPICH hangs on MPI_Send for message larger than eager threshold - c++

I just ran your test case using MVAPICH2-2.0 on an InfiniBand system, and I was not able to reproduce the hang. Would you be able to post a debug trace of the process which is hanging? $ gdb attach <PID> gdb> thread apply all bt

Related

How does flow classify example in DPDK works?

(C++) Division of a negative double with a positive integer gives me positive result

C++ GetProcessTimes() does not change value over time

Why is LCOV ignoring lines?

H264 getting frame height and width from sequence parameter set (SPS) NAL unit

Categories

Resources