I am having a mental block as to how to calculate the mean values of one column over a range of values in another column, the range of which is determined by Start and end values in another data frame.
I know that this works for a single value:
mean(df$M100[df$Wavenumber..cm.1.==3997.96546], na.rm = T)
My problem is that the [df$Wavenumber..cm.1.== ...] needs to be over a range of values in another column in df (say 3963.68494:3953.68645), where 1st value is from df2$Start and the 2nd value is from df2$End. I have included examples of my two data sets below. I hope to be able to calculate a mean value for each M column in df over the Start and End ranges given in df2, so you would end up with 1 mean value per M column in df for each row of df2$Start & df2$End.
df2: structure(list(Location = c(3960.82823, 3923.691, 3919.40593,
3907.97909, 3886.55377), Height = c(0.163744751, 0.231555472,
0.232150996, 0.192475738, 0.162966924), Start = c(3963.68494,
3946.54468, 3920.83429, 3909.40745, 3895.1239), End = c(3953.68645,
3920.83429, 3909.40745, 3895.1239, 3883.69706)), row.names = c(NA,
5L), class = "data.frame")
df:structure(list(Wavenumber..cm.1. = c(3997.96546, 3996.5371, 3995.10875,
3993.68039, 3992.25204), M100 = c(0.00106, 0.00105, 0.00095,
0.00075, 0.00053), M101 = c(0.00081, 0.00092, 0.00102, 0.001,
0.00082), M102 = c(0.00099, 0.00109, 0.00105, 9e-04, 0.00072),
M103 = c(0.00101, 0.00111, 0.0012, 0.00129, 0.00133), M104 = c(0.00081,
0.00083, 0.00084, 0.00086, 0.00089), M105 = c(0.00139, 0.00113,
0.00092, 0.00089, 0.00102), M106 = c(0.00095, 0.00103, 0.00095,
0.00074, 0.00058), M107 = c(0.00054, 0.00058, 0.00059, 0.00049,
0.00032), M108 = c(0.00042, 5e-04, 5e-04, 0.00034, 0.00011
), M109 = c(0.00069, 0.00051, 0.00043, 0.00051, 0.00065),
M110 = c(0.00113, 0.00121, 0.00124, 0.00116, 0.00099), M111 = c(0.00039,
0.00056, 0.00068, 0.00068, 0.00056), M112 = c(0.0011, 0.00112,
0.00112, 0.00108, 0.00099), M113 = c(3e-04, 3e-04, 3e-04,
0.00027, 0.00019), M114 = c(0.00029, 6e-05, -2e-05, 9e-05,
0.00028), M115 = c(0.00091, 0.00079, 0.00061, 0.00038, 2e-04
), M116 = c(0.00117, 0.00105, 0.00096, 0.00092, 0.00092),
M117 = c(0.00039, 2e-04, 6e-05, 6e-05, 0.00018), M118 = c(0.00096,
0.00073, 0.00055, 0.00047, 0.00049), M119 = c(0.00037, 0.00031,
0.00024, 0.00018, 0.00018), M120 = c(0.00116, 0.00098, 0.00084,
0.00076, 0.00067), M121 = c(0.00039, 0.00024, 0.00011, 7e-05,
0.00011), M122 = c(0.00032, 0.00038, 0.00045, 0.00044, 0.00035
), M123 = c(9e-04, 0.00097, 0.00108, 0.0012, 0.00128), M124 = c(-0.00082,
-0.00065, -0.00049, -0.00037, -0.00036), M125 = c(0.00053,
0.00054, 0.00055, 6e-04, 0.00071), M126 = c(7e-05, 0.00022,
0.00022, 0.00011, 2e-05), M127 = c(0.00086, 9e-04, 0.00086,
0.00073, 0.00058), M128 = c(0.00089, 0.00078, 0.00069, 0.00057,
0.00043), M129 = c(0.00094, 0.00097, 0.00106, 0.00114, 0.00105
), M130 = c(0.0013, 0.00118, 0.00115, 0.00116, 0.00111),
M131 = c(0.00029, 0.00033, 0.00033, 3e-04, 0.00022), M132 = c(0,
0.00026, 0.00048, 6e-04, 0.00063), M133 = c(3e-05, -6e-05,
-6e-05, 5e-05, 0.00019), M134 = c(0.00056, 0.00054, 0.00052,
0.00054, 0.00057), M135 = c(2e-05, -4e-05, 6e-05, 0.00031,
0.00057), M136 = c(0.00083, 0.00075, 0.00068, 0.00068, 0.00073
), M137 = c(0.00064, 0.00074, 0.00084, 0.00095, 0.00105),
M139 = c(0.00044, 0.00044, 0.00042, 0.00043, 0.00047), M140 = c(0.00138,
0.00113, 0.00102, 0.0011, 0.00121), M141 = c(0.00062, 0.00043,
2e-04, 2e-05, 0), M142 = c(-0.00022, -0.00017, -0.00014,
-1e-04, 0), M143 = c(0.00109, 0.00108, 0.00103, 0.00093,
0.00087), M144 = c(0.00104, 0.00116, 0.00117, 0.00105, 0.00085
), M145 = c(7e-04, 0.00096, 0.00109, 0.00098, 0.00069), M146 = c(0.0014,
0.00158, 0.00165, 0.00154, 0.0013), M147 = c(6e-04, 0.00071,
0.00075, 0.00072, 0.00065), M148 = c(0.00098, 0.00093, 0.00091,
9e-04, 0.00088), M149 = c(0.00055, 0.00058, 0.00054, 0.00037,
0.00017), M150 = c(7e-04, 0.00068, 8e-04, 0.00107, 0.00132
), M151 = c(0.00037, 0.00042, 0.00046, 0.00047, 0.00046),
M152 = c(0.00047, 0.00042, 0.00043, 0.00045, 0.00045), M153 = c(0.00095,
0.00088, 0.00083, 8e-04, 0.00072), M154 = c(6e-05, 0.00013,
0.00032, 0.00054, 0.00062), M155 = c(0.00061, 0.00057, 0.00043,
0.00022, 4e-05), M156 = c(0.00077, 0.00078, 0.00071, 0.00052,
0.00025), M157 = c(0.00088, 0.00078, 0.00069, 0.00063, 0.00058
), M158 = c(0.00091, 0.00085, 0.00082, 0.00081, 8e-04), M159 = c(0.00078,
0.00076, 0.00073, 0.00074, 0.00079), M160 = c(0.00068, 7e-04,
0.00075, 8e-04, 0.00079), M161 = c(0.00055, 0.00073, 0.00082,
0.00085, 9e-04), M162 = c(0.00104, 0.00111, 0.0011, 0.00104,
0.00102), M163 = c(0.00076, 0.00071, 0.00069, 0.00068, 0.00067
), M164 = c(0.0012, 0.00133, 0.00154, 0.00174, 0.00177),
M165 = c(0.00072, 0.00073, 0.00072, 0.00074, 0.00083), M166 = c(0.00067,
0.00055, 0.00035, 0.00012, -2e-05), M167 = c(0.00068, 0.00053,
0.00047, 0.00051, 0.00059), M168 = c(0.00067, 0.00092, 0.001,
0.00087, 0.00067), M169 = c(0.00124, 0.00107, 0.00101, 0.00108,
0.00118), M170 = c(0.00054, 0.00064, 0.00069, 0.00066, 0.00053
), M171 = c(0.00029, 3e-04, 3e-04, 0.00031, 3e-04), M172 = c(0.00085,
0.00091, 0.00082, 0.00063, 0.00052), M173 = c(0.00022, 0.00036,
0.00053, 0.00061, 0.00056), M174 = c(5e-04, 0.00031, 0.00021,
0.00023, 0.00031), M175 = c(0.00074, 0.00066, 0.00059, 0.00051,
0.00043), M176 = c(9e-04, 0.00062, 0.00044, 0.00039, 0.00039
), M177 = c(0.00045, 0.00038, 0.00033, 0.00035, 0.00043),
M178 = c(0.00075, 0.00092, 0.00097, 0.00086, 0.00067), M179 = c(0.00047,
0.00033, 0.00026, 3e-04, 0.00037), M180 = c(0.00083, 0.00077,
0.00074, 0.00074, 7e-04), M181 = c(0.0013, 0.00138, 0.00137,
0.00127, 0.00109), M182 = c(0.00062, 0.00049, 0.00043, 0.00042,
0.00038), M183 = c(0.00056, 4e-04, 0.00034, 0.00046, 0.00065
), M184 = c(0.00122, 0.00116, 0.00096, 0.00067, 0.00039),
M185 = c(0.00045, 0.00026, 0.00012, 1e-04, 0.00024), M187 = c(0.00078,
0.00038, 8e-05, 0, 0.00014)), row.names = c(NA, 5L), class = "data.frame")
I have a program which reads a 10 MByte file and processes the data as the data is being read in 4K chunks. The test usually takes 1 min - 2 min. But there are some instances when the program takes more than 10 min, at which point the test it killed and a core is generated. Following is the code that reads the file:
string filename("data.out");
ifstream ifs;
vector<char> buf(4096);
ifs.open(filename, ios::in | ios::binary);
if (!ifs.is_open()) {
cout << "ERROR : " << filename << "can't be opened." << endl;
VERIFY(ifs.is_open());
}
while (!ifs.eof()) {
ifs.read(buf.data(), buf.size()); <======== Line 1
process_data (buf.data(), ifs.gcount()); <======== Line 2
}
ifs.close();
I have two cores that show the program is stuck at Line 1 and Line 2.
Top of bt of core1 at Line 1:
#0 0x00007f942a462175 in std::istream::read (this=0x7fff4ce69de0,
__s=0x9120000 "\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324\324"..., __n=4096) at /home/packages/gcc/4.7/w/gcc-4.7-4.7.2/build/x86_64-linux-gnu/libstdc++-v3/include/bits/istream.tcc:651
Top of bt of core2 at Line 2:
#0 0x00000000004375f3 in std::__addressof<char> (__r=#0x7fa3176391a6: -128 '\200') at /usr/include/c++/4.7/bits/move.h:47
#1 0x0000000000436cd4 in std::vector<char, std::allocator<char> >::data (this=0x7fff346ad770)
at /usr/include/c++/4.7/bits/stl_vector.h:859
Initially, from core1, I thought the issue was with ifs.read() taking a long time. But after the second core, I am thinking the issue might be related to vector::data().
Is there a way I can tell if any part of the file has been read, by inspecting certain fields (e.g file offset) stored in ifstream.
I don't like posting dump of large structure, but here it is if someone can shed some light how I can figure out from this dump how much of the 10MB has been read.
(gdb) p ifs
$3 = warning: can't find linker symbol for virtual table for `std::basic_ifstream<char, std::char_traits<char> >' value
{
<std::basic_istream<char, std::char_traits<char> >> = {
<std::basic_ios<char, std::char_traits<char> >> = {
<std::ios_base> = {
_vptr.ios_base = 0xfbfcc0,
static boolalpha = std::_S_boolalpha,
static dec = std::_S_dec,
static fixed = std::_S_fixed,
static hex = std::_S_hex,
static internal = std::_S_internal,
static left = std::_S_left,
static oct = std::_S_oct,
static right = std::_S_right,
static scientific = std::_S_scientific,
static showbase = std::_S_showbase,
static showpoint = std::_S_showpoint,
static showpos = std::_S_showpos,
static skipws = std::_S_skipws,
static unitbuf = std::_S_unitbuf,
static uppercase = std::_S_uppercase,
static adjustfield = std::_S_adjustfield,
static basefield = std::_S_basefield,
static floatfield = std::_S_floatfield,
static badbit = std::_S_badbit,
static eofbit = std::_S_eofbit,
static failbit = std::_S_failbit,
static goodbit = std::_S_goodbit,
static app = std::_S_app,
static ate = std::_S_ate,
static binary = std::_S_bin,
static in = std::_S_in,
static out = std::_S_out,
static trunc = std::_S_trunc,
static beg = std::_S_beg,
static cur = std::_S_cur,
static end = std::_S_end,
_M_precision = 6,
_M_width = 0,
_M_flags = 4098,
_M_exception = std::_S_goodbit,
_M_streambuf_state = 5,
_M_callbacks = 0x0,
_M_word_zero = {
_M_pword = 0x0,
_M_iword = 0
},
_M_local_word = {{
_M_pword = 0x0,
_M_iword = 0
}, {
_M_pword = 0x0,
_M_iword = 0
}, {
_M_pword = 0x0,
_M_iword = 0
}, {
_M_pword = 0x0,
_M_iword = 0
}, {
_M_pword = 0x0,
_M_iword = 0
}, {
_M_pword = 0x0,
_M_iword = 0
}, {
_M_pword = 0x0,
_M_iword = 0
}, {
_M_pword = 0x0,
_M_iword = 0
}},
_M_word_size = 8,
_M_word = 0x7fff4ce69f20,
_M_ios_locale = {
static none = 0,
static ctype = 1,
static numeric = 2,
static collate = 4,
static time = 8,
static monetary = 16,
static messages = 32,
static all = 63,
_M_impl = 0x7f942a6e3aa0,
static _S_classic = 0x7f942a6e3aa0,
static _S_global = 0x7f942a6e3aa0,
static _S_categories = 0x7f942a6c86a0,
static _S_once = 2
}
},
members of std::basic_ios<char, std::char_traits<char> >:
_M_tie = 0x0,
_M_fill = 0 '\000',
_M_fill_init = false,
_M_streambuf = 0x7fff4ce69df0,
_M_ctype = 0x7f942a6e3d20,
_M_num_put = 0x7f942a6e4040,
_M_num_get = 0x7f942a6e4030
},
members of std::basic_istream<char, std::char_traits<char> >:
_vptr.basic_istream = 0xfbfc98,
_M_gcount = 0
},
members of std::basic_ifstream<char, std::char_traits<char> >:
_M_filebuf = warning: can't find linker symbol for virtual table for `std::basic_filebuf<char, std::char_traits<char> >' value
{
<std::basic_streambuf<char, std::char_traits<char> >> = {
_vptr.basic_streambuf = 0xfc0a70,
_M_in_beg = 0x6306000 "\317\317\317\......320\320\320\320"...,
_M_in_cur = 0x6307fff "",
_M_in_end = 0x6307fff "",
_M_out_beg = 0x0,
_M_out_cur = 0x0,
_M_out_end = 0x0,
_M_buf_locale = {
static none = 0,
static ctype = 1,
static numeric = 2,
static collate = 4,
static time = 8,
static monetary = 16,
static messages = 32,
static all = 63,
_M_impl = 0x7f942a6e3aa0,
static _S_classic = 0x7f942a6e3aa0,
static _S_global = 0x7f942a6e3aa0,
static _S_categories = 0x7f942a6c86a0,
static _S_once = 2
}
},
members of std::basic_filebuf<char, std::char_traits<char> >:
_M_lock = {
__data = {
__lock = 0,
__count = 0,
__owner = 0,
__nusers = 0,
__kind = 0,
__spins = 0,
__list = {
__prev = 0x0,
__next = 0x0
}
},
__size = '\000' <repeats 39 times>,
__align = 0
},
_M_file = {
_M_cfile = 0x70186c0,
_M_cfile_created = true
},
_M_mode = 12,
_M_state_beg = {
__count = 0,
__value = {
__wch = 0,
__wchb = "\000\000\000"
}
},
_M_state_cur = {
__count = 0,
__value = {
__wch = 0,
__wchb = "\000\000\000"
}
},
_M_state_last = {
__count = 0,
__value = {
__wch = 0,
__wchb = "\000\000\000"
}
},
_M_buf = 0x6306000 "\317\317\317\317\317\......320\320\320\320\320"...,
_M_buf_size = 8192,
_M_buf_allocated = true,
_M_reading = true,
_M_writing = false,
_M_pback = 0 '\000',
_M_pback_cur_save = 0x0,
_M_pback_end_save = 0x0,
_M_pback_init = false,
_M_codecvt = 0x7f942a6e3f60,
_M_ext_buf = 0x0,
_M_ext_buf_size = 0,
_M_ext_next = 0x0,
_M_ext_end = 0x0
}
}
(gdb)
Thank you,
Ahmed.
Do not loop on eof.
while (ifs.read(buf.data(), buf.size())) {
size_t read = ifs.gcount();
if(read==0) break; // don't trust passing `0` to `process_data`:
process_data(buf.data(), read);
if (read<buf.size()) break; // if we finished, end.
}
Finding the end of input is best done by attempting io, and noticing something went wrong. In this case, we read, count how many bytes we read, and when we read 0 bytes or have read fewer bytes than we expected to read, we decide there isn't any more data to come.
We also end if any failbit has been set on ifs by the IO operation.