Assignment operator that calls a constructor is broken - c++

I've implemented some of the changes suggested in this question, and (thanks very much) it works quite well, however... in the process I've seemed to break the post-declaration assignment operator. With the following code:
#include <cstdio>
#include "ucpp"
main() {
ustring a = "test";
ustring b = "ing";
ustring c = "- -";
ustring d = "cafe\xcc\x81";
printf("%s\n", (a + b + c[1] + d).encode());
}
I get a nice "testing café" message. However, if I modify the code slightly so that the const char * conversion is done separately, post-declaration:
#include <cstdio>
#include "ucpp"
main() {
ustring a = "test";
ustring b = "ing";
ustring c = "- -";
ustring d;
d = "cafe\xcc\x81";
printf("%s\n", (a + b + c[1] + d).encode());
}
the ustring named d becomes blank, and all that is output is "testing ". My new code has three constructors, one void (which is probably the one being incorrectly used, and is used in the operator+ function), one that takes a const ustring &, and one that takes a const char *. The following is my new library code:
#include <cstdlib>
#include <cstring>
class ustring {
int * values;
long len;
public:
long length() {
return len;
}
ustring() {
len = 0;
values = (int *) malloc(0);
}
ustring(const ustring &input) {
len = input.len;
values = (int *) malloc(sizeof(int) * len);
for (long i = 0; i < len; i++)
values[i] = input.values[i];
}
ustring operator=(ustring input) {
ustring result(input);
return result;
}
ustring(const char * input) {
values = (int *) malloc(0);
long s = 0; // s = number of parsed chars
int a, b, c, d, contNeed = 0, cont = 0;
for (long i = 0; input[i]; i++)
if (input[i] < 0x80) { // ASCII, direct copy (00-7f)
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = input[i];
} else if (input[i] < 0xc0) { // this is a continuation (80-bf)
if (cont == contNeed) { // no need for continuation, use U+fffd
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = 0xfffd;
}
cont = cont + 1;
values[s - 1] = values[s - 1] | ((input[i] & 0x3f) << ((contNeed - cont) * 6));
if (cont == contNeed) cont = contNeed = 0;
} else if (input[i] < 0xc2) { // invalid byte, use U+fffd (c0-c1)
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = 0xfffd;
} else if (input[i] < 0xe0) { // start of 2-byte sequence (c2-df)
contNeed = 1;
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = (input[i] & 0x1f) << 6;
} else if (input[i] < 0xf0) { // start of 3-byte sequence (e0-ef)
contNeed = 2;
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = (input[i] & 0x0f) << 12;
} else if (input[i] < 0xf5) { // start of 4-byte sequence (f0-f4)
contNeed = 3;
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = (input[i] & 0x07) << 18;
} else { // restricted or invalid (f5-ff)
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = 0xfffd;
}
len = s;
}
ustring operator=(const char * input) {
ustring result(input);
return result;
}
ustring operator+(ustring input) {
ustring result;
result.len = len + input.len;
result.values = (int *) malloc(sizeof(int) * result.len);
for (long i = 0; i < len; i++)
result.values[i] = values[i];
for (long i = 0; i < input.len; i++)
result.values[i + len] = input.values[i];
return result;
}
ustring operator[](long index) {
ustring result;
result.len = 1;
result.values = (int *) malloc(sizeof(int));
result.values[0] = values[index];
return result;
}
char * encode() {
char * r = (char *) malloc(0);
long s = 0;
for (long i = 0; i < len; i++) {
if (values[i] < 0x80)
r = (char *) realloc(r, s + 1),
r[s + 0] = char(values[i]),
s += 1;
else if (values[i] < 0x800)
r = (char *) realloc(r, s + 2),
r[s + 0] = char(values[i] >> 6 | 0x60),
r[s + 1] = char(values[i] & 0x3f | 0x80),
s += 2;
else if (values[i] < 0x10000)
r = (char *) realloc(r, s + 3),
r[s + 0] = char(values[i] >> 12 | 0xe0),
r[s + 1] = char(values[i] >> 6 & 0x3f | 0x80),
r[s + 2] = char(values[i] & 0x3f | 0x80),
s += 3;
else
r = (char *) realloc(r, s + 4),
r[s + 0] = char(values[i] >> 18 | 0xf0),
r[s + 1] = char(values[i] >> 12 & 0x3f | 0x80),
r[s + 2] = char(values[i] >> 6 & 0x3f | 0x80),
r[s + 3] = char(values[i] & 0x3f | 0x80),
s += 4;
}
return r;
}
};

operator= should modify *this. The returned value (which you better make a reference) is only used in chaining situations:
a = b = c;
(a = b).foo();
//etc.

Both assignment operators are broken, e.g., this:
ustring operator=(const char * input) {
ustring result(input);
return result;
}
Does nothing to the target object. It just creates a local temporary and returns that. Write them like this instead:
ustring& operator=(ustring input) {
swap(input);
return *this;
}
ustring& operator=(const char * input) {
swap(ustring(input));
return *this;
}
void swap(ustring& s) {
int* tv = values; values = s.values; s.values = tv;
long tl = len; len = s.len; s.len = tl;
}

Related

Copy 80 bit hex number from char array to uint16_t vector or array

Say I have a text file containing the 80bit hex number
0xabcdef0123456789abcd
My C++ program reads that using fstream into a char array called buffer.
But then I want to store it in a uint16_t array such that:
uint16_t * key = {0xabcd, 0xef01, 0x2345, 0x6789, 0xabcd}
I have tried several approaches, but I continue to get decimal integers, for instance:
const std::size_t strLength = strlen(buffer);
std::vector<uint16_t> arr16bit((strLength / 2) + 1);
for (std::size_t i = 0; i < strLength; ++i)
{
arr16bit[i / 2] <<= 8;
arr16bit[i / 2] |= buffer[i];
}
Yields:
arr16bit = {24930, 25444, 25958, 12337, 12851}
There must be an easy way to do this that I'm just not seeing.
Here is the full solution I came up with based on the comments:
int hex_char_to_int(char c) {
if (int(c) < 58) //numbers
return c - 48;
else if (int(c) < 91) //capital letters
return c - 65 + 10;
else if (int(c) < 123) //lower case letters
return c - 97 + 10;
}
uint16_t ints_to_int16(int i0, int i1, int i2, int i3) {
return (i3 * 16 * 16 * 16) + (i2 * 16 * 16) + (i1 * 16) + i0;
}
void readKey() {
const int bufferSize = 25;
char buffer[bufferSize] = { NULL };
ifstream* pStream = new ifstream("key.txt");
if (pStream->is_open() == true)
{
pStream->read(buffer, bufferSize);
}
cout << buffer << endl;
const size_t strLength = strlen(buffer);
int* hex_to_int = new int[strLength - 2];
for (int i = 2; i < strLength; i++) {
hex_to_int[i - 2] = hex_char_to_int(buffer[i]);
}
cout << endl;
uint16_t* key16 = new uint16_t[5];
int j = 0;
for (int i = 0; i < 5; i++) {
key16[i] = ints_to_int16(hex_to_int[j++], hex_to_int[j++], hex_to_int[j++], hex_to_int[j++]);
cout << "0x" << hex << key16[i] << " ";
}
cout << endl;
}
This outputs:
0xabcdef0123456789abcd
0xabcd 0xef01 0x2345 0x6789 0xabcd

C++ base64decode returns junk data if it contains '\0'

We have an certificate file (binary) having '\0' in multiple places. While trying to decode using openssl it gives junk data while size used to be perfect.
Same code works perfectly if there is no '\0' in the base64encoded data
We tried achieving it using below code but still file is not readable
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64(unsigned char c) {
return (isalnum(c) || (c == '+') || (c == '/'));
}
std::string base64_decode(std::string const& encoded_string) {
int in_len = encoded_string.size();
int i = 0;
int j = 0;
int in_ = 0;
int in_1 = 0;
unsigned char char_array_4[4], char_array_3[3];
std::string ret;
std::ofstream outfile;
outfile.open("output_file.pfx", std::ios::binary | std::ios::out);
bool f = isalnum(encoded_string[in_]);
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
char_array_4[i++] = encoded_string[in_]; in_++;
if (i == 4) {
for (i = 0; i < 4; i++)
{
char_array_4[i] = base64_chars.find(char_array_4[i]);
}
char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++)
{
if (char_array_3[i] != NULL)
{
ret += char_array_3[i];
char val = char_array_3[i];
outfile.write(&val, sizeof(char));
}
else
{
/*char str3[3155];
strcpy(str3, ret.c_str());
ret = "";
ret.append(str3, sizeof(str3));*/
ret += "NUL";
char val111 = char_array_3[i];
outfile.write(&val111, sizeof(char));
}
}
i = 0;
}
}
if (i) {
for (j = 0; j < i; j++)
char_array_4[j] = base64_chars.find(char_array_4[j]);
char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
for (j = 0; (j < i - 1); j++)
{
if (char_array_3[i] != NULL)
{
ret += char_array_3[i];
char val1 = char_array_3[i];
outfile.write(&val1, sizeof(char));
}
else
{
ret += "NUL";
char val11 = char_array_3[i];
outfile.write(&val11, sizeof(char));
}
}//ret += char_array_3[j];
}
outfile.close();
return ret;
}
int main()
{
base64_decode("U4tR8mRzyrqpNhhFjkbEe4I6LTYXhL7PxkbddNTQ1yZ6ofZ4s1R/UOQsq6x+CNxB+yddPirwT0yPgtm6IC1qYF9GGQsOqXHkpTrmXf0GiXDVpm91EMnyxtMu74B3OMIYgxmjoeua7HoKQkW6/GRuCpgWIoZQq7uOaKIsc3k9HGgfAFk6vTGER1YJlG28lOhsiGccl0EqD0uhrBGNhFERfAzB2gaJjI1oRO87Q2NbevKHeZycpyXgazvtw9JigA+Hp3+Cy9LUIRvF6k5uv0DKxOs5cynqYslb1LfKqT0IvLjBl4gNHl+pG5/Ur70XzZTiO1+n5jWITPoslZ4slVkl4qiTaqNWHgLT6aSUhWwPlvK+7wlk+st5ykAuSIE2e3Lia+omBRH2LQfG1v7KaOJApF3k4D0li/4QWOJ3zLwBDHB6WCwMQfNS8vTRWM1yIO/o9417wJEpBlcr/B308vGheoTF9+qRKGDe0M5PNHeBbEHhgNkLsKvcS/31HK6Xd36cg85yvyLghQRr9Gyn7TUU5m6f6iSlx3u+yo1vT7BBV6OjbxPklwCIYCZWIIOJq10JXC+bSGPbTKZYXjQW90URKesUOMi9s+DS7BKVEr471AnEyazividrgivfHDNWQisIcOctpDFCfEBAa28PYjIj4KJo5bDkSluRVcVDJVrP2Ns=");
return 0;
}
There is a bug in the handling of the trailing bytes, you are using i as the array index instead of j. As i can be larger than the size of char_array_3 it produces undefined behaviour. The correct code is:
for (j = 0; (j < i - 1); j++)
{
if (char_array_3[j] != NULL)
{
ret += char_array_3[j];
char val1 = char_array_3[j];
outfile.write(&val1, sizeof(char));
}
else
{
ret += "NUL";
char val11 = char_array_3[j];
outfile.write(&val11, sizeof(char));
}
}

Implicit conversion or cast?

I have a function that interleaves the bits of 32 bit words and returns a 64 bit result. For this simple test case, the bottom 3 bytes are correct, and the contents of the top 5 bytes are incorrect. intToBin_32 and intToBin_64 are convenience functions to see the binary representation of the arguments and return val. I've placed casts from the 32 bit type to the 64 bit type everywhere I think they are needed, but I'm still seeing this unexpected (to me, at least) behavior. Is there an implicit conversion going on here, or is there some other reason this doesn't work correctly?
#include <stdint.h>
#include <stdio.h>
struct intString_32 {char bstr [32 + 1 + 8];};
struct intString_64 { char bstr [64 + 1 + 8];};
intString_32 intToBin_32(int a)
{
intString_32 b;
for (int i = 0; i < 8; i++)
{
for (int j = 0; j < 5; j++)
{
if (j != 4)
{
b.bstr[5*i + j] = * ((a & (1 << (31 - (4*i + j)))) ? "1" : "0");
}
else
{
b.bstr[5*i + j] = 0x20;
}
}
}
b.bstr[40] = * ( "\0" );
return b;
}
intString_64 intToBin_64(long long a)
{
intString_64 b;
for (int i = 0; i < 8; i++)
{
for (int j = 0; j < 9; j++)
{
if (j != 8)
{
b.bstr[9*i + j] = * ((a & (1 << (63 - (8*i + j)))) ? "1" : "0");
}
else
{
b.bstr[9*i + j] = 0x20;
}
}
}
b.bstr[72] = * ( "\0" );
return b;
}
uint64_t interleaveBits(unsigned int a, unsigned int b)
{
uint64_t retVal = 0;
for (unsigned int i = 0; i < 32; i++)
{
retVal |= (uint64_t)((uint64_t)((a >> i) & 0x1)) << (2*i);
retVal |= (uint64_t)((uint64_t)((b >> i) & 0x1)) << (2*i + 1);
}
return retVal;
}
int main(int arc, char* argv)
{
unsigned int foo = 0x0004EDC7;
unsigned int bar = 0x5A5A00FF;
uint64_t bat = interleaveBits(foo, bar);
printf("foo: %s \n", intToBin_32(foo).bstr);
printf("bar: %s \n", intToBin_32(bar).bstr);
printf("bat: %s \n\n", intToBin_64(bat).bstr);
}
Through debugging I noticed it's your intToBin_64 which is wrong, to be specific, in this line:
b.bstr[9*i + j] = * ((a & (1 << (63 - (8*i + j)))) ? "1" : "0");
take a closer look on the shift:
(1 << (63 - (8*i + j)))
The literal 1 is a integer, however, shifting a integer by more than 31 bits is undefined behavior. Shift a longlong instead:
b.bstr[9*i + j] = * ((a & (1ll << (63 - (8*i + j)))) ? "1" : "0");

A memory-efficient SHA1 implementation

I'm working with a very restrictive embedded processor, which only has 128 bytes of ram. I'd like to implement SHA1 on it. RFC3174 describes, in 'method 2', a way of implementing SHA1 that doesn't require allocating an array of 80 32-bit words (which, at 320 bytes, is obviously not practical), and seems like it ought to be usable on my processor. I'm unable to find any implementations of 'method 2', though, and the sample code in the RFC only implements the default method.
Is anyone aware of a memory-efficient implementation of SHA1 in C or C++?
You should be able to quickly adapt the method 1 source to method 2. The function to change is Sha1ProcessMessageBlock() in method 1. Initialize w[0:15] from message, then do a loop of 0 to 79, where you only do w[] manipulation after iteration 16, and temp calculation depends on ts value (0-19 uses one, 20-39 uses another, etc). The important thing to remember is using index%16 or index & 0x0f whenever you are addressing the w[] array.
A quick modification would be something like this (double check all accesses to w to make sure I haven't missed the t & 0x0f):
void SHA1ProcessMessageBlock(SHA1Context *context)
{
const uint32_t K[] = { /* Constants defined in SHA-1 */
0x5A827999,
0x6ED9EBA1,
0x8F1BBCDC,
0xCA62C1D6
};
int t; /* Loop counter */
uint32_t temp; /* Temporary word value */
uint32_t W[16]; /* Word sequence */
uint32_t A, B, C, D, E; /* Word buffers */
/*
* Initialize the first 16 words in the array W. You can move this to your
* context.
*/
for(t = 0; t < 16; t++)
{
W[t] = context->Message_Block[t * 4] << 24;
W[t] |= context->Message_Block[t * 4 + 1] << 16;
W[t] |= context->Message_Block[t * 4 + 2] << 8;
W[t] |= context->Message_Block[t * 4 + 3];
}
A = context->Intermediate_Hash[0];
B = context->Intermediate_Hash[1];
C = context->Intermediate_Hash[2];
D = context->Intermediate_Hash[3];
E = context->Intermediate_Hash[4];
for(t = 0; t < 80; t++) {
if (t >= 16) {
W[t&0xf] = SHA1CircularShift(1,W[(t-3)&0xf] ^ W[(t-8)&0xf] ^ W[(t-14)&0xf] ^ W[t&0xf]);
}
if (t<20) {
temp = SHA1CircularShift(5,A) +
((B & C) | ((~B) & D)) + E + W[t&0xf] + K[0];
}
else if (t<40) {
temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t&0xf] + K[1];
}
else if (t < 60) {
temp = SHA1CircularShift(5,A) +
((B & C) | (B & D) | (C & D)) + E + W[t&0xf] + K[2];
}
else {
temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t&0xf] + K[3];
}
E = D;
D = C;
C = SHA1CircularShift(30,B);
B = A;
A = temp;
}
context->Intermediate_Hash[0] += A;
context->Intermediate_Hash[1] += B;
context->Intermediate_Hash[2] += C;
context->Intermediate_Hash[3] += D;
context->Intermediate_Hash[4] += E;
context->Message_Block_Index = 0;
}
There are still savings to be made: get rid of W[] array on stack and put it in context pre-initialized with the data you get.
Also, you need a lot of pre-processing before calling this function. For example, if all your messages are less than 55 bytes, you can put it in W array, add padding, and process immediately. If not, you'll have to call process twice: first with your partially padded input, and again with the rest of the pad, etc. That sort of thing would be very application specific, and I doubt you'll be able to find the code to do it for you.
By the way, the code above is a straight adaptation from the type 1 source from your link. You can probably squeeze a bit more out of it if you try to optimize it further.
I couldn't think of a way to get any savings on the intermediate hash, so you will need a total of 108 bytes for this (109 if counter is also in RAM), and 24 of which is local to this function, and can be reused in other places - so long as they are also temporary. So it is very hard to do what you want to do.
EDIT: If all your messages are less than 55 bytes, you can save another 20 bytes in your context by getting rid of the intermediate_hash[] storage. Simply initialize A-E from the constants, and add the constants at the end. Finally, instead of storing them in a separate variable, overwrite your input when this function ends.
I have implemented SHA-1 for several memory-constrained environments. You can get by with
DWORD W[16] ; // instead of H[80]
DWORD H[5] ; // Intermediate hash value
DWORD BitCount[2] ; // Probably a single DWORD is enough here
plus a few bytes of housekeeping. W is updated on the fly, as a circular buffer, instead of being generated at the start of each round.
working example:
#include<iostream>
#include<stdio.h>
#include<stdlib.h>
#include<string>
using namespace std;
unsigned CircularShift(int bits, unsigned word)
{
return ((word << bits) & 0xFFFFFFFF) | ((word & 0xFFFFFFFF) >> (32-bits));
}
int main(void)
{
string mess;
cin >> mess;
unsigned int lm = mess.length();
unsigned int lmb = lm*8;
unsigned char *messc;
messc=(unsigned char*)malloc((sizeof(unsigned char))*64);
for (unsigned short int i =0;i<64;i++)
{
messc[i]=char(0x00);
}
for(int i=0;i<mess.length();i++)
{
messc[i]=mess[i];
}
messc[lm]=(unsigned char)128;
messc[56] = (lmb >> 24) & 0xFF;
messc[57] = (lmb >> 16) & 0xFF;
messc[58] = (lmb >> 8) & 0xFF;
// messc[59] = (lmb) & 0xFF;
messc[60] = (lmb >> 24) & 0xFF;
messc[61] = (lmb >> 16) & 0xFF;
messc[62] = (lmb >> 8) & 0xFF;
messc[63] = (lmb) & 0xFF;
for(int i =0 ;i<64;i++)
{
cout<< hex << (int)messc[i] << " ";
}
unsigned *H;
H=(unsigned*)malloc(5*sizeof(unsigned));
H[0] = 0x67452301;
H[1] = 0xEFCDAB89;
H[2] = 0x98BADCFE;
H[3] = 0x10325476;
H[4] = 0xC3D2E1F0;
const unsigned K[]={0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6};
int t;
unsigned temp;
unsigned *W;
unsigned A, B, C, D, E;
W=(unsigned*)malloc(80*sizeof(unsigned));
unsigned char *messh;
messh=(unsigned char*)malloc(64*sizeof(unsigned char));
int k;
for(t = 0; t < 16; t++)
{
W[t] = ((unsigned) messc[t * 4])<< 24; ;
W[t] |= ((unsigned) messc[t * 4 + 1])<< 16;
W[t] |= ((unsigned) messc[t * 4 + 2]) << 8;
W[t] |= ((unsigned) messc[t * 4 + 3]);
}
for(t = 16; t < 80; t++)
{
W[t] = CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]);
}
A = H[0];
B = H[1];
C = H[2];
D = H[3];
E = H[4];
for(t = 0; t < 20; t++)
{
temp = CircularShift(5,A) + ((B & C) | ((~B) & D)) + E + W[t] + K[0];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = CircularShift(30,B);
B = A;
A = temp;
}
for(t = 20; t < 40; t++)
{
temp = CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = CircularShift(30,B);
B = A;
A = temp;
}
for(t = 40; t < 60; t++)
{
temp = CircularShift(5,A) +
((B & C) | (B & D) | (C & D)) + E + W[t] + K[2];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = CircularShift(30,B);
B = A;
A = temp;
}
for(t = 60; t < 80; t++)
{
temp = CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = CircularShift(30,B);
B = A;
A = temp;
}
H[0] = (H[0] + A) & 0xFFFFFFFF;
H[1] = (H[1] + B) & 0xFFFFFFFF;
H[2] = (H[2] + C) & 0xFFFFFFFF;
H[3] = (H[3] + D) & 0xFFFFFFFF;
H[4] = (H[4] + E) & 0xFFFFFFFF;
cout <<"\nTHIS IS SHHHHHAAAAAAAAAAA\n";
for(int i=0;i<5;i++)
{
cout << hex << H[i] << " ";
}
//Message_Block_Index = 0;
}
All things considered, looking at your requirements, I think you are going to have to change your specs. Either a bigger chip, or a simpler algorithm. Even implementing SHA-1 (without HMAC) would be a challenge, but it should be doable.

Why can't I assign a scalar value to a class using shorthand, but instead declare it first, then set its value?

I am writing a UTF-8 library for C++ as an exercise as this is my first real-world C++ code. So far, I've implemented concatenation, character indexing, parsing and encoding UTF-8 in a class called "ustring". It looks like it's working, but two seemingly equivalent ways of declaring a new ustring behave differently. The first way:
ustring a;
a = "test";
works, and the overloaded "=" operator parses the string into the class (which stores the Unicode strings as an dynamically allocated int pointer). However, the following does not work:
ustring a = "test";
because I get the following error:
test.cpp:4: error: conversion from ‘const char [5]’ to non-scalar type ‘ustring’ requested
Is there a way to workaround this error? It probably is a problem with my code, though. The following is what I've written so far for the library:
#include <cstdlib>
#include <cstring>
class ustring {
int * values;
long len;
public:
long length() {
return len;
}
ustring * operator=(ustring input) {
len = input.len;
values = (int *) malloc(sizeof(int) * len);
for (long i = 0; i < len; i++)
values[i] = input.values[i];
return this;
}
ustring * operator=(char input[]) {
len = sizeof(input);
values = (int *) malloc(0);
long s = 0; // s = number of parsed chars
int a, b, c, d, contNeed = 0, cont = 0;
for (long i = 0; i < sizeof(input); i++)
if (input[i] < 0x80) { // ASCII, direct copy (00-7f)
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = input[i];
} else if (input[i] < 0xc0) { // this is a continuation (80-bf)
if (cont == contNeed) { // no need for continuation, use U+fffd
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = 0xfffd;
}
cont = cont + 1;
values[s - 1] = values[s - 1] | ((input[i] & 0x3f) << ((contNeed - cont) * 6));
if (cont == contNeed) cont = contNeed = 0;
} else if (input[i] < 0xc2) { // invalid byte, use U+fffd (c0-c1)
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = 0xfffd;
} else if (input[i] < 0xe0) { // start of 2-byte sequence (c2-df)
contNeed = 1;
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = (input[i] & 0x1f) << 6;
} else if (input[i] < 0xf0) { // start of 3-byte sequence (e0-ef)
contNeed = 2;
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = (input[i] & 0x0f) << 12;
} else if (input[i] < 0xf5) { // start of 4-byte sequence (f0-f4)
contNeed = 3;
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = (input[i] & 0x07) << 18;
} else { // restricted or invalid (f5-ff)
values = (int *) realloc(values, sizeof(int) * ++s);
values[s - 1] = 0xfffd;
}
return this;
}
ustring operator+(ustring input) {
ustring result;
result.len = len + input.len;
result.values = (int *) malloc(sizeof(int) * result.len);
for (long i = 0; i < len; i++)
result.values[i] = values[i];
for (long i = 0; i < input.len; i++)
result.values[i + len] = input.values[i];
return result;
}
ustring operator[](long index) {
ustring result;
result.len = 1;
result.values = (int *) malloc(sizeof(int));
result.values[0] = values[index];
return result;
}
char * encode() {
char * r = (char *) malloc(0);
long s = 0;
for (long i = 0; i < len; i++) {
if (values[i] < 0x80)
r = (char *) realloc(r, s + 1),
r[s + 0] = char(values[i]),
s += 1;
else if (values[i] < 0x800)
r = (char *) realloc(r, s + 2),
r[s + 0] = char(values[i] >> 6 | 0x60),
r[s + 1] = char(values[i] & 0x3f | 0x80),
s += 2;
else if (values[i] < 0x10000)
r = (char *) realloc(r, s + 3),
r[s + 0] = char(values[i] >> 12 | 0xe0),
r[s + 1] = char(values[i] >> 6 & 0x3f | 0x80),
r[s + 2] = char(values[i] & 0x3f | 0x80),
s += 3;
else
r = (char *) realloc(r, s + 4),
r[s + 0] = char(values[i] >> 18 | 0xf0),
r[s + 1] = char(values[i] >> 12 & 0x3f | 0x80),
r[s + 2] = char(values[i] >> 6 & 0x3f | 0x80),
r[s + 3] = char(values[i] & 0x3f | 0x80),
s += 4;
}
return r;
}
};
Your problem is that ustring a = "test" actually invokes the constructor, not the assignment operator. yay, welcome to c++ :)
You'll need to define yourself both a default constructor and one that takes a const char*, because once you define a constructor, you need to define all your constructors.
A few other things:
pass your input ustring by reference
pass const char * instead of char[] (you don't modify the input and char* is more common)
sizeof isn't doing what you think it's doing, it doesn't work properly for array parameters. It is returning you sizeof(char*), not sizeof(array).
return reference to this from your operators.
you can use vector<int> values; to manage all your memory for you.
encode() should probably return a string. With string:
it manages its own memory, so the caller doesn't need to free or delete it.
you can use s.append(c); instead of using realloc.
you can use printf("%s", s.c_str());, but in c++ you usually use cout << s;
consider defining a copy constructor as well.
Like this:
class ustring {
public:
// Default constructor, allows you to create your class with no arguments.
ustring() { ...; }
// Allows you to create your class from string literals.
ustring(const char *input) { ...; }
// Copy constructor, allows you to create your class from other instances.
ustring(const ustring &input) { ...; }
// Assignment operators.
ustring &operator=(const ustring &input) { ...; return *this; }
ustring &operator=(const char *input) { ...; return *this; }
};
int main() {
ustring s, t; // invokes default constructor.
s = t; // invokes ustring assignment op.
s = "test"; // invokes const char* assignment op.
ustring u = "test"; // invokes const char* constructor.
ustring v("test"); // invokes const char* constructor.
ustring x(u); // invokes copy constructor.
}
If this is c++, why are you doing all this malloc/realloc stuff? I haven't fully parsed that code, but I'd imagine there's a simpler way... see the comment about using vector.
As #Michael Aaron Safyan mentioned in the comments, if you do any memory allocation for the ustring class, you will want to deallocate it in the destructor. However, I think by switching to memory managed containers - vector & string - you'll avoid any of your own memory management and can avoid writing a destructor.
These are two operations:
ustring a; // construct a new object using constructor
a = "test"; // assign value to object using operator=
This is one operation:
ustring a = "test"; // construct with a value, aka value-intialization
In the interest of runtime efficiency and allowing semantic freedom, C++ does not extrapolate the default constructor ustring::ustring() and the assignment operator ustring::operator=(const char*) into a constructor ustring::ustring(const char *).
But, for most reasonable string classes, this will work:
ustring::ustring(const char *str)
: /* initialize ustring::ustring() does */ {
/* do whatever ustring::ustring() does */
*this = str; // assign value.
}
It is better to call the assignment operator from the constructor than to attempt it the other way around.
Of course, you can probably improve efficiency by considering the length of the given string while performing initialization.