I am a beginner programmer trying to inflate text streams from PDFs. I have adopted and slightly altered some open source code which uses zlib, and generally it works very well. However, I have been testing on some different PDFs lately and some of the inflated streams come back blank. Could anybody advise me as to why?
I have come across this question below which seems to address the same problem but does not really give a definitive answer
zLib inflate has empty result in some cases
#include <iostream>
#include <fstream>
#include <string>
#include "zlib.h"
int main()
{
//Discard existing output:
//Open the PDF source file:
std::ifstream filei("C:\\Users\\dpbowe\\Desktop\\PIDSearch\\P&ID.PDF", std::ios::in|std::ios::binary|std::ios::ate);
if (!filei) std::cout << "Error Opening Input File" << std::endl;
//decoded output
std::ofstream fileo;
fileo.open("C:\\Users\\dpbowe\\Desktop\\Decoded.txt", std::ios::binary | std::ofstream::out);
if (!fileother) std::cout << "Error opening output file" << std::endl;
if (filei && fileo)
{
//Get the file length:
long filelen = filei.tellg(); //fseek==0 if ok
filei.seekg(0, std::ios::beg);
//Read the entire file into memory (!):
char* buffer = new char [filelen];
if (buffer == NULL) {fputs("Memory error", stderr); exit(EXIT_FAILURE);}
filei.read(buffer,filelen);
if (buffer == '\0') {fputs("Reading error", stderr); exit(EXIT_FAILURE);}
bool morestreams = true;
//Now search the buffer repeated for streams of data
while (morestreams)
{
//Search for stream, endstream. Should check the filter of the object to make sure it if FlateDecode, but skip that for now!
size_t streamstart = FindStringInBuffer (buffer, "stream", filelen); //This is my own search function
size_t streamend = FindStringInBuffer (buffer, "endstream", filelen); //This is my own search function
if (streamstart>0 && streamend>streamstart)
{
//Skip to beginning and end of the data stream:
streamstart += 6;
if (buffer[streamstart]==0x0d && buffer[streamstart+1]==0x0a) streamstart+=2;
else if (buffer[streamstart]==0x0a) streamstart++;
if (buffer[streamend-2]==0x0d && buffer[streamend-1]==0x0a) streamend-=2;
else if (buffer[streamend-1]==0x0a) streamend--;
//Assume output will fit into 10 times input buffer:
size_t outsize = (streamend - streamstart)*10;
char* output = new char [outsize]; ZeroMemory(output, outsize);
//Now use zlib to inflate:
z_stream zstrm; ZeroMemory(&zstrm, sizeof(zstrm));
zstrm.avail_in = streamend - streamstart + 1;
zstrm.avail_out = outsize;
zstrm.next_in = (Bytef*)(buffer + streamstart);
zstrm.next_out = (Bytef*)output;
int rsti = inflateInit(&zstrm);
if (rsti == Z_OK)
{
int rst2 = inflate (&zstrm, Z_FINISH);
if (rst2 >= 0)
{
size_t totout = zstrm.total_out;
//Write inflated output to file "Decoded.txt"
fileother<<output;
fileother<<"\r\nStream End\r\n\r\n";
}
else std::cout<<"output uncompressed stream is blank"<<std::endl;
}
delete[] output; output=0;
buffer+= streamend + 7;
filelen = filelen - (streamend+7);
}
else
{
morestreams = false;
std::cout<<"End of File"<<std::endl;
}
}
filei.close();
}
else
{
std::cout << "File Could Not Be Accessed\n";
}
if (fileo) fileo.close();
}
Related
I am a beginner, so I apologise if my question looks childish. I have 38 large files in a folder. I want to split each of the files into smaller parts with dynamically generated names. Lines 1 to 13 work well. The challenge is in lines 16-19. The output shows that the data read from the ifstream does not all end up in the char array. This error makes it difficult to split the files. What am I getting wrong?
#define SEGMENT 728300 //approximate target size of small file
using namespace std;
long file_size(char *name);//function definition below
// Splits one large text file into numbered "<prefix><n>.csv" pieces of roughly
// SEGMENT bytes each, copying whole lines.
// argv[1] is appended to PathToData to form the path of the file opened as
// csv_db. The name of the large file to split is then taken from getdata, the
// buffer that the getline loop below fills.
int main(int argc, char **argv)
{
char input_file_1[100]; // input file
// NOTE(review): argc is never checked and argv[1] is copied without a length
// limit into a 100-byte array.
strcpy(input_file_1,argv[1]);
string PathToData = "path to the files";
TString name = PathToData+input_file_1;
std::cout << "Reading file " << name << endl;
char getdata[35000];
ifstream csv_db(name);
// The loop body is the single if-statement below: each successful getline
// overwrites getdata with the next line, and the stream is closed once EOF
// has been flagged.
// NOTE(review): the size argument is sizeof(csv_db), the size of the ifstream
// object, not the capacity of getdata; sizeof(getdata) was presumably
// intended - confirm.
while(csv_db.getline(getdata,sizeof(csv_db)))
if (csv_db.eof())
csv_db.close();
int segments=0, i, accum;
FILE *fp1, *fp2;
unsigned int huga=strlen(getdata);
// largeFileName has a run-time length, i.e. it is a variable-length array
// (a compiler extension in C++).
char largeFileName[huga + 100]; // Make sure there's enough space
// NOTE(review): whatever getdata holds after the loop is used as the path of
// the large file; verify that this is the intended source of the name.
strcpy(largeFileName, getdata);
std::cout << largeFileName << endl;
std::cout << largeFileName << endl;
long sizeFile = file_size(largeFileName);
// Output numbering starts at 1980, so the upper bound is offset by the same value.
segments = sizeFile/SEGMENT + 1980;//ensure end of file
char filename[360]={"path to folder where to keep the result"};
char smallFileName[360];
char line[1080];
fp1 = fopen(largeFileName, "r");
if(fp1)
{
for(i=1980;i<segments;i++)
{
accum = 0;
sprintf(smallFileName, "%s%d.csv", filename, i);
fp2 = fopen(smallFileName, "w");
if(fp2)
{
// NOTE(review): fgets runs before the accum test, so the line that is read
// once accum has exceeded SEGMENT is consumed from fp1 but written to no
// output file.
while(fgets(line, 1080, fp1) && accum <= SEGMENT)
{
accum += strlen(line);//track size of growing file
fputs(line, fp2);
}
fclose(fp2);
}
}
fclose(fp1);
}
return 0;
}
// Reports the end-of-file offset of the named file plus one.
// Returns -1 when the file cannot be opened.
long file_size(char *name)
{
    // Binary mode so that the reported offset is a byte count.
    FILE *handle = fopen(name, "rb");
    if (!handle)
    {
        return -1;
    }

    fseek(handle, 0, SEEK_END);
    const long endOffsetPlusOne = ftell(handle) + 1;
    fclose(handle);
    return endOffsetPlusOne;
}
I'm creating an FTP client.
I'm getting a gif from the server, but after that the gif is corrupted.
When I change the file extension to look at the diff, I see that the
CR/LF characters are gone.
How could this be? I made sure to use image mode.
Here's my read code in TCP socket.
// Reads from the socket until the peer closes the connection or no further
// data becomes readable within one second, and returns everything received.
// The returned string may contain arbitrary bytes, including NUL and CR/LF.
string TCPSocket::long_read()
{
    pollfd ufds;
    ufds.fd = sd;
    ufds.events = POLLIN;
    ufds.revents = 0;

    string result;
    // Reusable receive buffer. std::string owns it, so it is released on
    // every exit path (the previous raw new[] was never freed).
    string chunk(LONGBUFLEN, '\0');
    do {
        const ssize_t bytesRead = ::read(sd, &chunk[0], chunk.size());
        if (bytesRead == 0) {
            break;  // orderly shutdown by the peer
        }
        if (bytesRead > 0) {
            // Append in place; only the bytes actually received are copied.
            result.append(chunk, 0, static_cast<size_t>(bytesRead));
        }
        else if (ufds.revents & (POLLERR | POLLNVAL)) {
            // poll() reported a broken descriptor and read() failed: stop
            // instead of spinning on the same error.
            break;
        }
        ufds.revents = 0;
    } while (poll(&ufds, 1, 1000) > 0);
    return result;
}
Here my get code in main.cpp
// GET: fetch a file over a passive-mode data connection.
// A child process drains the data socket into the local file while the parent
// drives the control connection (TYPE I, then RETR).
else if (command == command::GET) {
    string filename;
    cin >> filename;
    string dataHost;
    int dataPort;
    if (enterPassiveMode(dataHost, dataPort)) {
        dataSocket = new TCPSocket(dataHost.c_str(), dataPort);
        if (fork() == 0) {
            // Child: read the whole transfer, then write it out unchanged.
            const string result = dataSocket->long_read();
            // mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
            FILE* file = fopen(filename.c_str(), "w+b");
            if (file) {
                // Write straight from the string; the extra heap copy that
                // used to be made here was never freed.
                if (!result.empty() &&
                    fwrite(result.data(), 1, result.size(), file) != result.size()) {
                    cout << "write failed";
                }
                fclose(file);
            }
            else {
                cout << "open failed";
            }
            break;
        }
        else {
            // Parent: switch to binary (image) mode, then request the file.
            writeAndImmediateRead(rfc959::TYPE_I);
            controlSocket->write(rfc959::RETRIVE(filename));
            string result = controlSocket->read();
            cout << result;
            int reply = Parser::firstDigit(result);
            // I'll remove incomplete local file if request fails
            if (reply != rfc959::POSITIVE_PRELIMINARY_REPLY) {
                remove(filename.c_str());
                continue;
            }
            // Wait for the child to finish writing before reading the
            // completion reply from the control connection.
            wait(NULL);
            cout << controlSocket->long_read();
        }
    }
}
EDIT
I did make sure to use Binary mode. And when I transferred a text file(though of a smaller size), it doesn't have this problem. Here's the output:
EDIT 2
Output from Wireshark showing Request: TYPE I and Response: Opening BINARY mode
By default, FTP servers and clients perform data transfers in "ASCII mode", which means that any CRLF sequence is translated on the fly to the host's native line ending (e.g. just a bare LF on Unix machines). This behavior is mandated by RFC 959; see Section 3.1.1.1.
To transfer your data as binary, and avoid the ASCII mode translation, your FTP client will want to send the TYPE command first, e.g.:
TYPE I
Your .gif file should then be transferred as is, with no replacements/transformations on any CRLF sequences.
Hope this helps!
I'm trying to implement the function from listing 5.1 here
but when I read from a file into a buffer, I just get the same character (Í) for the whole array. string.txt is a copy and paste of the content of the previous link.
Here is my code:
#include <iostream>
#include <fstream>
#include <string>
#include <cinttypes>
#include <cstdio>
#include <cstring>
const int block_size = 0x4000; //16KB

/**
 * Reports whether a byte sequence occurs inside a buffer.
 *
 * @param buffer               start of the data to search
 * @param searchLength         number of valid bytes in buffer
 * @param stringToSearch       sequence to look for (need not be NUL-terminated)
 * @param stringToSearchLength number of bytes in stringToSearch
 * @return 1 if the sequence lies completely within the first searchLength
 *         bytes of buffer, 0 otherwise (also for null or empty arguments).
 */
int search(char* buffer, int searchLength, char* stringToSearch, int stringToSearchLength) {
    if (buffer == nullptr || stringToSearch == nullptr || stringToSearchLength <= 0)
        return 0;

    // Stop as soon as the remaining data is shorter than the pattern, so the
    // comparison below never reads beyond the valid part of the buffer.
    while (searchLength >= stringToSearchLength) {
        // Only positions where a complete match still fits are candidates.
        const int candidates = searchLength - stringToSearchLength + 1;
        char* potentialMatch = static_cast<char*>(memchr(buffer, *stringToSearch, candidates));
        if (potentialMatch == nullptr)
            break;
        // First byte matches; compare the rest (zero bytes for a 1-byte pattern).
        if (memcmp(potentialMatch + 1, stringToSearch + 1, stringToSearchLength - 1) == 0)
            return 1;
        // Resume one byte past the failed candidate.
        searchLength -= static_cast<int>(potentialMatch - buffer) + 1;
        buffer = potentialMatch + 1;
    }
    return 0;
}
/**
 * Searches string.txt for a fixed phrase, reading the file in blocks of
 * block_size bytes, and prints whether the phrase was found.
 *
 * The last (pattern length - 1) bytes of every block are carried over to the
 * front of the next one so that a match straddling a block boundary is seen.
 *
 * @return 0 after a completed search, 1 if the file cannot be opened.
 */
int main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;

    // search() takes non-const char*, so keep the pattern in a writable array.
    char toSearch[] = "Interpreting Where";
    const int toSearchLength = static_cast<int>(strlen(toSearch));
    const int overlap = toSearchLength - 1;

    std::ifstream myFile("string.txt");
    if (myFile.fail()) {
        std::cout << "Cannot open file" << std::endl;
        return 1;
    }

    // block_size bytes of data plus `overlap` bytes of zeroed slack. search()
    // may compare up to `overlap` bytes beyond a candidate first byte, and the
    // pattern contains no NUL, so the slack can never produce a false match.
    std::string buffer(static_cast<size_t>(block_size + overlap), '\0');

    int found = 0;
    int carried = 0;  // bytes kept from the end of the previous block
    while (!found) {
        // Reading starts at the current position (the beginning of the file
        // on the first pass); the stream is never moved to the end.
        myFile.read(&buffer[carried], block_size - carried);
        const int fresh = static_cast<int>(myFile.gcount());
        if (fresh == 0)
            break;  // end of file (or read error): nothing new to search

        const int valid = carried + fresh;
        memset(&buffer[valid], 0, overlap);
        // Search only the bytes that were actually read.
        found = search(&buffer[0], valid, toSearch, toSearchLength);

        // Move the tail of this block to the front for the next pass.
        carried = (valid < overlap) ? valid : overlap;
        memmove(&buffer[0], &buffer[valid - carried], carried);
    }

    if (found) {
        std::cout << "String found" << std::endl;
    } else {
        std::cout << "String not found" << std::endl;
    }
    myFile.close();
    return 0;
}
I hope you can help me see what I'm doing wrong, thanks!
You are setting myFile's position to ios_base::end with seekg:
myFile.seekg(0, ios::end);
Then trying to read from it:
myFile.read(buffer, block_size);
Clearly no data will be read, since myFile is already at ios_base::end, and you'll be reading whatever uninitialized data was already in buffer.
What you probably intended to do was to set your myFile position back to the beginning by doing this before reading:
myFile.seekg(0, ios::beg);
I have following piece of code that is supposed to calculate the SHA256 of a file. I am reading the file chunk by chunk and using EVP_DigestUpdate for the chunk. When I test the code with the file that has content
Test Message
Hello World
in Windows, it gives me SHA256 value of 97b2bc0cd1c3849436c6532d9c8de85456e1ce926d1e872a1e9b76a33183655f but the value is supposed to be 318b20b83a6730b928c46163a2a1cefee4466132731c95c39613acb547ccb715, which can be verified here too.
Here is the code:
#include <openssl\evp.h>
#include <iostream>
#include <string>
#include <fstream>
#include <cstdio>
// Largest number of bytes read from the file per chunk.
const int MAX_BUFFER_SIZE = 1024;

// Defined below: takes a file path and a digest algorithm name.
std::string FileChecksum(std::string, std::string);

// Computes the checksum of one fixed file and prints the returned string.
int main()
{
    const std::string digest =
        FileChecksum("C:\\Users\\Dell\\Downloads\\somefile.txt", "sha256");
    std::cout << digest << std::endl;
    return 0;
}
std::string FileChecksum(std::string file_path, std::string algorithm)
{
EVP_MD_CTX *mdctx;
const EVP_MD *md;
unsigned char md_value[EVP_MAX_MD_SIZE];
int i;
unsigned int md_len;
OpenSSL_add_all_digests();
md = EVP_get_digestbyname(algorithm.c_str());
if(!md) {
printf("Unknown message digest %s\n",algorithm);
exit(1);
}
mdctx = EVP_MD_CTX_create();
std::ifstream readfile(file_path,std::ifstream::in|std::ifstream::binary);
if(!readfile.is_open())
{
std::cout << "COuldnot open file\n";
return 0;
}
readfile.seekg(0, std::ios::end);
long filelen = readfile.tellg();
std::cout << "LEN IS " << filelen << std::endl;
readfile.seekg(0, std::ios::beg);
if(filelen == -1)
{
std::cout << "Return Null \n";
return 0;
}
EVP_DigestInit_ex(mdctx, md, NULL);
long temp_fil = filelen;
while(!readfile.eof() && readfile.is_open() && temp_fil>0)
{
int bufferS = (temp_fil < MAX_BUFFER_SIZE) ? temp_fil : MAX_BUFFER_SIZE;
char *buffer = new char[bufferS+1];
buffer[bufferS] = 0;
readfile.read(buffer, bufferS);
std::cout << strlen(buffer) << std::endl;
EVP_DigestUpdate(mdctx, buffer, strlen(buffer));
temp_fil -= bufferS;
delete[] buffer;
}
EVP_DigestFinal_ex(mdctx, md_value, &md_len);
EVP_MD_CTX_destroy(mdctx);
printf("Digest is: ");
//char *checksum_msg = new char[md_len];
//int cx(0);
for(i = 0; i < md_len; i++)
{
//_snprintf(checksum_msg+cx,md_len-cx,"%02x",md_value[i]);
printf("%02x", md_value[i]);
}
//std::string res(checksum_msg);
//delete[] checksum_msg;
printf("\n");
/* Call this once before exit. */
EVP_cleanup();
return "";
}
I tried to write the hash generated by the program to a string using _snprintf, but it didn't work. How can I generate the correct hash and return the value as a string from the FileChecksum function? The platform is Windows.
EDIT: It seems the problem was because of CRLF issue. As Windows in saving file using \r\n, the Checksum calculated was different. How to handle this?
MS-DOS used the CR-LF convention, so when a file is saved on Windows each line break is stored as \r\n (carriage return plus newline). The online tool you tested with uses only the \n character.
So, to compare like with like, you have to compute the checksum of the string Test Message\r\nHello World\r\n, which is equivalent to creating and reading the file on Windows (as described above), which is the case here.
However, the checksum of files,wherever created, will be same.
Note: your code works fine :)
It seems the problem was with the length value I passed to EVP_DigestUpdate. I had passed the value from strlen, but replacing it with bufferS fixed the issue.
The code was modified as:
while(!readfile.eof() && readfile.is_open() && temp_fil>0)
{
int bufferS = (temp_fil < MAX_BUFFER_SIZE) ? temp_fil : MAX_BUFFER_SIZE;
char *buffer = new char[bufferS+1];
buffer[bufferS] = 0;
readfile.read(buffer, bufferS);
EVP_DigestUpdate(mdctx, buffer, bufferS);
temp_fil -= bufferS;
delete[] buffer;
}
and to send the checksum string, I modified the code as:
EVP_DigestFinal_ex(mdctx, md_value, &md_len);
EVP_MD_CTX_destroy(mdctx);
char str[128] = { 0 };
char *ptr = str;
std::string ret;
for(i = 0; i < md_len; i++)
{
//_snprintf(checksum_msg+cx,md_len-cx,"%02x",md_value[i]);
sprintf(ptr,"%02x", md_value[i]);
ptr += 2;
}
ret = str;
/* Call this once before exit. */
EVP_cleanup();
return ret;
As for the wrong checksum earlier, the problem was in how Windows stores line endings. As suggested by Zangetsu, Windows was saving the text file with CRLF, but Linux and the site I mentioned earlier were using LF, so the checksum values differed. For files other than text, e.g. a DLL, the code now computes the correct checksum as a string.
I'm currently using libzip in a C++11 program to extract the contents of a compressed file and store them into a data structure that will also hold metadata related to the file.
I'm using the current method to explode the zip file and get the content of each file in it:
void explodeArchive(const string& path, vector<ZipFileModel>& files) {
int error = 0;
zip *zip = zip_open(path.c_str(), 0, &error);
if (zip == nullptr) {
throw logic_error("Could not extract content of file " + path);
}
const zip_int64_t n_entries = zip_get_num_entries(zip, ZIP_FL_UNCHANGED);
for (zip_int64_t i = 0; i < n_entries; i++) {
const char *file_name = zip_get_name(zip, i, ZIP_FL_ENC_GUESS);
struct zip_stat st;
zip_stat_init(&st);
zip_stat(zip, file_name, ZIP_FL_NOCASE, &st);
char *content = new char[st.size];
std::cerr << file_name << std::endl;
zip_file *file = zip_fopen(zip, file_name, ZIP_FL_NOCASE);
const zip_int64_t did_read = zip_fread(file, content, st.size);
if (did_read <= 0) {
continue;
}
if (strlen(content) < st.size) {
LOG(WARNING)<< "File " << file_name << " is truncated.";
}
if (strlen(content) > st.size) {
content[st.size] = '\0';
}
ZipFileModel model;
model.name = string(file_name);
model.content = string(content);
model.order = -1;
files.push_back(model);
zip_fclose(file);
delete[] content;
}
zip_close(zip);
}
My problem is that I get random segmentation faults with gdb pointing to zip_fclose(file);:
Program received signal SIGSEGV, Segmentation fault.
0x00000001001ef8a0 in zip_source_close (src=0x105001b00) at /Users/xxx/Projects/xxx/xxx/src/libzip/zip_source_close.c:48
48 (void)src->cb.l(src->src, src->ud, NULL, 0, ZIP_SOURCE_CLOSE);
What's the best way to debug this? As I said it happens intermittently so it's hard to pin down the exact cause.
You aren't closing the zip_file when there is nothing to read.
First you open the file inside:
zip_file *file = zip_fopen(zip, file_name, ZIP_FL_NOCASE);
Then try to read something:
const zip_int64_t did_read = zip_fread(file, content, st.size);
and if there's nothing to read you continue and the file is never closed.
if (did_read <= 0) {
continue;
}
So, just add:
if (did_read <= 0) {
zip_fclose(file);
continue;
}