I am making a web crawler
and I have the following code but the problem is that it also downloads binary data and I don't want that to happen. How do I prevent it
size_t HTML::WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
size_t realsize = size * nmemb;
if(contents!=NULL||userp!=NULL){
std::string* str=(std::string*)userp;
str->reserve(realsize);
auto c_str=(char*)contents;
if(c_str!=NULL){
for(size_t i=0;i<realsize;i++){
str->push_back(c_str[i]);
}
}
}
return realsize;
}
HTML_CODE HTML::get_html(std::string url) {
std::string chunk;
CURL *curl_handle=curl_easy_init();
CURLcode res;
if(curl_handle) {
curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, USER_AGENT);
res = curl_easy_perform(curl_handle);
if(res != CURLE_OK) {
std::cout<<"Can't get html content from "<<url<<"\n";
fprintf(stderr, "error: %s\n", curl_easy_strerror(res));
return {"",""};
}
curl_easy_cleanup(curl_handle);
}
else{
std::cout<<"Error: Couldn't create a curl instance"<<std::endl;
return {"",""};
}
return {.url=url,.content=chunk};
}
Things I have tried:-
Check if the data has a null terminator
Check if the char is an assci letter(it wont work with other language)
Related
Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 3 years ago.
Improve this question
Greetings everyone read this topic, my platform is win32. And I'm using libcurl with a problem.
My goal is to coding with libcurl for a download program, which it includes requesting a url to download a file, saving the file locally(fwrite), showing the progress bar while downloading.
The Problem is it can download the very small file well but when requesting a larger file like 30MB, it stops before it's done.
How can I debug this program to work well with any size of files?
I'm not familiar with libcurl, any simple detail could help. Can I have either answer of how curl_easy series works to call multiple callback functions, improper coding of either of the two callback functions, or some missing rules from libcurl?
Feel free to answer me anything.
Things I've tried:
1.I've tried re-compiling versions of libcurl. Now I'm using libcurl-7.64 compiled with "WITH_SSL=static".
2.I've tried many sites, finding the clue: the sites for very small(like 80kb) file will be downloaded completely with the progress bar. But larger file(like 30Mb) will be incomplete. One of my guess is it stopped from some transfer problem since the file is larger.
codes:
static FILE * fp;
static size_t write_callback(char *ptr, size_t size, size_t nmemb, void *userdata)
{
size_t nWrite = fwrite(ptr, size, nmemb, fp);
return nWrite;
}
static int progress_callback(void *clientp, curl_off_t dltotal, curl_off_t dlnow, curl_off_t ultotal, curl_off_t ulnow)
{
(void)ultotal;
(void)ulnow;
int totaldotz = 40;
double fractiondownloaded = (double)dlnow / (double)dltotal;
int dotz = (int)(fractiondownloaded * totaldotz);
printf("%3.0f%% [", fractiondownloaded * 100); //print the number percentage of the progress
int i = 0;
for (; i < dotz; i++) { //print "=" to show progress
printf("=");
}
for (; i < totaldotz; i++) { //print space to occupy the rest
printf(" ");
}
printf("]\r");
fflush(stdout);
return 0;
}
int download_function(CURL *curl,const char * url, const char * path)
{
curl = curl_easy_init();
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
fopen_s(&fp, path, "ab+");
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, false);
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 3L);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L);
char * error = NULL;
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, error);
CURLcode retcCode = curl_easy_perform(curl);
fclose(fp);
const char* pError = curl_easy_strerror(retcCode);
if (curl) {
curl_easy_cleanup(curl);
}
return 0;
}
#ccxxshow seems right. Set the timeout option gives me CURLE_OPERATION_TIMEDOUT error.
After remove this line I can download about 9MB PDF file successfully.
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L);
My complete code:
#include <curl/curl.h>
static FILE * fp;
static size_t write_callback(char *ptr, size_t size, size_t nmemb, void *userdata)
{
size_t nWrite = fwrite(ptr, size, nmemb, fp);
return nWrite;
}
static int progress_callback(void *clientp, curl_off_t dltotal, curl_off_t dlnow, curl_off_t ultotal, curl_off_t ulnow)
{
(void)ultotal;
(void)ulnow;
int totaldotz = 40;
double fractiondownloaded = (double)dlnow / (double)dltotal;
int dotz = (int)(fractiondownloaded * totaldotz);
printf("%3.0f%% [", fractiondownloaded * 100); //print the number percentage of the progress
int i = 0;
for (; i < dotz; i++) { //print "=" to show progress
printf("=");
}
for (; i < totaldotz; i++) { //print space to occupy the rest
printf(" ");
}
printf("]\r");
fflush(stdout);
return 0;
}
int download_function(CURL *curl, const char * url, const char * path)
{
curl = curl_easy_init();
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
fopen_s(&fp, path, "ab+");
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, false);
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 3L);
//curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L);
char * error = NULL;
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, error);
CURLcode retcCode = curl_easy_perform(curl);
fclose(fp);
const char* pError = curl_easy_strerror(retcCode);
if (curl) {
curl_easy_cleanup(curl);
}
return 0;
}
int main()
{
CURL *testCurl = NULL;
const char *fileAddr = "https://gotocon.com/dl/goto-cph-2015/slides/AndersLybecker_and_SebastianBrandes_DevelopingIoTSolutionsWithWindows10AndAzure.pdf";
download_function(testCurl, fileAddr, "my-9MB.pdf");
}
I am trying to download a .txt file from a server which I can access via the web browser on my raspberry pi.
Curl library gives segmentation error when I am trying to do this. Here is the code I am using.
size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) {
size_t written = fwrite(ptr, size, nmemb, stream);
return written;
}
int checkNewFiles(){
CURL *curl;
FILE *fp;
CURLcode res;
string url = "http://52.233.176.151:1880/files/device/software/text.txt";
char outfilename[FILENAME_MAX] = "/home/pi/Desktop/project/cpp/ab.txt";
curl = curl_easy_init();
if (curl) {
fp = fopen(outfilename, "wb");
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
res = curl_easy_perform(curl);
curl_easy_cleanup(curl);
fclose(fp);
}
return 0;
}
I found the problem, what is url.c_str() doing?
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
change this to
curl_easy_setopt(curl, CURLOPT_URL, url);
Example : Curl program that download the text file.
Offcourse you need to add this neccessary header file here.
size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) {
size_t written = fwrite(ptr, size, nmemb, stream);
return written;
}
int main(void) {
CURL *curl;
FILE *fp;
CURLcode res;
const char *url = "http://localhost/yourfile.txt";
char outfilename[FILENAME_MAX] = "C:\\outfile.txt";
curl = curl_easy_init();
if (curl) {
fp = fopen(outfilename,"wb");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); /* enable failure on http errors */
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
res = curl_easy_perform(curl);
if(res != CURLE_OK) { /* check that the operation was successful */
printf("curl_easy_perform(): %s\n", curl_easy_strerror(res));
}
/* always cleanup */
curl_easy_cleanup(curl);
fclose(fp);
}
return 0;
}
I noticed you're not checking for errors after fopen. If it fails, it returns a NULL pointer, which would cause a segfault when curl attempts to write to it.
I'm not convinced that c_str() was the culprit to your segfault in the original question as I have used that in numerous applications with no problems.
I have this piece of code which sends email via libcurl
#define FROM "<my#gmail.com>"
#define TO "<your#gmail.com>"
static const char *payload_text[] = {
"To: " TO "\r\n",
"From: " FROM "\r\n",
"Subject: Shift\r\n",
"text\r\n",
NULL
};
struct upload_status {
int lines_read;
};
static size_t payload_source(void *ptr, size_t size, size_t nmemb, void *userp)
{
struct upload_status *upload_ctx = (struct upload_status *)userp;
const char *data;
if((size == 0) || (nmemb == 0) || ((size*nmemb) < 1)) {
return 0;
}
data = payload_text[upload_ctx->lines_read];
if(data) {
size_t len = strlen(data);
memcpy(ptr, data, len);
upload_ctx->lines_read++;
return len;
}
return 0;
}
int main(void)
{
CURL *curl;
CURLcode res = CURLE_OK;
struct curl_slist *recipients = NULL;
struct upload_status upload_ctx;
upload_ctx.lines_read = 0;
curl = curl_easy_init();
if(curl) {
curl_easy_setopt(curl, CURLOPT_USERNAME, "my#gmail.com");
curl_easy_setopt(curl, CURLOPT_PASSWORD, "pass");
curl_easy_setopt(curl, CURLOPT_URL, "smtp://smtp.gmail.com:587");
curl_easy_setopt(curl, CURLOPT_USE_SSL, (long)CURLUSESSL_ALL);
curl_easy_setopt(curl, CURLOPT_CAINFO, "google.pem");
curl_easy_setopt(curl, CURLOPT_MAIL_FROM, FROM);
recipients = curl_slist_append(recipients, TO);
//curl_easy_setopt(curl, CURLOPT_FILE, "edgE0DF.tmp");
curl_easy_setopt(curl, CURLOPT_MAIL_RCPT, recipients);
curl_easy_setopt(curl, CURLOPT_READFUNCTION, payload_source);
curl_easy_setopt(curl, CURLOPT_READDATA, &upload_ctx);
curl_easy_setopt(curl, CURLOPT_UPLOAD, 1L);
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
res = curl_easy_perform(curl);
if(res != CURLE_OK)
fprintf(stderr, "curl_easy_perform() failed: %s\n",
curl_easy_strerror(res));
curl_slist_free_all(recipients);
curl_easy_cleanup(curl);
}
return 0;
}
My question is: how can I make a function which takes as arguments from and to and change my global FROM and TO? Is this possible? Because, I know that global variables are initialized before other parts of code and this means that my static const char *payload_text[] that uses FROM and TO will be initialized with wrong addresses.
I'm trying to upload a file to an http server. I'm getting the 200 OK from the server, but the code below is only transmitting 4 bytes.
size_t myclass::read_callback(void *ptr, size_t size, size_t nmemb, void *userp)
{
handler->read(buffer, buffer_size); // buffer_size is 100000
size_t res = handler->gcount();
if( res == 0 )
return 0;
ptr = buffer; // buffer is array of char, defined in myclass
size = res;
nmemb = sizeof(char);
return 1;
}
void myclass::upload_function(const std::string& url)
{
CURL *curl;
CURLcode res;
std::ifstream if_file;
if_file.open("/path_to_file", std::ios::binary);
handler = &if_file; // handler is defined in myclass
/* In windows, this will init the winsock stuff */
res = curl_global_init(CURL_GLOBAL_DEFAULT);
/* Check for errors */
if(res != CURLE_OK) {
// failure
return;
}
curl = curl_easy_init();
if(curl) {
curl_easy_setopt(curl, CURLOPT_URL, "hostname");
curl_easy_setopt(curl, CURLOPT_POST, 1L);
curl_easy_setopt(curl, CURLOPT_READFUNCTION, myclass::read_callback);
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
struct curl_slist *chunk = NULL;
chunk = curl_slist_append(chunk, "Transfer-Encoding: chunked");
chunk = curl_slist_append(chunk, "Content-Type: application/x-mpegURL");
res = curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
/* Perform the request, res will get the return code */
res = curl_easy_perform(curl);
/* Check for errors */
if(res != CURLE_OK) {
// failed
}
else
{
double speed_upload, total_time;
curl_easy_getinfo(curl, CURLINFO_SPEED_UPLOAD, &speed_upload);
curl_easy_getinfo(curl, CURLINFO_TOTAL_TIME, &total_time);
fprintf(stderr, "Speed: %0.3f b/sec during %.3f seconds\n",
speed_upload, total_time);
}
/* always cleanup */
curl_easy_cleanup(curl);
}
curl_global_cleanup();
if_file.close();
}
The callback doesn't seem to copy data to the buffer. It just assigns the local pointer, quite without any effect.
The callback looks like to be a C++ method that can't be used like that as a callback in a C API that doesn't know about C++ objects...
I'm doing a file download with libcurl in my c++ program. How can i detect if the request is a 404, and not do the file write? The code is:
void GameImage::DownloadImage(string file_name) {
string game_name;
game_name = file_name.substr(file_name.find_last_of("/")+1);
CURL *curl;
FILE *fp;
CURLcode res;
string url = "http://site/"+game_name+".png";
string outfilename = file_name+".png";
cout<<"INFO; attempting to download "<<url<<"..."<<endl;
curl = curl_easy_init();
if (curl) {
cout<<"INFO; downloading "<<url<<"..."<<endl;
fp = fopen(outfilename.c_str(), "wb");
cout<<"INFO; trying to open "<<outfilename<<" for file output"<<endl;
if (fp != NULL) {
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, GameImage::WriteData);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_FAILONERROR, true);
res = curl_easy_perform(curl);
long http_code = 0;
curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code);
curl_easy_cleanup(curl);
fclose(fp);
}
else {
cout<<"GameImage::DownloadImage; Couldn't open output file"<<endl;
}
}
}
size_t GameImage::WriteData(void *ptr, size_t size, size_t nmemb, FILE *stream) {
size_t written;
written = fwrite(ptr, size, nmemb, stream);
return written;
}
I can delete the 404 response after the transfer occurs, but it would be good to not even save the response.
You can check against CURLE_HTTP_RETURNED_ERROR
This is returned if CURLOPT_FAILONERROR is set to true and the HTTP server returns an error code that is >= 400. You can't grab the specific HTTP response code, but should be enough to accomplish what you want.
I know this is an old post, but the error you're doing is that you're not checking the return value of curl_easy_perform. Setting CURLOPT_FAILONERROR will not crash the program, instead, it will notify you of the error through the return variable you named res. To get rid of the empty file, you could do something like this:
void GameImage::DownloadImage(string file_name) {
string game_name;
game_name = file_name.substr(file_name.find_last_of("/")+1);
CURL *curl;
FILE *fp;
CURLcode res;
string url = "http://site/"+game_name+".png";
string outfilename = file_name+".png";
cout<<"INFO; attempting to download "<<url<<"..."<<endl;
curl = curl_easy_init();
if (curl) {
cout<<"INFO; downloading "<<url<<"..."<<endl;
fp = fopen(outfilename.c_str(), "wb");
cout<<"INFO; trying to open "<<outfilename<<" for file output"<<endl;
if (fp == NULL) {
cout<<"GameImage::DownloadImage; Couldn't open output file"<<endl;
curl_easy_cleanup(curl);
return;
}
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, GameImage::WriteData);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_FAILONERROR, true);
res = curl_easy_perform(curl);
fclose(fp);
if (res != CURLE_OK) {
cout<<"GameImage::DownloadImage; Failed to download file"<<endl;
remove(outfilename.c_str());
}
curl_easy_cleanup(curl);
}
}
size_t GameImage::WriteData(void *ptr, size_t size, size_t nmemb, FILE *stream) {
size_t written;
written = fwrite(ptr, size, nmemb, stream);
return written;
}