I'm trying to save site's source code to vector, where every line of source code is a new vector element, because I only need to use one specific line (number 47) in my program. Any idea how to do this?
Load the data from the URL.
Using cURL:
std::vector<char> LoadFromUrl(const std::string& url)
{
struct Content
{
std::vector<char> data;
static size_t Write(char * data, size_t size, size_t nmemb, void * p)
{
return static_cast<Content*>(p)->WriteImpl(data, size, nmemb);
}
size_t WriteImpl(char* ptr, size_t size, size_t nmemb)
{
data.insert(end(data), ptr, ptr + size * nmemb);
return size * nmemb;
}
};
Content content;
CURL* curl = curl_easy_init();
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &content);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &Content::Write);
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_perform(curl);
content.data.push_back('\0');
return content.data;
}
Tokenize the data using strtok or boost tokenizer or your own implementation:
// Downloads `url` and splits the content into lines on '\n'.
//
// Unlike a strtok-based split, blank lines are kept as empty strings, so
// lines[i] really is line i + 1 of the downloaded text. A final newline does
// not produce an extra empty element. The content is cut at the first '\0',
// which is where the terminator appended by LoadFromUrl sits.
std::vector<std::string> LoadLines(const std::string& url)
{
    const std::vector<char> content = LoadFromUrl(url);
    std::string text(content.begin(), content.end());
    const std::string::size_type terminator = text.find('\0');
    if (terminator != std::string::npos)
    {
        text.erase(terminator);
    }

    std::vector<std::string> lines;
    std::string::size_type start = 0;
    while (start < text.size())
    {
        const std::string::size_type stop = text.find('\n', start);
        if (stop == std::string::npos)
        {
            // Last line without a trailing newline.
            lines.push_back(text.substr(start));
            break;
        }
        lines.push_back(text.substr(start, stop - start));
        start = stop + 1;
    }
    return lines;
}
// Downloads one page and prints it line by line, each followed by a newline.
int main()
{
    const std::vector<std::string> pageLines = LoadLines(
        "http://stackoverflow.com/questions/10773009/save-sites-source-code-to-vectorstring");
    for (const std::string& line : pageLines)
    {
        std::cout << line << "\n";
    }
}
Related
I'm trying to use libcurl (http://curl.haxx.se/libcurl/c/) to download data from the web and store it in a txt file. Here is my code:
// CLASS SinaStk
size_t save_data(char *buffer, size_t size, size_t nmemb, FILE* userdata){
locale loc = std::locale::global(std::locale("")); //TRY TO OPEN FILE WITH CHINESE
userdata = fopen(fpath.c_str(), "w");
if (userdata == NULL)
printf("File not open!\n");
locale::global(loc);
size_t writelen=size * nmemb;
fwrite(buffer, size, nmemb, userdata);
return writelen;
};
virtual void downloadUrl()
{
CURL* stkCURL=NULL;
CURLcode res;
FILE * fp=NULL;
curl_global_init(CURL_GLOBAL_WIN32);
stkCURL = curl_easy_init();
curl_easy_setopt(stkCURL, CURLOPT_URL,"http://hq.sinajs.cn/list=s_sh000001");
curl_easy_setopt(stkCURL, CURLOPT_WRITEFUNCTION, &SinaStk::save_data);
curl_easy_setopt(stkCURL, CURLOPT_WRITEDATA,fp);
res=curl_easy_perform(stkCURL); //<-STOP!!!!
fclose(fp);
curl_easy_cleanup(stkCURL);
curl_global_cleanup();
return;
};
When I debug my code, it always stops and then jumps into xstring:
// Library code shown by the debugger inside the xstring header: the accessor
// that returns the stored length of a string.
size_type size() const _NOEXCEPT
{ // return length of sequence
return (this->_Mysize); // <-- the debugger stops here; presumably `this` is not a valid string object
}
0xC0000005: Access violation reading location 0x0000009E
I have been stuck on this problem for almost a week. I am upset; I asked people around me and nobody knows why.
Thanks for reading, I am really confused.
=============
The problem is solved! Thank you, guys! Now my code is:
//CLASS StkApiInfo
// Writes one chunk received from libcurl to "<fname>.txt" and echoes it to cout.
// `buffer` holds size * nmemb bytes and is NOT NUL-terminated, so it is written
// with an explicit length. Returns the number of bytes consumed, or 0 when the
// file cannot be opened or written, which makes libcurl abort the transfer.
// NOTE(review): the file is reopened (and truncated) for every chunk, so a
// response delivered in several chunks keeps only the last one - confirm that
// responses are always small enough to arrive in a single callback.
size_t writeData(char* buffer, size_t size, size_t nmemb){
    const size_t length = size * nmemb;
    if (stkFile.is_open()){
        stkFile.close();
        stkFile.clear();
    };
    fpath = "D:\\Code\\代码\\数据文件\\" + fname + ".txt";
    stkFile.open(fpath.c_str(), ios::out);
    if (!stkFile.is_open())
        return 0;
    cout.write(buffer, static_cast<std::streamsize>(length)) << size << nmemb;
    stkFile.write(buffer, static_cast<std::streamsize>(length)) << endl;
    const bool written = stkFile.good();
    stkFile.close();
    stkFile.clear();
    return written ? length : 0;
};
//CLASS SinaStk : public StkApiInfo
static size_t save_data(char *buffer, size_t size, size_t nmemb, void* userdata){
SinaStk* self = (SinaStk*)userdata;
return self->writeData(buffer, size, nmemb);
};
virtual void downloadUrl()
{
CURL* stkCURL = NULL;
CURLcode res;
curl_global_init(CURL_GLOBAL_WIN32);
stkCURL = curl_easy_init();
if (stkCURL)
{
curl_easy_setopt(stkCURL, CURLOPT_URL, stkUrl.c_str());
curl_easy_setopt(stkCURL, CURLOPT_WRITEFUNCTION, &SinaStk::save_data);
curl_easy_setopt(stkCURL, CURLOPT_WRITEDATA, this);
res = curl_easy_perform(stkCURL);
//if (res != CURLE_OK)
curl_easy_cleanup(stkCURL);
curl_global_cleanup();
}
return;
};
The callback passed with CURLOPT_WRITEFUNCTION must be of type write_callback (with exactly that signature) and therefore cannot be a non-static class method. The usual workaround is to define the callback as a non-member function or a static method and pass this as the user-data argument:
static size_t save_data(char *buffer, size_t size, size_t nmemb, void* userdata)
{
SinaStk* self = (SinaStk*) userdata;
return self->doStuff(buffer, size, nmemb);
}
// Sketch of the registration step: the static callback is installed together
// with `this`, so save_data can forward each chunk to this instance.
// The surrounding setup and cleanup are elided ("//...") in this excerpt.
virtual void downloadUrl()
{
//...
curl_easy_setopt(stkCURL, CURLOPT_WRITEFUNCTION, &SinaStk::save_data); // static member used as the callback
curl_easy_setopt(stkCURL, CURLOPT_WRITEDATA, this); // arrives in save_data as `userdata`
//...
}
If you need to access additional data (like the FILE* in your example), you can either store it as a class field or introduce a temporary structure that contains this and the additional data fields, and pass its address as the callback argument.
Disclaimer: I am not asking anyone to debug this code, I am more interested to know if anyone sees that I am using libcurl improperly, because as far as I can tell, I am following the documentation exactly.
The problem is in the MakeRequest() method. At curl_easy_perform(), I get std output of
* About to connect() to dynamodb.us-east-1.amazonaws.com port 80 (#0)
* Trying 72.21.195.244... * connected
Then a segfault.
Here is the stack trace:
Thread [1] 30267 [core: 0] (Suspended : Signal : SIGSEGV:Segmentation fault)
Curl_getformdata() at 0x7ffff79069bb
Curl_http() at 0x7ffff790b178
Curl_do() at 0x7ffff791a298
Curl_do_perform() at 0x7ffff7925457
CurlHttpClient::MakeRequest() at CurlHttpClient.cpp:91 0x7ffff7ba17f5
AWSClient::MakeRequest() at AWSClient.cpp:54 0x7ffff7bbac4d
DynamoDbV2Client::GetItem() at DynamoDbV2Client.cpp:34 0x7ffff7bb7380
GetItemResultTest_TestLiveRequest_Test::TestBody() at GetItemResultTest.cpp:88 0x43db5a
testing::internal::HandleSehExceptionsInMethodIfSupported<testing::Test, void>() at gtest-all.cc:3,562 0x46502f
testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, void>() at gtest-all.cc:3,598 0x4602f6
<...more frames...>
Here is the code in question.
#include "http/curl/CurlHttpClient.h"
#include "http/standard/StandardHttpResponse.h"
#include "utils/StringUtils.h"
#include <curl/curl.h>
#include <sstream>
#include <algorithm>
#include <functional>
#include <vector>
bool CurlHttpClient::isInit = false;
// Selects the HTTP verb on `requestHandle` for `method`.
// GET, POST and PUT map to their dedicated libcurl options; every other value
// is sent as a custom "DELETE" request.
void SetOptCodeForHttpMethod(CURL* requestHandle, HttpMethod method)
{
    switch (method)
    {
    case GET:
        curl_easy_setopt(requestHandle, CURLOPT_HTTPGET, 1L);
        break;
    case POST:
        // CURLOPT_POST takes a long flag. CURLOPT_HTTPPOST (used before)
        // expects a struct curl_httppost*, so passing 1 made libcurl
        // dereference an invalid pointer in Curl_getformdata().
        curl_easy_setopt(requestHandle, CURLOPT_POST, 1L);
        break;
    case PUT:
        // CURLOPT_UPLOAD is the documented replacement for the deprecated
        // CURLOPT_PUT; for HTTP it issues a PUT request.
        curl_easy_setopt(requestHandle, CURLOPT_UPLOAD, 1L);
        break;
    default:
        curl_easy_setopt(requestHandle, CURLOPT_CUSTOMREQUEST, "DELETE");
        break;
    }
}
// Runs libcurl's global initialisation the first time a client is constructed;
// later constructions see isInit set and do nothing.
CurlHttpClient::CurlHttpClient()
{
    if (isInit)
    {
        return;
    }
    isInit = true;
    curl_global_init(CURL_GLOBAL_ALL);
}
// Nothing to release: the client keeps no per-instance curl state.
CurlHttpClient::~CurlHttpClient() = default;
// Performs `request` synchronously on a fresh easy handle.
//
// Returns a newly allocated StandardHttpResponse (the caller takes ownership),
// or NULL when no easy handle could be created. The response code is 0 when
// libcurl received no HTTP response at all.
HttpResponse* CurlHttpClient::MakeRequest(const HttpRequest& request) const
{
    // Build the "Name: value" header list libcurl expects.
    struct curl_slist* headers = NULL;
    HeaderValueCollection requestHeaders = request.GetHeaders();
    for (HeaderValueCollection::iterator iter = requestHeaders.begin();
         iter != requestHeaders.end(); ++iter)
    {
        std::stringstream headerStream;
        headerStream << iter->first << ": " << iter->second;
        // curl_slist_append returns NULL on failure; keep the existing list
        // in that case instead of losing (and leaking) it.
        struct curl_slist* appended = curl_slist_append(headers, headerStream.str().c_str());
        if (appended)
        {
            headers = appended;
        }
    }

    HttpResponse* response = NULL;
    CURL* singleRequestHandle = curl_easy_init();
    if (singleRequestHandle)
    {
        if (headers)
        {
            curl_easy_setopt(singleRequestHandle, CURLOPT_HTTPHEADER, headers);
        }

        // CURLOPT_POSTFIELDS does not copy its argument, so the form data must
        // stay alive in this local until curl_easy_perform() has returned.
        std::string formParameters;
        if (request.GetMethod() == HttpMethod::POST)
        {
            formParameters = request.GetUri().GetFormParameters().c_str();
            curl_easy_setopt(singleRequestHandle, CURLOPT_POSTFIELDS, formParameters.c_str());
        }

        response = new StandardHttpResponse(request);
        SetOptCodeForHttpMethod(singleRequestHandle, request.GetMethod());
        std::string url = request.GetURIString(false);
        curl_easy_setopt(singleRequestHandle, CURLOPT_URL, url.c_str());
        curl_easy_setopt(singleRequestHandle, CURLOPT_WRITEFUNCTION, &CurlHttpClient::WriteData);
        curl_easy_setopt(singleRequestHandle, CURLOPT_WRITEDATA, response);
        curl_easy_setopt(singleRequestHandle, CURLOPT_HEADERFUNCTION, &CurlHttpClient::WriteHeader);
        curl_easy_setopt(singleRequestHandle, CURLOPT_HEADERDATA, response);

        if (request.GetContentBody() != NULL)
        {
            // CURLOPT_POSTFIELDSIZE takes a long; a stream position object
            // must not be passed through the variadic argument list.
            const std::streamoff bodySize = request.GetContentBody()->tellp();
            curl_easy_setopt(singleRequestHandle, CURLOPT_POSTFIELDSIZE, static_cast<long>(bodySize));
            curl_easy_setopt(singleRequestHandle, CURLOPT_READFUNCTION, &CurlHttpClient::ReadBody);
            curl_easy_setopt(singleRequestHandle, CURLOPT_READDATA, &request);
        }

        curl_easy_setopt(singleRequestHandle, CURLOPT_VERBOSE, 1L);
        const CURLcode performResult = curl_easy_perform(singleRequestHandle);

        // CURLINFO_RESPONSE_CODE writes a long; an int target is overrun on
        // platforms where long is wider than int.
        long responseCode = 0;
        curl_easy_getinfo(singleRequestHandle, CURLINFO_RESPONSE_CODE, &responseCode);
        response->SetResponseCode(static_cast<HttpResponseCode>(responseCode));

        if (performResult == CURLE_OK)
        {
            // libcurl reports NULL when the server sent no Content-Type.
            char* contentType = NULL;
            curl_easy_getinfo(singleRequestHandle, CURLINFO_CONTENT_TYPE, &contentType);
            if (contentType)
            {
                response->SetContentType(contentType);
            }
        }

        curl_easy_cleanup(singleRequestHandle);
    }

    if (headers)
    {
        curl_slist_free_all(headers);
    }
    return response;
}
size_t CurlHttpClient::WriteData(char *ptr, size_t size, size_t nmemb, void* userdata)
{
if (ptr)
{
HttpResponse* response = (HttpResponse*)userdata;
if (!response->GetResponseBody())
{
std::streambuf* strBuffer = new std::stringbuf;
response->SetResponseBody(new std::iostream(strBuffer));
}
int sizeToWrite = size * nmemb;
response->GetResponseBody()->write(ptr, sizeToWrite);
return sizeToWrite;
}
return 0;
}
size_t CurlHttpClient::WriteHeader(char *ptr, size_t size, size_t nmemb, void* userdata)
{
if (ptr)
{
HttpResponse* response = (HttpResponse*)userdata;
std::string headerLine(ptr);
std::vector<std::string> keyValuePair = StringUtils::Split(headerLine, ':');
if (keyValuePair.size() == 2)
{
std::string headerName = keyValuePair[0];
headerName = StringUtils::Trim(headerName);
std::string headerValue = keyValuePair[1];
headerValue = StringUtils::Trim(headerValue);
response->AddHeader(headerName, headerValue);
}
return size * nmemb;
}
return 0;
}
size_t CurlHttpClient::ReadBody(char* ptr, size_t size, size_t nmemb, void* userdata)
{
HttpRequest* request = (HttpRequest*)userdata;
std::shared_ptr<std::iostream> outputStream = request->GetContentBody();
if (outputStream != NULL && size * nmemb)
{
size_t written = outputStream->readsome(ptr, size * nmemb);
return written;
}
return 0;
}
For reference here is the definition for CurlHttpClient:
//Curl implementation of an http client. Right now it is only synchronous.
class CurlHttpClient : public HttpClient
{
public:
//Creates the client; calls curl_global_init() the first time an instance is constructed (tracked by isInit).
CurlHttpClient();
//Destructor. NOTE(review): the out-of-line definition does nothing and does not call curl_global_cleanup() - confirm this is intended.
virtual ~CurlHttpClient();
//Makes the request and receives the response synchronously. Returns NULL when no curl handle could be created.
virtual HttpResponse* MakeRequest(const HttpRequest& request) const;
private:
//Callback that reads the content body of the request into curl's buffer
static size_t ReadBody(char* ptr, size_t size, size_t nmemb, void* userdata);
//Callback that writes the response content to the response object
static size_t WriteData( char* ptr, size_t size, size_t nmemb, void* userdata);
//Callback that writes the response headers to the response object
static size_t WriteHeader( char* ptr, size_t size, size_t nmemb, void* userdata);
//Set once curl_global_init() has been called.
static bool isInit;
};
One definite problem I see with the code is
curl_easy_setopt(requestHandle, CURLOPT_HTTPPOST, 1);
CURLOPT_HTTPPOST expects a pointer to a structure of type struct curl_httppost. Passing 1 gives libcurl an invalid pointer, which it then dereferences. You probably want to use CURLOPT_POST instead.
I read a few articles on c++ / curl here on stackoverflow and assembled the following.
The main goal is to handle the whole request in an instance of a class -- and maybe later in a secondary thread.
My problem is: "content_" seems to stay empty though its the same addr and
HttpFetch.h:
// Fetches a URL with libcurl and collects the response in content_.
// NOTE(review): the class holds a raw CURL* but is implicitly copyable, so two
// copies would share (and could both use) the same handle.
class HttpFetch
{
private:
// Easy handle; created in the constructor and released in start().
CURL *curl;
// Static write callback registered with CURLOPT_WRITEFUNCTION.
static size_t handle(char * data, size_t size, size_t nmemb, void * p);
// NOTE(review): declared, but no definition appears in the HttpFetch.cpp shown.
size_t handle_impl(char * data, size_t size, size_t nmemb);
public:
// Receives the downloaded bytes; its address is registered as CURLOPT_WRITEDATA.
std::string content_;
// NOTE(review): not referenced by the HttpFetch.cpp shown.
static std::string url_;
// Creates the handle and configures it for `url`.
HttpFetch(std::string url);
// Performs the transfer and releases the handle.
void start();
// NOTE(review): declared, but no definition appears in the HttpFetch.cpp shown.
std::string data();
};
HttpFetch.cpp:
// Creates the easy handle and configures it to download `url` into content_.
// If the handle cannot be created, `curl` stays null and nothing is configured.
HttpFetch::HttpFetch(std::string url) {
    curl_global_init(CURL_GLOBAL_ALL); //pretty obvious
    curl = curl_easy_init();
    // Debug aids kept from the original: seed the buffer and print its address.
    content_.append("Test");
    std::cout << &content_ << "\n";
    if (!curl)
        return;
    // CURLOPT_URL expects a NUL-terminated char*, not the address of a
    // std::string object; libcurl copies the string, so `url` may go away.
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &content_);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &HttpFetch::handle);
    //curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); //tell curl to output its progress
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    //std::cout << &content_ << "\n";
}
// Performs the transfer and releases the easy handle.
// The handle is cleared afterwards, so calling start() again is a harmless
// no-op instead of using a handle that has already been cleaned up.
void HttpFetch::start() {
    if (!curl)
        return;
    curl_easy_perform(curl);
    curl_easy_cleanup(curl);
    curl = NULL;
}
// Write callback: appends the received chunk to the std::string passed as `p`
// and prints that string's address.
size_t HttpFetch::handle(char * data, size_t size, size_t nmemb, void * p)
{
    const size_t chunk = size * nmemb;
    std::string * const target = static_cast<std::string*>(p);
    target->append(data, chunk);
    std::cout << target << "\n"; // prints the pointer, not the text it points to
    return chunk;
}
main.cpp:
#include "HttpFetch.h"
// Fetches one page and prints what was collected.
int main(int argc, const char * argv[])
{
    // Construct the fetcher in place. `HttpFetch call = *new HttpFetch(...)`
    // leaked the heap object and copied it, so the callback filled a different
    // content_ than the one printed below.
    HttpFetch call("http://www.example.com");
    call.start();
    ::std::cout << call.content_ << "\n";
}
Thanks in advance
There are several problems with your code. The main problem is the line
HttpFetch call = *new HttpFetch("http://www.example.com");
You create a new HttpFetch instance and copy it to another one. So you have two instances and two content strings. To remove this issue change it to:
HttpFetch call("http://www.example.com");
Another error is the line
curl_easy_setopt(curl, CURLOPT_URL, &url);
which should be
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
You could have avoided such issues if you had thought about resource management in a class like HttpFetch. Since HttpFetch manages a resource (a curl handle), you have to think about how to initialize and clean up this resource and how to handle copy, assignment, or move. If you use C++11, the easiest solution is to use a std::unique_ptr, which handles all of that for you.
class HttpFetch
{
public:
HttpFetch(const std::string& url);
void start();
void Print(std::ostream& stream);
private:
typedef void (*cleanup)(CURL*);
typedef std::unique_ptr<CURL, cleanup> CurlHandle;
CurlHandle curlHandle;
std::string content_;
static size_t handle(char * data, size_t size, size_t nmemb, void * p);
};
// Creates the owned easy handle and configures it to download `url` into
// content_, following redirects.
HttpFetch::HttpFetch(const std::string& url)
    : curlHandle(curl_easy_init(), &curl_easy_cleanup)
{
    CURL* const easy = curlHandle.get();
    curl_easy_setopt(easy, CURLOPT_URL, url.c_str());
    curl_easy_setopt(easy, CURLOPT_WRITEDATA, &content_);
    curl_easy_setopt(easy, CURLOPT_WRITEFUNCTION, &HttpFetch::handle);
    curl_easy_setopt(easy, CURLOPT_FOLLOWLOCATION, 1L);
}
// Write callback: appends the received chunk to the std::string passed as `p`
// and reports every byte as consumed.
size_t HttpFetch::handle(char * data, size_t size, size_t nmemb, void * p){
    const size_t chunk = size * nmemb;
    std::string * const target = static_cast<std::string*>(p);
    target->append(data, chunk);
    return chunk;
}
void HttpFetch::start() {
content_.clear();
curl_easy_perform(curlHandle.get());
}
// Writes the collected content to `stream`.
void HttpFetch::Print(std::ostream& stream){
    const std::string& body = content_;
    stream << body;
}
int main()
{
//HttpFetch call = *new HttpFetch("..."); // this is a copiler error now
HttpFetch call("http://www.google.com");
call.start();
call.Print(std::cout);
}
Using a unique_ptr, your class HttpFetch becomes non-copyable and move-only. This makes sense until you provide logic to copy or share a CURL handle between different instances of HttpFetch.
I have a Python script that downloads exchange rates from a web page, and I want to make a C++ program from it. Here is what I have so far:
#include <iostream>
#include <time.h>
#include <stdio.h>
#include <curl/curl.h>
#include <curl/easy.h>
#include <string>
#define CURL_STATICLIB
using namespace std;
// Stores the current local date in `d`, formatted as "%d%m%y"
// (two digits each for day, month and year).
void dat(string &d){
    const time_t now = time(NULL);
    const struct tm *local = localtime(&now);
    char stamp[80];
    strftime(stamp, sizeof stamp, "%d%m%y", local);
    d = stamp;
}
// libcurl write callback: writes nmemb items of `size` bytes to `stream`.
// Returns the number of BYTES written. libcurl compares the result against
// size * nmemb, so returning fwrite's item count would be reported as an error
// whenever size is not 1.
size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) {
    const size_t items = fwrite(ptr, size, nmemb, stream);
    return items * size;
}
int main()
{
string f;
dat(f);
string l1="http://www.hnb.hr/tecajn/f";
string l2=".dat";
string linkz=l1+f+l2;
cout << linkz;
CURL *curl;
FILE *fp;
CURLcode res;
char *url = linkz;
char outfilename[FILENAME_MAX] = "/home/tomi/data.txt";
curl = curl_easy_init();
if (curl) {
fp = fopen(outfilename,"wb");
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
res = curl_easy_perform(curl);
curl_easy_cleanup(curl);
fclose(fp);
return 0;
}
It gives me this error when I try to compile. I found an existing algorithm for downloading a text file and used it, so I hope it is correct.
If you'd have pointed out the line where you got the error, I wouldn't have had to track it down to:
string linkz=l1+f+l2;
...
char *url = linkz;
You can use c_str() to get a pointer to the const characters in the string. So this will do:
char const* url = linkz.c_str();
You could have that very same line in the setopt call, or have url be an std::string as well.
char *url = linkz; should be const char* url = linkz.c_str(); assuming you really need a C-style string for API reasons.
The problem is in this line:
char *url = linkz;
"linkz" is a std::string, but "url" is a char *. Try using the c_str method of string to get what you need, like so:
const char * url = linkz.c_str();
I'd like to use the libcurl library to open a remote data file and iterate through it with an istream. I've looked through the nice example in this thread, but it writes the remote file to a local file. Instead, I'd like to have the remote reads be pushed to an istream for subsequent programmatic manipulation. Is this possible? I would greatly appreciate help.
Best,
Aaron
Boost's IO Stream might be a better solution than STL's own stream. At least it is much simpler to create a boost stream. From boost's own docs:
#include <ios>
#include <memory>
#include <string>

#include <curl/curl.h>
#include <boost/iostreams/stream.hpp>
class CURLDevice
{
private:
CURL* handle;
public:
typedef char char_type;
typedef boost::iostreams::source_tag category;
CURLDevice()
{
handle = curl_easy_init();
}
CURLDevice(const std::string &url)
{
handle = curl_easy_init();
open( url );
}
~CURLDevice()
{
curl_easy_cleanup(handle);
}
void open(const std::string &url)
{
curl_easy_setopt(handle, CURLOPT_URL, url.c_str());
curl_easy_setopt(handle, CURLOPT_CONNECT_ONLY, 1);
curl_easy_perform(handle);
}
std::streamsize read(char* s, std::streamsize n)
{
size_t read;
CURLcode ret = curl_easy_recv(handle, s, n, &read);
if ( ret == CURLE_OK || ret == CURLE_AGAIN )
return read;
else
return -1;
}
};
typedef boost::iostreams::stream<CURLDevice> CURLStream;
// Streams the page to standard output in blocks of up to 256 bytes, stopping
// after the first read that yields no data.
int main(int argc, char **argv)
{
    curl_global_init(CURL_GLOBAL_ALL);
    {
        CURLStream stream("http://google.com");
        char buffer[256];
        for (;;)
        {
            stream.read( buffer, 256 );
            const int got = stream.gcount();
            std::cout.write( buffer, got );
            if ( got <= 0 )
                break;
        }
    }
    curl_global_cleanup();
    return 0;
}
Note: when I run the code above I get a segfault in CURL, this appears to be because I don't know exactly how to use curl itself.