Import CSV into Vertica using Rfc4180CsvParser and exclude header row - c++

Is there a way to exclude the header row when importing data via the Rfc4180CsvParser? The COPY command has a SKIP option but the option doesn't seem to work when using the CSV parsers provided in the Vertica SDK.
Background
As background, the COPY command does not read CSV files by itself. For simple CSV files, one can say COPY schema.table FROM '/data/myfile.csv' DELIMITER ',' ENCLOSED BY '"'; but this will fail with data files which have string values with embedded quotes.
Adding ESCAPE AS '"' will generate an error ERROR 3169: ENCLOSED BY and ESCAPE AS can not be the same value . This is a problem as CSV values are enclosed and escaped by ".
Vertica SDK CsvParser extensions to the rescue
Vertica provides an SDK under /opt/vertica/sdk/examples with C++ programs that can be compiled into extensions. One of these is /opt/vertica/sdk/examples/ParserFunctions/Rfc4180CsvParser.cpp.
This works great as follows:
cd /opt/vertica/sdk/examples
make clean
vsql
==> CREATE LIBRARY Rfc4180CsvParserLib AS '/opt/vertica/sdk/examples/build/Rfc4180CsvParser.so';
==> COPY myschema.mytable FROM '/data/myfile.csv' WITH PARSER Rfc4180CsvParser();
Problem
The above works great except that it imports the first row of the data file as a row. The COPY command has a SKIP 1 option but this does not work with the parser.
Question
Is it possible to edit Rfc4180CsvParser.cpp to skip the first row, or better yet, take some parameter to specify the number of rows to skip?
The program is just 135 lines but I don't see where/how to make this incision. Hints?
Copying the entire program below as I don't see a public repo to link to...
Rfc4180CsvParser.cpp
/* Copyright (c) 2005 - 2012 Vertica, an HP company -*- C++ -*- */
#include "Vertica.h"
#include "StringParsers.h"
#include "csv.h"
using namespace Vertica;
// Note, the class template is mostly for demonstration purposes,
// so that the same class can use each of two string-parsers.
// Custom parsers can also just pick a string-parser to use.
/**
* A parser that parses something approximating the "official" CSV format
* as defined in IETF RFC-4180: <http://tools.ietf.org/html/rfc4180>
* Oddly enough, many "CSV" files don't actually conform to this standard
* for one reason or another. But for sources that do, this parser should
* be able to handle the data.
* Note that the CSV format does not specify how to handle different
* data types; it is entirely a string-based format.
* So we just use standard parsers based on the corresponding column type.
*/
/**
 * UDParser that feeds the input stream through libcsv and submits each
 * parsed field to Vertica, converting it with the string parser selected
 * by the StringParsersImpl template argument.
 */
template <class StringParsersImpl>
class LibCSVParser : public UDParser {
public:
    LibCSVParser() : colNum(0) {}

    // Keep a copy of the information about each column.
    // Note that Vertica doesn't let us safely keep a reference to
    // the internal copy of this data structure that it shows us.
    // But keeping a copy is fine.
    SizedColumnTypes colInfo;

    // An instance of the class containing the methods that we're
    // using to parse strings to the various relevant data types
    StringParsersImpl sp;

    /// Current column index
    size_t colNum;

    /// Parsing state for libcsv
    struct csv_parser parser;

    // Format strings
    std::vector<std::string> formatStrings;

    /**
     * Given a field in string form (a pointer to the first character and
     * a length), submit that field to Vertica.
     * `colNum` is the column number from the input file; how many fields
     * it is into the current record.
     *
     * Returns false when the field lies beyond the last known column;
     * otherwise true for an empty field (written as NULL) or whatever
     * parseStringToType reports.
     */
    bool handleField(size_t colNum, char* start, size_t len) {
        if (colNum >= colInfo.getColumnCount()) {
            // Ignore column overflow
            return false;
        }
        // Empty columns are null.
        if (len == 0) {
            writer->setNull(colNum);
            return true;
        } else {
            return parseStringToType(start, len, colNum,
                                     colInfo.getColumnType(colNum), writer, sp);
        }
    }

    /**
     * Field callback handed to csv_parse/csv_fini. `p` is the LibCSVParser
     * instance; the field is forwarded to handleField and the column index
     * is advanced. The result of handleField is deliberately not inspected.
     */
    static void handle_record(void *data, size_t len, void *p) {
        static_cast<LibCSVParser*>(p)->handleField(
            static_cast<LibCSVParser*>(p)->colNum++, (char*)data, len);
    }

    /**
     * End-of-row callback handed to csv_parse/csv_fini: resets the column
     * index and advances the writer to the next output row.
     */
    static void handle_end_of_row(int c, void *p) {
        // Ignore 'c' (the terminating character); trust that it's correct
        static_cast<LibCSVParser*>(p)->colNum = 0;
        static_cast<LibCSVParser*>(p)->writer->next();
    }

    /**
     * Consume the unread part of `input` with csv_parse until it reports
     * that nothing more was processed. Once the input is at END_OF_FILE and
     * fully consumed, flush libcsv with csv_fini and return DONE; otherwise
     * ask for more input.
     */
    virtual StreamState process(ServerInterface &srvInterface, DataBuffer &input,
                                InputState input_state) {
        size_t processed;
        while ((processed = csv_parse(&parser, input.buf + input.offset,
                                      input.size - input.offset,
                                      handle_record, handle_end_of_row, this)) > 0) {
            input.offset += processed;
        }
        if (input_state == END_OF_FILE && input.size == input.offset) {
            csv_fini(&parser, handle_record, handle_end_of_row, this);
            return DONE;
        }
        return INPUT_NEEDED;
    }

    // Defined out of line (a generic version plus a FormattedStringParsers
    // specialization).
    virtual void setup(ServerInterface &srvInterface, SizedColumnTypes &returnType);

    /// Releases the libcsv parser state.
    virtual void destroy(ServerInterface &srvInterface, SizedColumnTypes &returnType) {
        csv_free(&parser);
    }
};
/**
 * Generic setup: initialise the libcsv parser state (CSV_APPEND_NULL option)
 * and keep a private copy of the output column description.
 */
template <class StringParsersImpl>
void LibCSVParser<StringParsersImpl>::setup(ServerInterface &srvInterface,
                                            SizedColumnTypes &returnType) {
    csv_init(&parser, CSV_APPEND_NULL);
    colInfo = returnType;
}
/**
 * Setup for the FormattedStringParsers flavour: in addition to the generic
 * initialisation, make sure there is one format string per output column
 * (padding with empty strings) and hand the list to the string parser.
 */
template <>
void LibCSVParser<FormattedStringParsers>::setup(ServerInterface &srvInterface,
                                                 SizedColumnTypes &returnType) {
    csv_init(&parser, CSV_APPEND_NULL);
    colInfo = returnType;

    // One format entry per column; missing entries default to "".
    const bool countMismatch = formatStrings.size() != returnType.getColumnCount();
    if (countMismatch) {
        formatStrings.resize(returnType.getColumnCount(), "");
    }

    sp.setFormats(formatStrings);
}
/**
 * ParserFactory that produces LibCSVParser instances parameterised on the
 * same string-parser implementation.
 */
template <class StringParsersImpl>
class LibCSVParserFactoryTmpl : public ParserFactory {
public:
    /// Nothing to plan for this parser.
    virtual void plan(ServerInterface &srvInterface,
                      PerColumnParamReader &perColumnParamReader,
                      PlanContext &planCtxt)
    {
    }

    /// Creates the parser object using the server-provided allocator.
    virtual UDParser* prepare(ServerInterface &srvInterface,
                              PerColumnParamReader &perColumnParamReader,
                              PlanContext &planCtxt,
                              const SizedColumnTypes &returnType)
    {
        typedef LibCSVParser<StringParsersImpl> ParserType;
        return vt_createFuncObj(srvInterface.allocator, ParserType);
    }
};
// Register one factory per string-parser flavour.
typedef LibCSVParserFactoryTmpl<StringParsers> LibCSVParserFactory;
RegisterFactory(LibCSVParserFactory);

typedef LibCSVParserFactoryTmpl<FormattedStringParsers> FormattedLibCSVParserFactory;
RegisterFactory(FormattedLibCSVParserFactory);

The quick and dirty way would be to just hardcode it. It's using a callback to handle_end_of_row. Track the row number and just don't process the first row . Something like:
/**
 * End-of-row callback that discards the first row (the header).
 *
 * This is a static member, so all state has to be reached through `ptr`.
 * `rowcnt` must therefore be a data member of LibCSVParser (e.g. a size_t
 * initialised to 0) rather than a bare name.
 * NOTE(review): bad_field and currSrvInterface are members of the parser
 * version this snippet was written against — confirm they exist in yours.
 */
static void handle_end_of_row(int c, void *ptr) {
    // Ignore 'c' (the terminating character); trust that it's correct
    LibCSVParser *p = static_cast<LibCSVParser*>(ptr);
    p->colNum = 0;
    if (p->rowcnt <= 0) {
        // Header row: drop whatever was parsed and do not emit a row.
        p->bad_field = "";
        p->rowcnt++;
    } else if (p->bad_field.empty()) {
        p->writer->next();
    } else {
        // libcsv doesn't give us the whole row to reject.
        // So just write to the log.
        // TODO: Come up with something more clever.
        if (p->currSrvInterface) {
            p->currSrvInterface->log("Invalid CSV field value: '%s' Row skipped.",
                                     p->bad_field.c_str());
        }
        p->bad_field = "";
    }
}
Also, it is best to initialize rowcnt = 0 in process, since I think it will be called for each file in your COPY statement. There might be more clever ways of doing this. Basically, this will just process the record and then discard it.
As for supporting SKIP generically... look at TraditionalCSVParser for how to handle parameter passing. You'd have to add it to the parser factory's prepare, send the value in to the LibCSVParser class, and override getParameterType. Then in LibCSVParser you need to accept the parameter in the constructor, and modify process to skip the first skip rows. Then use that value instead of the hardcoded 0 above.

Related

Byte offset greater than Byte Length in BufferView

I'm trying to read data from scene.bin files using the Microsoft::glTF SDK. TinyGLTF is not an option. When I try to read the MeshPrimitive attribute called TEXCOORD_0, I get a situation where the BufferView byteOffset is greater than its byteLength. Therefore, I don't know how to properly read the given data, and my program crashes.
I tried reading data using IStreamReader which is a part of SDK, and is a must when reading bin files using this SDK. I calculate data offset by adding accessor.byteOffset + bufferView.byteOffset which is > byteLength.
// Bundles the three glTF objects needed to locate one accessor's data:
// the accessor itself, the buffer view it refers to, and the underlying buffer.
struct BuffersAccessors {
Microsoft::glTF::Accessor accessor;
Microsoft::glTF::BufferView view;
Microsoft::glTF::Buffer buffer;
// Assignment taking its argument by value and returning nothing; only
// declared here, the definition is not part of this excerpt.
void operator=(BuffersAccessors accessors);
};
// Pairs the glTF lookup objects for one accessor with the elements read for it.
template<typename T> struct BufferInfo {
    BuffersAccessors buffersAccessors;  // where the data lives
    std::vector<T> bufferData;          // the elements read from the buffer

    // Constructors are declared with the injected class name; writing them as
    // `BufferInfo<T>()` is rejected by C++20 compilers. Definitions are not
    // part of this excerpt.
    BufferInfo();
    BufferInfo(BuffersAccessors buffersAccessors, std::vector<T> bufferData);

    // Member-wise copy assignment. Returns *this so assignments can be chained.
    BufferInfo& operator=(const BufferInfo &info) {
        buffersAccessors = info.buffersAccessors;
        bufferData = info.bufferData;
        return *this;
    }
};
// Reads the bytes described by bufferInfo's buffer view from the file that the
// buffer's uri names (appended to `path`) and returns them as a vector of T.
// Returns an empty vector when the buffer has neither a uri nor a byte length.
template<typename T>
std::vector<T> readBufferData(Microsoft::glTF::Document document, BufferInfo<T> bufferInfo, std::filesystem::path path) {
    const bool hasSource = bufferInfo.buffersAccessors.buffer.uri.length() > 0
                        || bufferInfo.buffersAccessors.buffer.byteLength > 0;
    if (!hasSource) {
        return std::vector<T>();
    }

    // Resolve the buffer's uri against the supplied path.
    path += bufferInfo.buffersAccessors.buffer.uri;
    path = std::filesystem::absolute(path);

    // Work on a copy of the buffer whose uri is the absolute file name.
    Microsoft::glTF::Buffer resolvedBuffer = bufferInfo.buffersAccessors.buffer;
    resolvedBuffer.uri = path.string();

    std::shared_ptr<StreamReader> fileReader = std::make_shared<StreamReader>(path);
    Microsoft::glTF::GLTFResourceReader resourceReader(fileReader);
    return resourceReader.ReadBinaryData<T>(resolvedBuffer, bufferInfo.buffersAccessors.view);
}
// Looks up the accessor named by accessorKey, reads its buffer view from disk
// and returns the elements starting at the accessor's position in that view.
template<typename T>
BufferInfo<T> getFullBufferData(Microsoft::glTF::Document document, std::string accessorKey, std::filesystem::path path) {
    BufferInfo<T> bufferInfo{};
    BuffersAccessors mainPart = getBufferAccessorFromDocument(document, accessorKey);
    bufferInfo.buffersAccessors = mainPart;
    std::vector<T> bufferData = vkglTF::readBufferData<T>(document, bufferInfo, path);

    // The vector was read through the buffer view, so the view's own byteOffset
    // must not be skipped a second time; only the accessor's offset (which the
    // glTF spec defines relative to the start of the view) remains.
    // NOTE(review): relies on ReadBinaryData(buffer, view) returning exactly the
    // view's bytes — confirm against the glTF SDK.
    // The offset is in BYTES while the vector is indexed in ELEMENTS of T.
    const size_t elementsToSkip = mainPart.accessor.byteOffset / sizeof(T);
    // Never erase past the end, whatever the file claims.
    const size_t eraseCount = elementsToSkip < bufferData.size() ? elementsToSkip : bufferData.size();
    bufferData.erase(bufferData.begin(), bufferData.begin() + eraseCount);

    bufferInfo.bufferData = bufferData;
    return bufferInfo;
}
I expect data in formats like uint8 and uint16 but my program crashes when trying to do bufferData.erase(..).
Edit: This happens while reading WEIGHTS_0 too.
I think the most likely error with your code is the mixing of byte offsets and vector element indices. Have you tried dividing bufferDataOffset by sizeof(T)?
Second, if you only want to read an accessor's data then try using the ReadBinaryData overload that accepts an Accessor parameter instead. That way the glTF SDK will handle all of the offset calculations for you.
There is no documentation but the deserialize sample demonstrates the basic code structure recommended when using the glTF SDK.

Why can't one clone a `Space` in Gecode before solving the original one?

I'm looking for a way to copy Space instances in Gecode and then analyze the difference between the spaces later.
However it goes already wrong after the first copy. When one copies the code in the book Modelling and Programming in Gecode, as shown here below, and simply modifies it such that a copy is made first (SendMoreMoney* smm = m->copy(true);), one gets a Segmentation fault, regardless whether the shared option is true or false.
#include <gecode/int.hh>
#include <gecode/search.hh>
using namespace Gecode;
// Gecode model of the SEND + MORE = MONEY puzzle: eight digit variables,
// one per letter, in the order S E N D M O R Y.
class SendMoreMoney : public Space {
protected:
    IntVarArray l;
public:
    // Builds the model: eight variables with domain 0..9 plus all constraints.
    SendMoreMoney(void) : l(*this, 8, 0, 9) {
        // Positions of the letters inside l.
        enum { iS = 0, iE, iN, iD, iM, iO, iR, iY };

        // no leading zeros
        rel(*this, l[iS], IRT_NQ, 0);
        rel(*this, l[iM], IRT_NQ, 0);

        // all letters distinct
        distinct(*this, l);

        // linear equation: SEND + MORE - MONEY = 0, one term per digit
        const int termCount = 4 + 4 + 5;
        const int weight[termCount] = {
            1000, 100, 10, 1,               // S E N D
            1000, 100, 10, 1,               // M O R E
            -10000, -1000, -100, -10, -1    // M O N E Y
        };
        const int letter[termCount] = {
            iS, iE, iN, iD,
            iM, iO, iR, iE,
            iM, iO, iN, iE, iY
        };
        IntArgs c(termCount);
        IntVarArgs x(termCount);
        for (int t = 0; t < termCount; ++t) {
            c[t] = weight[t];
            x[t] = l[letter[t]];
        }
        linear(*this, c, x, IRT_EQ, 0);

        // post branching
        branch(*this, l, INT_VAR_SIZE_MIN(), INT_VAL_MIN());
    }

    // search support: copy constructor used by copy()
    SendMoreMoney(bool share, SendMoreMoney& s) : Space(share, s) {
        l.update(*this, share, s.l);
    }

    // search support: creates the copy via the constructor above
    virtual SendMoreMoney* copy(bool share) {
        return new SendMoreMoney(share, *this);
    }

    // print solution
    void print(void) const {
        std::cout << l << std::endl;
    }
};
// main function
int main(int argc, char* argv[]) {
// create model and search engine
SendMoreMoney* m = new SendMoreMoney;
SendMoreMoney* mc = m->copy(true);
DFS<SendMoreMoney> e(m);
delete m;
// search and print all solutions
while (SendMoreMoney* s = e.next()) {
s->print(); delete s;
}
return 0;
}
How can one make a real copy?
You have to call status() on the Space first.
I found this exchange in the Gecode mailing list archives: https://www.gecode.org/users-archive/2006-March/000439.html
It would seem that Gecode uses the copy function and constructor for its own internal purposes, so to make a "copy-by-value" copy of a space, you need to use the clone() function defined in the Space interface. However, as noted in @Anonymous's answer, you need to call status() before calling clone(), or it will throw an exception of type SpaceNotStable.
I augmented my space with the function below to automatically call status, make the clone, and return a pointer of my derived type:
struct Example : public Space {
    ...
    // Returns an independent clone of this space typed as Example.
    // status() runs first because clone() throws SpaceNotStable on a space
    // that has not been propagated.
    Example * cast_clone() {
        status();
        return static_cast<Example *>(this->clone());
    }
    ...
};
As a workaround, one can create a totally independent space and then use equality constraints
on the variable level to reduce the domains of these variables.
Example:
// For the first half of the variables, constrain this space's variable to the
// value its counterpart already has in `origin`; unassigned ones are skipped.
void cloneHalfValues(SendMoreMoney* origin) {
    const int half = l.size() / 2;
    for (int idx = 0; idx < half; ++idx) {
        if (!origin->l[idx].assigned()) {
            continue;
        }
        rel(*this, l[idx], IRT_EQ, origin->l[idx].val());
    }
}
The reason why one can't clone a Space is however still a mystery.

c++ protobuf: how to iterate through fields of message?

I'm new to protobuf and I'm stuck on a simple task: I need to iterate through the fields of a message and check each field's type. If the type is a message, I will do the same recursively for that message.
For example, I have such messages:
// Message definitions for the MyTool package.
package MyTool;
// Top-level configuration: one mandatory nested settings message plus three
// optional scalar options.
message Configuration {
required GloablSettings globalSettings = 1;
optional string option1 = 2;
optional int32 option2 = 3;
optional bool option3 = 4;
}
// Three mandatory boolean switches.
// NOTE(review): "Gloabl" looks like a misspelling of "Global"; the name is
// spelled the same way where it is referenced above, so the schema is consistent.
message GloablSettings {
required bool option1 = 1;
required bool option2 = 2;
required bool option3 = 3;
}
Now, to explicitly access a field value in C++ I can do this:
// Parse a serialized Configuration from the file "config" and read two of the
// nested options explicitly.
MyTool::Configuration config;
fstream input("config", ios::in | ios::binary);
config.ParseFromIstream(&input);
// protoc's C++ generator lower-cases field names when it builds accessor
// names, so the field `globalSettings` is read through globalsettings().
bool option1val = config.globalsettings().option1();
bool option2val = config.globalsettings().option2();
and so on. This approach is not convenient when there is a large number of fields.
Can I do this with iteration and get field's name and type? I know there are descriptors of type and somewhat called reflection, but I didn't have success in my attempts.
Can some one give me example of code if it's possible?
Thanks!
This is old but maybe someone will benefit. Here is a method that prints the contents of a protobuf message:
// Writes the message's full name, followed by the name and type name of each
// of its fields, to stderr.
void Example::printMessageContents(std::shared_ptr<google::protobuf::Message> m)
{
    const Descriptor *descriptor = m->GetDescriptor();
    const Reflection *refl = m->GetReflection();  // needed to read field values

    fprintf(stderr, "The fullname of the message is %s \n", descriptor->full_name().c_str());

    const int total = descriptor->field_count();
    int i = 0;
    while (i < total)
    {
        const FieldDescriptor *field = descriptor->field(i);
        fprintf(stderr, "The name of the %i th element is %s and the type is %s \n",i,field->name().c_str(),field->type_name());
        ++i;
    }
}
You can find in FieldDescriptor Enum Values the possible values you get from field->type. For example for the message type you would have to check if type is equal to FieldDescriptor::TYPE_MESSAGE.
This function prints all the "metadata" of the protobuf message. However you need to check separately for each value what the type is and then call the corresponding getter function using Reflection.
So using this condition we could extract the strings :
if(field->type() == FieldDescriptor::TYPE_STRING && !field->is_repeated())
{
std::string g= refl->GetString(*m, field);
fprintf(stderr, "The value is %s ",g.c_str());
}
However fields can be either repeated or not repeated and different methods are used for both field types. So a check is used here to assure that we are using the right method. For repeated fields we have for example this method for strings :
GetRepeatedString(const Message & message, const FieldDescriptor * field, int index)
So it takes the index of the repeated field into consideration.
In the case of FieldDescriptor of type Message, the function provided will only print the name of the message, we better print its contents too.
// Singular message-typed field: print the nested message's contents too.
if(field->type()==FieldDescriptor::TYPE_MESSAGE)
{
    if(!field->is_repeated())
    {
        const Message &mfield = refl->GetMessage(*m, field);
        // printMessageContents wants a shared_ptr, so hand it an owned copy.
        // The shared_ptr lives on the stack and releases the copy when it goes
        // out of scope; the name is distinct so the outer `m` is not shadowed.
        std::shared_ptr<Message> nested(mfield.New());
        nested->CopyFrom(mfield);
        printMessageContents(nested);
    }
}
And finally if the field is repeated you will have to call the FieldSize method on the reflection and iterate all repeated fields.
Take a look at how the Protobuf library implements the TextFormat::Printer class, which uses descriptors and reflection to iterate over fields and convert them to text:
https://github.com/google/protobuf/blob/master/src/google/protobuf/text_format.cc#L1473

Is there an elegant way to cascade-merge two JSON trees using jsoncpp?

I am using jsoncpp to read settings from a JSON file.
I would like to have two cascading settings file, say MasterSettings.json and LocalSettings.json where LocalSettings is a subset of MasterSettings. I would like to load MasterSettings first and then LocalSettings. Where LocalSettings has a value that differs from MasterSettings, that value would overwrite the one from MasterSettings. Much like the cascade in CSS.
Is there any elegant way to do this with jsoncpp?
I'm going to assume your settings files are JSON objects.
As seen here, when JSONCpp parses a file, it clears the contents of the root node. This means that trying to parse a new file on top of the old one won't preserve the old data. However, if you parse both files into separate Json::Value nodes, it's straightforward to recursively copy the values yourself by iterating over the keys in the second object using getMemberNames.
// Recursively copy the values of b into a. Both a and b must be objects.
// Recursively copy the values of b into a. Both a and b must be objects;
// otherwise nothing happens.
// For each key of b: if the value is an object on BOTH sides, merge the two
// objects; in every other case b's value replaces a's. Checking both sides
// matters: when only a's value is an object, recursing would return at once
// and b's override would be lost.
void update(Json::Value& a, Json::Value& b) {
    if (!a.isObject() || !b.isObject()) return;
    for (const auto& key : b.getMemberNames()) {
        if (a[key].isObject() && b[key].isObject()) {
            update(a[key], b[key]);
        } else {
            a[key] = b[key];
        }
    }
}
I know it has been a while. but...
In addition to the correct answer and the commentary, here is a version of the code for those who use an older g++ version:
// Merges object b into object a without range-for or auto.
// Keys whose values are objects on both sides are merged recursively; for all
// other keys b's value replaces a's. Does nothing unless both are objects.
void jsonMerge(Json::Value &a, Json::Value &b) {
    if (!(a.isObject() && b.isObject())) return;

    const vector<string> names = b.getMemberNames();
    for (vector<string>::const_iterator it = names.begin(); it != names.end(); ++it) {
        const string &name = *it;
        Json::Value &target = a[name];  // inserts a null member when absent
        Json::Value &source = b[name];

        const bool bothObjects = target.type() == Json::objectValue
                              && source.type() == Json::objectValue;
        if (bothObjects) {
            jsonMerge(target, source);
        } else {
            target = source;
        }
    }
}

How to wrap UTF-8 encoded C++ std::strings with Swig in C#?

My question is nearly identical to this question, except that the linked question deals with char*, whereas I'm using std::string in my code. Like the linked question, I'm also using C# as my target language.
I have a class written in C++:
// C++ class exposed through SWIG; both accessors exchange UTF-8 encoded
// std::string values.
class MyClass
{
public:
const std::string get_value() const; // returns utf8-string
void set_value(const std::string &value); // sets utf8-string
private:
// ...
};
And this gets wrapped by SWIG in C# as follows:
// Outline of the C# proxy class generated for the C++ MyClass (method bodies
// omitted); both members use the managed string type.
public class MyClass
{
public string get_value();
public void set_value(string value);
}
SWIG does everything for me, except that it doesn't make an utf8 to utf16 string conversion during the calls to MyClass. My strings come through fine if they are representable in ASCII, but if I try passing a string with non-ascii characters in a round-trip through "set_value" and "get_value", I end up with unintelligible characters.
How can I make SWIG wrap UTF-8 encoded C++ strings in C#? n.b. I'm using std::string, not std::wstring, and not char*.
There's a partial solution on the SWIG sourceforge site, but it deals with char* not std::string, and it uses a (configurable) fixed length buffer.
With the help (read: genius!) of David Jeske in the linked Code Project article, I have finally been able to answer this question.
You'll need this class (from David Jeske's code) in your C# library.
/// <summary>
/// Custom marshaler that converts between managed strings and
/// null-terminated UTF-8 byte buffers.
/// </summary>
public class UTF8Marshaler : ICustomMarshaler {
    static UTF8Marshaler static_instance;

    /// <summary>
    /// Encodes a managed string as UTF-8 into a freshly allocated,
    /// null-terminated unmanaged buffer. Returns IntPtr.Zero for null.
    /// </summary>
    /// <exception cref="MarshalDirectiveException">The object is not a string.</exception>
    public IntPtr MarshalManagedToNative(object managedObj) {
        if (managedObj == null)
            return IntPtr.Zero;
        if (!(managedObj is string))
            throw new MarshalDirectiveException(
                "UTF8Marshaler must be used on a string.");

        // not null terminated
        byte[] strbuf = Encoding.UTF8.GetBytes((string)managedObj);
        IntPtr buffer = Marshal.AllocHGlobal(strbuf.Length + 1);
        Marshal.Copy(strbuf, 0, buffer, strbuf.Length);

        // write the terminating null
        Marshal.WriteByte(buffer, strbuf.Length, 0);
        return buffer;
    }

    /// <summary>
    /// Decodes a null-terminated UTF-8 buffer into a managed string.
    /// Returns null when the native pointer is null instead of dereferencing it.
    /// </summary>
    public unsafe object MarshalNativeToManaged(IntPtr pNativeData) {
        if (pNativeData == IntPtr.Zero)
            return null;

        byte* walk = (byte*)pNativeData;

        // find the end of the string
        while (*walk != 0) {
            walk++;
        }
        int length = (int)(walk - (byte*)pNativeData);

        // should not be null terminated
        byte[] strbuf = new byte[length];
        // skip the trailing null
        Marshal.Copy((IntPtr)pNativeData, strbuf, 0, length);
        string data = Encoding.UTF8.GetString(strbuf);
        return data;
    }

    /// <summary>Frees a buffer with Marshal.FreeHGlobal.</summary>
    public void CleanUpNativeData(IntPtr pNativeData) {
        Marshal.FreeHGlobal(pNativeData);
    }

    /// <summary>Nothing to release on the managed side.</summary>
    public void CleanUpManagedData(object managedObj) {
    }

    /// <summary>Always reports -1.</summary>
    public int GetNativeDataSize() {
        return -1;
    }

    /// <summary>
    /// Returns the shared marshaler instance, creating it on first use.
    /// The cookie is ignored.
    /// </summary>
    public static ICustomMarshaler GetInstance(string cookie) {
        if (static_instance == null) {
            return static_instance = new UTF8Marshaler();
        }
        return static_instance;
    }
}
Then, in Swig's "std_string.i", on line 24 replace this line:
%typemap(imtype) string "string"
with this line:
%typemap(imtype, inattributes="[MarshalAs(UnmanagedType.CustomMarshaler, MarshalTypeRef = typeof(UTF8Marshaler))]", outattributes="[return: MarshalAs(UnmanagedType.CustomMarshaler, MarshalTypeRef = typeof(UTF8Marshaler))]") string "string"
and on line 61, replace this line:
%typemap(imtype) const string & "string"
with this line:
%typemap(imtype, inattributes="[MarshalAs(UnmanagedType.CustomMarshaler, MarshalTypeRef = typeof(UTF8Marshaler))]", outattributes="[return: MarshalAs(UnmanagedType.CustomMarshaler, MarshalTypeRef = typeof(UTF8Marshaler))]") string & "string"
Lo and behold, everything works. Read the linked article for a good understanding of how this works.