I am using pybind11 to convert scipy's csr_matrix into a C++ object. To this end, I've defined a Vector class, which is constructed from the numpy data, indices, and indptr fields.
template<typename T>
class Vector {
public:
T *data;
const std::array<ssize_t, 1> shape;
Vector() = delete;
Vector(T *data, const std::array<ssize_t, 1> shape) : data(data), shape(shape) {};
};
I later register several instantiations of the class using a template function that does this:
py::class_<Vector<T>>(m, "...Vector", py::buffer_protocol())
.def("__init__", [](Vector<T> &v, py::array_t<T, py::array::c_style | py::array::forcecast> data) {
py::buffer_info info = data.request();
if (info.ndim != 1) throw std::invalid_argument("must be a 1d array!");
std::array<ssize_t, 1> shape_ = {info.shape[0]};
new(&v) Vector<T>(static_cast<T *>(info.ptr), shape_);
})
.def_buffer([](Vector<T> &v) -> py::buffer_info {
return py::buffer_info(
v.data, sizeof(T), py::format_descriptor<T>::format(), 1, v.shape, {sizeof(T)}
);
});
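For reference, the registration helper looks roughly like this (a simplified sketch; the exact helper name and prefixes are approximate, with the module name matrix and the Float32 prefix matching the test below):

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <cstdint>
#include <stdexcept>
#include <string>

namespace py = pybind11;

// Registers Vector<T> under the name "<prefix>Vector"; csr_matrix<T> is registered analogously.
template <typename T>
void declare_vector(py::module &m, const std::string &prefix) {
    py::class_<Vector<T>>(m, (prefix + "Vector").c_str(), py::buffer_protocol())
        .def("__init__", [](Vector<T> &v, py::array_t<T, py::array::c_style | py::array::forcecast> data) {
            py::buffer_info info = data.request();
            if (info.ndim != 1) throw std::invalid_argument("must be a 1d array!");
            new (&v) Vector<T>(static_cast<T *>(info.ptr), {info.shape[0]});
        })
        .def_buffer([](Vector<T> &v) -> py::buffer_info {
            return py::buffer_info(v.data, sizeof(T), py::format_descriptor<T>::format(), 1, v.shape, {sizeof(T)});
        });
}

PYBIND11_MODULE(matrix, m) {
    declare_vector<float>(m, "Float32");
    declare_vector<double>(m, "Float64");
    declare_vector<std::int32_t>(m, "Int32");
}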
So the Vector class works fine. However, I then define a csr_matrix class like so:
template<typename T>
class csr_matrix {
public:
Vector<T> data;
Vector<ssize_t> indices;
Vector<ssize_t> indptr;
const std::array<ssize_t, 2> shape;
csr_matrix() = delete;
csr_matrix(Vector<T>& data, Vector<ssize_t>& indices, Vector<ssize_t>& indptr, const std::array<ssize_t, 2>& shape)
: data(data), indices(indices), indptr(indptr), shape(shape) {}
};
which is then registered in the same way as Vector using templates, so I can register csr_matrices for floats, doubles, and ints.
py::class_<csr_matrix<T>>(m, "...csr_matrix")
.def("__init__", [](
csr_matrix<T> &matrix,
py::array_t<T, py::array::c_style> data,
py::array_t<ssize_t, py::array::c_style> indices,
py::array_t<ssize_t, py::array::c_style> indptr,
py::array_t<ssize_t, py::array::c_style | py::array::forcecast> shape
) {
py::buffer_info data_info = data.request();
py::buffer_info indices_info = indices.request();
py::buffer_info indptr_info = indptr.request();
// ... some validity checks
auto vec_data = new Vector<T>(static_cast<T *>(data_info.ptr), {data_info.shape[0]});
auto vec_indices = new Vector<ssize_t>(static_cast<ssize_t *>(indices_info.ptr), {indices_info.shape[0]});
auto vec_indptr = new Vector<ssize_t>(static_cast<ssize_t *>(indptr_info.ptr), {indptr_info.shape[0]});
std::array<ssize_t, 2> shape_ = {*shape.data(0), *shape.data(1)};
new(&matrix) csr_matrix<T>(*vec_data, *vec_indices, *vec_indptr, shape_);
})
.def_readonly("data", &csr_matrix<T>::data)
.def_readonly("indices", &csr_matrix<T>::indices)
.def_readonly("indptr", &csr_matrix<T>::indptr);
Now, I write a simple unit test in Python to make sure everything is working properly, and I get the most baffling error:
x = sp.csr_matrix([
[0, 0, 1, 2],
[0, 0, 0, 0],
[3, 0, 4, 0],
], dtype=np.float32)
cx = matrix.Float32CSRMatrix(x.data, x.indices, x.indptr, x.shape)
np.testing.assert_equal(x.data, np.asarray(cx.data, dtype=np.float32))
np.testing.assert_equal(x.indices, np.asarray(cx.indices, dtype=np.uint64))
np.testing.assert_equal(x.indptr, np.asarray(cx.indptr, dtype=np.uint64)) # fails here!
I throw a print after each line and this is the output:
print(1, np.asarray(cx.data, dtype=np.float32), np.asarray(cx.indices, dtype=np.uint64), np.asarray(cx.indptr, dtype=np.uint64))
# data indices indptr
- [1. 2. 3. 4.] [2 3 0 2] [0 2 2 4] # original, python object
0 [1. 2. 3. 4.] [2 3 0 2] [0 2 2 4] # after matrix.Float32CSRMatrix(...)
1 [1. 2. 3. 4.] [2 3 0 2] [0 2 2 4] # after assert on data
2 [1. 2. 3. 4.] [2 3 0 2] [2 3 0 2] # after assert on indices
# Fails at assert on indptr!
So somewhere, something changed the values of indptr into those of indices, and I have no clue what or where. What's even more baffling is that if I change the order of the asserts so that indptr is checked before indices, like this:
np.testing.assert_equal(x.data, np.asarray(cx.data, dtype=np.float32))
np.testing.assert_equal(x.indptr, np.asarray(cx.indptr, dtype=np.uint64))
np.testing.assert_equal(x.indices, np.asarray(cx.indices, dtype=np.uint64)) # fails here!
then this is the output
- [1. 2. 3. 4.] [2 3 0 2] [0 2 2 4]
0 [1. 2. 3. 4.] [2 3 0 2] [0 2 2 4]
1 [1. 2. 3. 4.] [2 3 0 2] [0 2 2 4]
2 [1. 2. 3. 4.] [0 2 2 4] [0 2 2 4]
# Now fails at assert indices; we do the assert on indptr before, and it passes
So now it's indices that are being overwritten with indptr, and not the other way around. I've been banging my head against this wall for well over a day now and I have no clue what's going on. Object lifetime is not an issue: the vectors are constructed at the start and destroyed when the csr_matrix goes away. I have pasted all the relevant code here, and I am not doing anything that should cause this.
Any and all help would be greatly appreciated.
Say I have
A = [1 2 3]
    [4 5 6]
    [7 8 9]
I want to pad it with the first row and first column, or the last row and last column, as many times as needed to create an n x n matrix A. For example, a 4x4 A would be
A = [1 1 2 3]
    [1 1 2 3]
    [4 4 5 6]
    [7 7 8 9]
and A 5x5 would be
A = [1 1 2 3 3]
    [1 1 2 3 3]
    [4 4 5 6 6]
    [7 7 8 9 9]
    [7 7 8 9 9]
I'm aware that I could do A.conservativeResize(4,4) which gets me
A = [1 2 3 0]
    [4 5 6 0]
    [7 8 9 0]
    [0 0 0 0]
then I could copy things around one by one, but is there a more efficient way to do this using Eigen?
You can work around this using a nullary expression:
#include <iostream>
#include <Eigen/Dense>
using namespace Eigen;
using namespace std;
int main()
{
Matrix3i A;
A.reshaped() = VectorXi::LinSpaced(9,1,9);
cout << A << "\n\n";
int N = 5;
MatrixXi B(N,N);
B = MatrixXi::NullaryExpr(N, N, [&A,N] (Index i,Index j) {
return A( std::max<Index>(0,i-(N-A.rows())),
std::max<Index>(0,j-(N-A.cols())) ); } );
cout << B << "\n\n";
}
Another approach would be to create a clamped sequence of indices like [0 0 0 1 2]:
struct pad {
Index size() const { return m_out_size; }
Index operator[] (Index i) const { return std::max<Index>(0,i-(m_out_size-m_in_size)); }
Index m_in_size, m_out_size;
};
B = A(pad{3,N}, pad{3,N});
This version requires the development (head) version of Eigen.
You can easily build on those examples to make them even more general and/or wrap them within functions.
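For instance, a minimal sketch of wrapping the nullary-expression version in a reusable helper (the name pad_clamped and its exact signature are my own, not something provided by Eigen):

#include <Eigen/Dense>
#include <algorithm>

// Pads A to an N x N matrix by clamping row/column indices at the low end,
// i.e. repeating the first row and first column, exactly as the NullaryExpr example above.
template <typename Derived>
Eigen::Matrix<typename Derived::Scalar, Eigen::Dynamic, Eigen::Dynamic>
pad_clamped(const Eigen::MatrixBase<Derived> &A, Eigen::Index N)
{
    using Mat = Eigen::Matrix<typename Derived::Scalar, Eigen::Dynamic, Eigen::Dynamic>;
    return Mat::NullaryExpr(N, N, [&A, N](Eigen::Index i, Eigen::Index j) {
        return A(std::max<Eigen::Index>(0, i - (N - A.rows())),
                 std::max<Eigen::Index>(0, j - (N - A.cols())));
    });
}

// Usage: MatrixXi B = pad_clamped(A, 5);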
Just as a note, it's not true that A.conservativeResize(4,4) will get you a matrix with the added rows filled with zeros. The Eigen documentation says,
In case values need to be appended to the matrix they will be uninitialized.
The new rows and columns will be filled with garbage, and seeing zeros is only a coincidence (unless you compile with a special Eigen preprocessor directive such as EIGEN_INITIALIZE_MATRICES_BY_ZERO). On the upside, this means no unnecessary time is wasted writing zeros that you will overwrite anyway.
Note: this code demonstrates how to get a matrix with your original matrix in the top left corner:
The best way to fill multiple values at once is to use Eigen's block operations and setConstant. For example, if A is a matrix of size old_size x old_size:
A.conservativeResize(n, n);
for (int i = 0; i < n; ++i) {
// Fill the end of each row and column
A.row(i).tail(n - old_size).setConstant(A(i, old_size - 1));
A.col(i).tail(n - old_size).setConstant(A(old_size - 1, i));
}
// Fill the bottom right block
A.bottomRightCorner(n - old_size, n - old_size).setConstant(A(old_size - 1, old_size - 1));
More important than being "efficient", these functions express your intent as a programmer.
Edit: To get a padded matrix with your original matrix in the middle:
I just noticed that your example pads around the original matrix, leaving it in the middle rather than in the top left. In this case there is little point in using conservativeResize(), because the original values would only be copied to the top left corner. An outline of the solution is:
Construct a new n x n matrix B of the desired size
Copy your original matrix to the middle using
int start = (n - old_size + 1)/2;
B.block(start, start, old_size, old_size) = A;
Fill in the outside values using block operations similar to my example above (a sketch putting these steps together follows).
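A minimal sketch putting those steps together (the replicate() calls and the concrete sizes are my own; with n = 5 this reproduces the 5x5 example from the question):

#include <Eigen/Dense>
#include <iostream>
using namespace Eigen;

int main()
{
    Matrix3i A;
    A << 1, 2, 3, 4, 5, 6, 7, 8, 9;
    const Index n = 5, old_size = A.rows();
    const Index start = (n - old_size + 1) / 2;   // top-left offset of the original block

    MatrixXi B(n, n);
    B.block(start, start, old_size, old_size) = A;
    // Replicate the first/last row of the original block into the padding above/below it
    B.topRows(start) = B.row(start).replicate(start, 1);
    B.bottomRows(n - start - old_size) = B.row(start + old_size - 1).replicate(n - start - old_size, 1);
    // Replicate the first/last column (now full height) into the padding to the left/right
    B.leftCols(start) = B.col(start).replicate(1, start);
    B.rightCols(n - start - old_size) = B.col(start + old_size - 1).replicate(1, n - start - old_size);

    std::cout << B << "\n";
}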
I am working on a problem that requires iterating over all combinations of elements of K vectors taken one at a time. So for example for K=2 vectors v1 = [0 1] and v2 = [3 4], I would iterate over (0,3), (0,4), (1,3), (1,4).
Since K is determined at run-time, I cannot use explicit for loops. My current approach is based on this solution that implements an "odometer" incrementing an index for each vector.
#include <vector>
#include <iostream>
int main(int argc, char * argv[])
{
std::vector<int> v1( {1, 2, 3} );
std::vector<int> v2( {-2, 5} );
std::vector<int> v3( {0, 1, 2} );
std::vector<std::vector<int> > vv( {v1, v2 ,v3} );
// Iterate combinations of elems in v1, v2, v3, one at a time
std::vector<std::vector<int>::iterator> vit;
for (auto& v : vv)
vit.push_back(v.begin());
int K = vv.size();
while (vit[0] != vv[0].end())
{
std::cout << "Processing combination: [";
for (auto& i : vit)
std::cout << *i << " ";
std::cout << "]\n";
// increment "odometer" by 1
++vit[K-1];
for (int i = K-1; (i > 0) && (vit[i] == vv[i].end()); --i)
{
vit[i] = vv[i].begin();
++vit[i-1];
}
}
return 0;
}
Output:
Processing combination: [1 -2 0 ]
Processing combination: [1 -2 1 ]
Processing combination: [1 -2 2 ]
Processing combination: [1 5 0 ]
Processing combination: [1 5 1 ]
Processing combination: [1 5 2 ]
Processing combination: [2 -2 0 ]
Processing combination: [2 -2 1 ]
Processing combination: [2 -2 2 ]
Processing combination: [2 5 0 ]
Processing combination: [2 5 1 ]
Processing combination: [2 5 2 ]
Processing combination: [3 -2 0 ]
Processing combination: [3 -2 1 ]
Processing combination: [3 -2 2 ]
Processing combination: [3 5 0 ]
Processing combination: [3 5 1 ]
Processing combination: [3 5 2 ]
However, this is somewhat messy and requires a lot of boilerplate code that I'd rather move elsewhere for clarity. Ideally, I would like to have a custom iterator class, say my_combination_iterator, that would let me do things much more cleanly, e.g.:
for (my_combination_iterator it = vv.begin(); it != vv.end(); ++it)
// process combination
So far, I have looked at Boost's iterator_facade, but my case seems more complicated than the one in the tutorial: to define the required operators for the custom iterator, I would need an iterator over a vector of values, as opposed to a single value type.
How could such an iterator be implemented?
Why would you like to use custom iterators?
One could instead implement a very simple class that will iterate through all combinations:
class Combinator
{
public:
Combinator(std::vector<std::vector<int> >& vectors)
: m_vectors(vectors)
{
m_combination.reserve(m_vectors.size());
for(auto& v : m_vectors)
m_combination.push_back(v.begin());
}
bool next()
{
// iterate through vectors in reverse order
for(long long i = m_vectors.size() - 1; i >= 0; --i)
{
std::vector<int>& v = m_vectors[i];
std::vector<int>::iterator& it = m_combination[i];
if(++it != v.end())
return true;
it = v.begin();
}
return false;
}
std::vector<std::vector<int>::iterator> combination() const
{
return m_combination;
}
private:
std::vector<std::vector<int> >& m_vectors; // reference to data
std::vector<std::vector<int>::iterator> m_combination;
};
Live Demo
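For reference, a minimal driver for the class above (the do/while loop and the sample vectors are mine, mirroring the question's data):

#include <iostream>
#include <vector>

int main()
{
    std::vector<std::vector<int>> vv{ {1, 2, 3}, {-2, 5}, {0, 1, 2} };
    Combinator combinator(vv);
    do {
        std::cout << "Processing combination: [";
        for (auto it : combinator.combination())
            std::cout << *it << " ";
        std::cout << "]\n";
    } while (combinator.next());   // next() returns false once every combination has been visited
}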
UPDATE:
If you would still like to use iterators, I suggest iterating over combinations: one can put all the combinations from Combinator into a container and then work with the container's own iterators. In my opinion it's a cleaner solution; the only drawback is the extra memory needed to store all combinations explicitly.
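A rough sketch of that idea, assuming the Combinator class above (the helper name all_combinations is mine):

#include <utility>
#include <vector>

// Materialize every combination as a vector of values, so the resulting container's
// own begin()/end() iterators can be used afterwards.
std::vector<std::vector<int>> all_combinations(std::vector<std::vector<int>> &vectors)
{
    std::vector<std::vector<int>> result;
    Combinator c(vectors);
    do {
        std::vector<int> current;
        for (auto it : c.combination())
            current.push_back(*it);   // copy the values the iterators point to
        result.push_back(std::move(current));
    } while (c.next());
    return result;
}

// for (const auto &combination : all_combinations(vv)) { /* process each combination */ }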
I have a 3 dimensional numpy array, (z, x, y). z is a time dimension and x and y are coordinates.
I want to convert this to a multiindexed pandas.DataFrame. I want the row index to be the z dimension
and each column to have values from a unique x, y coordinate (and so, each column would be multi-indexed).
The simplest case (not multi-indexed):
>>> array.shape
(500L, 120L, 100L)
>>> df = pd.DataFrame(array[:,0,0])
>>> df.shape
(500, 1)
I've been trying to pass the whole array into a multiindex dataframe using pd.MultiIndex.from_arrays but I'm getting an error:
NotImplementedError: > 1 ndim Categorical are not supported at this time
It looks like it should be fairly simple, but I can't figure it out.
I find that a Series with a MultiIndex is the most analogous pandas data type to a numpy array with arbitrarily many dimensions (presumably 3 or more).
Here is some example code:
import pandas as pd
import numpy as np
time_vals = np.linspace(1, 50, 50)
x_vals = np.linspace(-5, 6, 12)
y_vals = np.linspace(-4, 5, 10)
measurements = np.random.rand(50,12,10)
#setup multiindex
mi = pd.MultiIndex.from_product([time_vals, x_vals, y_vals], names=['time', 'x', 'y'])
#connect multiindex to data and save as multiindexed Series
sr_multi = pd.Series(index=mi, data=measurements.flatten())
#pull out a dataframe of x, y at time=22
sr_multi.xs(22, level='time').unstack(level=0)
#pull out a dataframe of y, time at x=3
sr_multi.xs(3, level='x').unstack(level=1)
I think you can use Panel, and then for a MultiIndex DataFrame add to_frame:
np.random.seed(10)
arr = np.random.randint(10, size=(5,3,2))
print (arr)
[[[9 4]
  [0 1]
  [9 0]]

 [[1 8]
  [9 0]
  [8 6]]

 [[4 3]
  [0 4]
  [6 8]]

 [[1 8]
  [4 1]
  [3 6]]

 [[5 3]
  [9 6]
  [9 1]]]
df = pd.Panel(arr).to_frame()
print (df)
             0  1  2  3  4
major minor
0     0      9  1  4  1  5
      1      4  8  3  8  3
1     0      0  9  0  4  9
      1      1  0  4  1  6
2     0      9  8  6  3  9
      1      0  6  8  6  1
Also transpose can be useful:
df = pd.Panel(arr).transpose(1,2,0).to_frame()
print (df)
             0  1  2
major minor
0     0      9  0  9
      1      1  9  8
      2      4  0  6
      3      1  4  3
      4      5  9  9
1     0      4  1  0
      1      8  0  6
      2      3  4  8
      3      8  1  6
      4      3  6  1
Another possible solution with concat:
arr = arr.transpose(1,2,0)
df = pd.concat([pd.DataFrame(x) for x in arr], keys=np.arange(arr.shape[0]))
print (df)
     0  1  2  3  4
0 0  9  1  4  1  5
  1  4  8  3  8  3
1 0  0  9  0  4  9
  1  1  0  4  1  6
2 0  9  8  6  3  9
  1  0  6  8  6  1
np.random.seed(10)
arr = np.random.randint(10, size=(500,120,100))
df = pd.Panel(arr).transpose(2,0,1).to_frame()
print (df.shape)
(60000, 100)
print (df.index.max())
(499, 119)
I have got a problem with a function in SML. This function should return the index in the list of the number at which the sum is reached (i.e. the last number taken into the sum).
A call of the function: index(10, [1,2,3,4,5,6,7])
The result should be 3.
(10 is the target sum; we seek the index in the list at which the running sum reaches 10, e.g.
1+2+3=6, 1+2+3+4=10, and return the previous one.)
fun index (sum : int, numbers : int list) =
if null numbers
then 0
else if hd(numbers) > sum
then 0
else 1 + index(sum, (hd(numbers)+(hd(tl numbers)))::(tl numbers))
It seems to work, but the result is wrong.
The function keeps incrementing the result when it should not, so the result comes out 2 higher than expected.
Can anybody tell me how to fix this?
You're almost there. While I agree with @koodawg that adding a counter and a running total is another solution to this problem, having those in your code will complicate it more than it needs to be.
First, I have a few comments about your code. You should remove the unnecessary parens: hd(numbers) is the same as hd numbers, and (hd(tl numbers)) is equal to hd(tl numbers). So your (hd(numbers)+(hd(tl numbers))) could be simplified to (hd numbers + hd(tl numbers)). Also, you can combine if null numbers and if hd(numbers) > sum into a single condition for brevity, since they yield the same result: 0.
I'll try to explain how the code works, and I hope you'll get an idea of where you have to amend it.
Using your example, index(10, [1,2,3,4,5,6,7]), your code execution will be like this:
1)
fun index(10, [1,2,3,4,5,6,7]) =
if 1 > 10
then 0
else 1 + (10, [1 + 2] append to [2,3,4,5,6,7])
new list: [3,2,3,4,5,6,7]
result: 1
2)
fun index(10, [3,2,3,4,5,6,7]) =
if 3 > 10
then 0
else 1 + (10, [3 + 2] append to [2,3,4,5,6,7])
new list: [5,2,3,4,5,6,7]
result: 1
3)
fun index(10, [5,2,3,4,5,6,7]) =
if 5 > 10
then 0
else 1 + (10, [5 + 2] append to [2,3,4,5,6,7])
new list: [7,2,3,4,5,6,7]
result: 1
4)
fun index(10, [7,2,3,4,5,6,7]) =
if 7 > 10
then 0
else 1 + (10, [7 + 2] append to [2,3,4,5,6,7])
new list: [9,2,3,4,5,6,7]
result: 1
5)
fun index(10, [9,2,3,4,5,6,7]) =
if 9 > 10
then 0
else 1 + (10, [9 + 2] append to [2,3,4,5,6,7])
new list: [11,2,3,4,5,6,7]
result: 1
6)
fun index(10, [11,2,3,4,5,6,7]) =
if 11 > 10
then 0
result: 0
Summing all the results: 1 + 1 + 1 + 1 + 1 + 0 = 5 (consistent with what you said about your function adding 2 to the expected result).
The correct code must behave like this:
1)
fun index(10, [1,2,3,4,5,6,7]) =
if 1 > 10
then 0
else 1 + (10, [1 + 2] append to [3,4,5,6,7])
new list: [3,3,4,5,6,7]
result: 1
2)
fun index(10, [3,3,4,5,6,7]) =
if 3 > 10
then 0
else 1 + (10, [3 + 3] append to [4,5,6,7])
new list: [6,4,5,6,7]
result: 1
3)
fun index(10, [6,4,5,6,7]) =
if 6 > 10
then 0
else 1 + (10, [6 + 4] append to [5,6,7])
new list: [10,5,6,7]
result: 1
4)
fun index(10, [10,5,6,7]) =
if 10 > 10
then 0
result: 0
Summing all the results: 1 + 1 + 1 + 0 = 3, which is the expected answer. (Note that for the recursion to stop at step 4, where the running value equals the target, the comparison needs to be >= rather than the strict > used in your code.)
HINT: Always make sure that the new list your function processes is smaller than the previous/original list.
I hope I explained clearly why your code isn't working. I didn't include the code because I know this is homework for an online class.
You need to keep a counter and a running total: a counter that increments with every recursive call, and a total that accumulates hd numbers as you go. Return the counter once the total reaches sum.
Something like this:
if (total + hd numbers) >= sum
then counter
else recursivecall(total + hd numbers, tl numbers, counter + 1)