trim np arrays according to a list of starting points - list

I have a table, represented by an np.array like the following:
A = [[12,412,42,54],
     [144,2,42,4],
     [2,43,22,10]]
And a list that contains the desired starting point of each row in A:
L = [0,2,1]
The desired output would be:
B = [[12,412,42,54],
     [42,4,np.nan,np.nan],
     [43,22,10,np.nan]]
I prefer to avoid using a for-loop for obvious reasons.

Try comparing L with the column indices, then use boolean indexing to set/get the items:
# Boolean (advanced) indexing needs a real ndarray rather than a nested list.
A = np.array(A)
ll = A.shape[1]
# keep[i, j] is True for every column j at or after the start of row i.
row_starts = np.array(L).reshape(-1, 1)
keep = row_starts <= np.arange(ll)
# Begin with an all-NaN table, then write the kept values left-aligned:
# reversing the mask along the columns moves each row's True run to the front.
out = np.full(A.shape, np.nan)
left_aligned = keep[:, ::-1]
out[left_aligned] = A[keep]
[[ 12. 412. 42. 54.]
[ 42. 4. nan nan]
[ 43. 22. 10. nan]]

My guess would be that a vectorized approach for this would be less efficient than explicit looping, because the result is fundamentally a jagged array, which NumPy does not support well.
However, a loop-based solution is simple, and it can be made faster with Numba's nb.njit() if needed:
import numpy as np
import numba as nb
def jag_nb(arr, starts, empty=np.nan):
    """Left-align every row of ``arr`` from its own starting column.

    Row ``i`` of the result holds ``arr[i, starts[i]:]`` at the front and is
    padded on the right with ``empty``.

    Parameters:
        arr: 2-D array.
        starts: one non-negative starting column per row of ``arr``.
        empty: fill value for the positions left vacant by the shift.

    Returns:
        A new array with the shape of ``arr``.
    """
    result = np.full(arr.shape, empty)
    for i, x in enumerate(starts):
        if x != 0:
            result[i, :-x] = arr[i, x:]
        else:
            # ``:-0`` is an empty slice, so an unshifted row must be copied
            # explicitly; without this branch it would stay all ``empty``.
            result[i, :] = arr[i, :]
    return result
# Example input: three rows, each trimmed from its own starting column.
A = np.array([[12, 412, 42, 54], [144, 2, 42, 4], [2, 43, 22, 10]])
L = np.array([0, 2, 1])
# The function defined for this example is named jag_nb (there is no `jag`).
jag_nb(A, L)
# array([[ 12., 412., 42., 54.],
# [ 42., 4., nan, nan],
# [ 43., 22., 10., nan]])
Compared to the pure NumPy vectorized approach proposed in @QuangHoang's answer:
def jag_np(arr, starts, empty=np.nan):
    """Vectorised left-alignment of every row of ``arr`` from its own start.

    Parameters:
        arr: 2-D array.
        starts: one non-negative starting column per row of ``arr``.
        empty: fill value for the positions left vacant by the shift.

    Returns:
        A new array with the shape of ``arr`` where row ``i`` begins with
        ``arr[i, starts[i]:]`` and is padded with ``empty``.
    """
    # The mask runs over columns, so its width must be the column count.
    # Using the row count only works for square input.
    _, n_cols = arr.shape
    starts = np.asarray(starts)
    keep = np.arange(n_cols) >= starts[:, None]
    result = np.full(arr.shape, empty)
    # Each row has as many True cells in the reversed mask as in the original,
    # so the row-major order of both selections lines up.
    result[keep[:, ::-1]] = arr[keep]
    return result
The Numba based approach is noticeably faster, as shown with the following benchmarks:
import pandas as pd
import matplotlib.pyplot as plt
def benchmark(
    funcs,
    ii=range(4, 10, 1),
    is_equal=lambda x, y: np.allclose(x, y, equal_nan=True),
    unit="ms",
    verbose=True,
):
    """Time every function in ``funcs`` on random square inputs.

    For each ``i`` in ``ii`` a ``2**i x 2**i`` random array and a random
    vector of start columns are generated; every function is called as
    ``func(arr, starts)`` and compared with the result of ``funcs[0]``.

    Parameters:
        funcs: sequence of callables; the first one is the reference.
        ii: exponents of the input sizes to test.
        is_equal: predicate deciding whether a result matches the reference.
        unit: time unit used for the printed report ("s", "ms", "µs", "ns").
        verbose: print one report line per function and size.

    Returns:
        ``(timings, labels)``: ``timings`` maps the input size to a list with
        the best per-call time in seconds of each function (``None`` when its
        result did not match), ``labels`` holds the function names.

    Raises:
        ValueError: if ``unit`` is not one of the supported units.
    """
    # Plain-Python replacement for the IPython-only ``%timeit`` magic.
    import timeit

    n_loops, n_repeats = 64, 8
    labels = [func.__name__ for func in funcs]
    units = {"s": 0, "ms": 3, "µs": 6, "ns": 9}
    if unit not in units:
        raise ValueError(f"unit must be one of {sorted(units)}, got {unit!r}")
    timings = {}
    for i in ii:
        m = n = 2 ** i
        if verbose:
            print(f"i={i}, n={n}")
        arr = np.random.random((m, n))
        starts = np.random.randint(0, n, m)
        base = funcs[0](arr, starts)
        timings[n] = []
        base_timing = None
        for func in funcs:
            res = func(arr, starts)
            is_good = is_equal(base, res)
            runs = timeit.repeat(
                lambda: func(arr, starts), number=n_loops, repeat=n_repeats)
            # Best per-call time, as reported by ``%timeit -o``'s ``.best``.
            timing = min(runs) / n_loops
            if base_timing is None:
                base_timing = timing
            timings[n].append(timing if is_good else None)
            if verbose:
                print(
                    f"{func.__name__:>24}"
                    f" {is_good!s:5}"
                    f" {timing * (10 ** units[unit]):10.3f} {unit}"
                    f" {base_timing / timing:5.1f}x")
    return timings, labels
def plot(timings, labels, title=None, xlabel="Input Size / #", unit="ms"):
    """Plot benchmark timings in three side-by-side panels.

    The panels show the absolute timings scaled to ``unit``, the timings
    relative to the first function in percent, and the speed gain over the
    first function.

    Parameters:
        timings: mapping from input size to one timing per function, in the
            order of ``labels``.
        labels: one name per timed function; ``labels[0]`` is the baseline.
        title: optional figure title.
        xlabel: label of the x axis of every panel.
        unit: key of the scale table ("s", "ms", "µs", "ns") used for the
            first panel.
    """
    n_rows = 1
    n_cols = 3
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(8 * n_cols, 6 * n_rows), squeeze=False)
    units = {"s": 0, "ms": 3, "µs": 6, "ns": 9}
    # One row per input size, one column per function.
    df = pd.DataFrame(data=timings, index=labels).transpose()
    # Baseline kept as a column vector so it broadcasts across all columns.
    base = df[[labels[0]]].to_numpy()
    (df * 10 ** units[unit]).plot(marker="o", xlabel=xlabel, ylabel=f"Best timing / {unit}", ax=axs[0, 0])
    (df / base * 100).plot(marker='o', xlabel=xlabel, ylabel='Relative speed / %', logx=True, ax=axs[0, 1])
    (base / df).plot(marker='o', xlabel=xlabel, ylabel='Speed Gain / x', ax=axs[0, 2])
    if title:
        # NOTE(review): the body of this branch was missing; setting the
        # figure title is the reconstructed intent.
        fig.suptitle(title)
# The first entry is the baseline the other functions are compared against.
funcs = jag_np, jag_nb
# Square inputs of side 2**4 .. 2**10.
timings, labels = benchmark(funcs, ii=range(4, 11))
plot(timings, labels, unit="ms")


Is there any all_coeffs() for multivariable polynomials in sympy?

I want to extract all coefficients (INCLUDING ZEROS) of a multivariable polynomial using sympy.
Sympy offers all_coeffs() but it only works for univariate. Otherwise I get this error PolynomialError: multivariate polynomials not supported
For example, for the polynomial x^3+y^3+x*y+1 I would like the output to be [3,3,0,0,0,0,1,0,0,1]
If you make the monomials of interest then you can see what their coefficients are in your expression. You have to watch out for requesting a monomial coefficient of x*y in an expression with terms like x*y*z, however. The following routine handles that by zeroing out any variables in the coefficient that is obtained. It also has a routine to create monomials of interest.
def all_coeffs(expr, *free):
    """Return the coefficient of every monomial of ``expr``, zeros included.

    The monomials are all products of powers of the symbols in ``free`` (by
    default every free symbol of ``expr``), each symbol ranging from power 0
    up to the highest power it has in ``expr``.

    Parameters:
        expr: a polynomial expression.
        *free: the symbols to treat as polynomial variables.

    Returns:
        A dict mapping each monomial to its coefficient.

    Example:
        >>> eq = x**2 + x*y - 3
        >>> all_coeffs(eq)
        {1: -3, x**2: 1, x**2*y: 0, x*y: 1, y: 0, x: 0}
        >>> all_coeffs(eq, x)
        {1: -3, x**2: 1, x: y}
    """
    x = IndexedBase('x')
    expr = expr.expand()
    free = list(free) or list(expr.free_symbols)
    # Highest power with which each requested symbol occurs.
    P = {}
    for p, e in (atom.as_base_exp() for atom in expr.atoms(Pow, Symbol)):
        if p not in free:
            continue
        if p not in P or e > P[p]:
            P[p] = e
    # Work on indexed placeholders x[0], x[1], ... and map back at the end.
    reps = {f: x[i] for i, f in enumerate(free)}
    xzero = {v: 0 for v in reps.values()}
    e = expr.xreplace(reps)
    back = {v: k for k, v in reps.items()}
    result = {}
    # A requested symbol that does not occur in expr has maximal power 0.
    for m in monoms(*[P.get(f, 0) for f in free]):
        # coeff(x*y) also picks up terms such as x*y*z, so any variable left
        # in the coefficient is zeroed out; the constant term is what remains
        # of the whole expression once every variable is zero.
        coeff = e.xreplace(xzero) if m == 1 else e.coeff(m).xreplace(xzero)
        result[m.xreplace(back)] = coeff
    return result
def monoms(*o):
    """Return the monomials in x[0], x[1], ... up to the given powers.

    ``o[i]`` is the highest power of ``x[i]``; the result is the tuple of
    terms of the expanded product of ``1 + x[i] + ... + x[i]**o[i]``.
    """
    x = IndexedBase('x')
    # NOTE(review): the loop body was missing; it is reconstructed as one
    # all-ones polynomial of degree o[i] per variable. The loop variable no
    # longer shadows the parameter ``o``.
    f = [Poly([1] * (order + 1), x[i]).as_expr() for i, order in enumerate(o)]
    return Mul(*f).expand().args
>>> eq = x**2 + x*y - 3
>>> all_coeffs(eq)
{1: -3, x**2: 1, x**2*y: 0, x*y: 1, y: 0, x: 0}
>>> all_coeffs(eq, x)
{1: -3, x**2: 1, x: y}

Python - Error using linprog ("Invalid input for linprog: provide a 3 x 2 array for bounds, not a 2 x 3 array")

I am trying to use the linprog in python to solve this problem:
# Minimize = (0.035*x1) + (0.015*x2) + (0.025*x3)
# x1+x2+x3=1.2
# 0<=x1<=0.7
# 0<=x2<=0.3
# 0<=x3<=0.5
c = [0.035, 0.015, 0.025] #objective function
A_eq = [[1, 1, 1]]
b = [1.2]
lb = (0, 0, 0)
up = (0.7, 0.3, 0.5)
from scipy.optimize import linprog
# NOTE: [lb, up] is a 2 x 3 layout (all lower bounds, then all upper bounds);
# this call is the one that triggers the reported "provide a 3 x 2 array for
# bounds" error.
linprog(c, A_ub=None, b_ub=None, A_eq=A_eq, b_eq=b, bounds=[lb,up], method='interior-point', callback=None, options=None, x0=None)
However I am getting an error could you help me with that?
thanks a lot!
You should define the bounds correctly: one (min, max) pair for each variable, in the same order as the coefficients. In this case, they're between zero and some number:
# Minimize = (0.035*x1) + (0.015*x2) + (0.025*x3)
# x1+x2+x3=1.2
# 0<=x1<=0.7
# 0<=x2<=0.3
# 0<=x3<=0.5
from scipy.optimize import linprog

c = [0.035, 0.015, 0.025]  # objective function
A_eq = [[1, 1, 1]]
b = [1.2]
# One (lower, upper) pair per variable, in the same order as the entries of c.
x1_b = (0, 0.7)
x2_b = (0, 0.3)
x3_b = (0, 0.5)
# 'highs' replaces the legacy 'interior-point' solver, which newer SciPy
# releases no longer provide. The result is kept so it can be inspected.
res = linprog(c, A_eq=A_eq, b_eq=b, bounds=[x1_b, x2_b, x3_b], method='highs')
print(res)

How to set parameters for lightgbm when using customized objective function for multi-class classification?

I want to test a customized objective function for lightgbm in multi-class classification.
I have specified the parameter "num_class=3".
However, an error: "
Number of classes must be 1 for non-multiclass training" is thrown
I am using python 3.6 and lightgbm version 0.2
# iris data
from sklearn import datasets
import lightgbm as lgb
import numpy as np
iris = datasets.load_iris()
X, y = iris['data'], iris['target']
# Random two-thirds / one-third split: shuffle the row indices once and cut
# the shuffled order at num_train.
n_rows = X.shape[0]
num_train = int(n_rows / 3 * 2)
idx = np.random.permutation(n_rows)
train_rows, test_rows = idx[:num_train], idx[num_train:]
x_train, y_train = X[train_rows], y[train_rows]
x_test, y_test = X[test_rows], y[test_rows]
# softmax function
def softmax(x):
    '''
    Row-wise softmax.

    input x: an np.array of n_sample * n_class
    return : an np.array of n_sample * n_class (probabilities)
    '''
    x = np.asarray(x, dtype=float)
    # Subtracting the row maximum leaves the softmax unchanged but keeps
    # exp() from overflowing. Clipping at a fixed value instead would distort
    # the probabilities, and rows of very negative scores would give 0/0.
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)
# objective function
def objective(y_true, y_pred):
    '''
    Softmax cross-entropy gradient and hessian (version from the question).

    y_true: np.array of size (n_sample,)
    y_pred: np.array of size (n_sample, n_class)
    return : [gradient, hessian], each of size (n_sample, n_class)
    '''
    y_pred = softmax(y_pred)
    # One-hot encoding of the true class of every sample.
    temp = np.zeros_like(y_pred)
    temp[range(y_pred.shape[0]), y_true] = 1
    gradient = y_pred - temp
    hessian = y_pred * (1 - y_pred)
    return [gradient, hessian]
# lightgbm model
model = lgb.LGBMClassifier(n_estimators=10000,
                           num_classes = 3,
                           objective = objective,
                           nthread=4)
# NOTE(review): the opening of this call was missing; the training data
# x_train is reconstructed as the first argument of fit().
model.fit(x_train, y_train,
          eval_metric = 'multi_logloss',
          eval_set = [(x_test, y_test), (x_train, y_train)],
          eval_names = ['valid', 'train'],
          early_stopping_rounds = 200, verbose = 100)
Let me answer my own question.
The arguments in the objective function should be:
y_true of size [n_sample, ]
y_pred of size [n_sample * n_class, ] instead of [n_sample, n_class]
To be more specific, y_pred should be like
y_pred = [first_class, first_class,..., second_class, second_class,..., third_class, third_class,...]
Moreover, gradient and hessian should be grouped in the same way.
def objective(y_true, y_pred):
    '''
    Softmax cross-entropy gradient and hessian for a flat prediction vector.

    y_true: np.array of size [n_sample,]
    y_pred: np.array of size [n_sample * n_class, ]
    gradient and hessian should have exactly the same form of y_pred
    '''
    labels = np.asarray(y_true).astype(int)
    # Take the sizes from the arguments instead of the globals num_train and 3,
    # so the function also works for data of any other size.
    n_sample = labels.shape[0]
    # y_pred is grouped by class (all first-class scores, then all
    # second-class scores, ...), hence the column-major reshape.
    scores = np.reshape(y_pred, (n_sample, -1), order = 'F')
    probs = softmax(scores)
    one_hot = np.zeros_like(probs)
    one_hot[np.arange(n_sample), labels] = 1
    gradient = probs - one_hot
    hessian = probs * (1 - probs)
    # Flatten back into the same class-grouped layout as y_pred.
    return [gradient.ravel(order = 'F'), hessian.ravel(order = 'F')]

Calculate two dimensional pairwise distance on a large numpy three dimensional array

I have a numpy array of 3 million points in the form of [pt_id, x, y, z]. The goal is to return all pairs of points that have a Euclidean distance between two numbers min_d and max_d.
The Euclidean distance is between x and y and not on the z. However, I'd like to preserve the array with pt_id_from, pt_id_to, distance attributes.
I'm using scipy's dist to calculate the distances:
import scipy.spatial.distance
# Each row is [pt_id, x, y, z].
coords_arr = np.array([['pt1', 2452130.000, 7278106.000, 25.000],
['pt2', 2479539.000, 7287455.000, 4.900],
['pt3', 2479626.000, 7287458.000, 10.000],
['pt4', 2484097.000, 7292784.000, 8.800],
['pt5', 2484106.000, 7293079.000, 7.300],
['pt6', 2484095.000, 7292891.000, 11.100]])
# Only columns 1 and 2 (x and y) enter the distance; z is left out.
dists = scipy.spatial.distance.pdist(coords_arr[:,1:3], 'euclidean')
# Expand the condensed distances to a square matrix and write it as CSV.
np.savetxt('test.out', scipy.spatial.distance.squareform(dists), delimiter=',')
What should I do to return an array of form: [pt_id_from, pt_id_to, distance]?
You simply create a new array from the data by looping through all the possible combinations. The itertools module is excellent for this.
import itertools

n = coords_arr.shape[0] # number of points
# pdist stores the condensed distances in the order (0,1), (0,2), ...,
# (n-2,n-1), which is exactly the order itertools.combinations yields. Pairing
# the two directly avoids building the n x n square matrix.
data = [
    [coords_arr[i, 0], coords_arr[j, 0], d_ab]
    for (i, j), d_ab in zip(itertools.combinations(range(n), 2), dists)
]
result_arr = np.array(data)
If memory is a problem, you might want to change the distance lookup from using the huge matrix D to looking up the value directly in dists using the i and j index.
Well, ['pt1', 'pt2', distance_as_number] is not exactly possible. The closest you can get with mixed datatypes is a structured array but then you can't do things like result[:2,0]. You'll have to index field names and array indices separately like: result[['a','b']][0].
Here is my solution:
import numpy as np
import scipy.spatial.distance

coords_arr = np.array([['pt1', 2452130.000, 7278106.000, 25.000],
                       ['pt2', 2479539.000, 7287455.000, 4.900],
                       ['pt3', 2479626.000, 7287458.000, 10.000],
                       ['pt4', 2484097.000, 7292784.000, 8.800],
                       ['pt5', 2484106.000, 7293079.000, 7.300],
                       ['pt6', 2484095.000, 7292891.000, 11.100]])
# The string ids make NumPy store every cell as text, so the numeric columns
# (x, y, z) are converted to floats once and reused below.
xyz = coords_arr[:, 1:4].astype('<f8')
# The distance is computed on x and y only.
dists = scipy.spatial.distance.pdist(xyz[:, :2], 'euclidean')
# Create a shortcut for `coords_arr.shape[0]` which is basically
# the total amount of points, hence `n`
n = coords_arr.shape[0]
# `a` and `b` contain the indices of the points which were used to compute the
# distances in dists. In this example:
# a = [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4]
# b = [1, 2, 3, 4, 5, 2, 3, 4, 5, 3, 4, 5, 4, 5, 5]
# The strict upper triangle in row-major order is the same pair order.
a, b = np.triu_indices(n, k=1)
min_d = 1000
max_d = 10000
# Find out which distances are in range.
in_range = (min_d <= dists) & (dists <= max_d)
# Define the datatype of the structured array which will be the result.
dtype = [('a', '<f8', (3,)), ('b', '<f8', (3,)), ('dist', '<f8')]
# Create an empty array. We fill it later because it makes the code cleaner.
# Its size is given by the sum over `in_range` which is possible
# since True and False are equivalent to 1 and 0.
result = np.empty(np.sum(in_range), dtype=dtype)
# Fill the resulting array.
result['a'] = xyz[a[in_range]]
result['b'] = xyz[b[in_range]]
result['dist'] = dists[in_range]
# In case you don't want a structured array at all, this is what you can do:
# one row per pair holding the "from" point, the "to" point and the distance.
result = np.hstack([xyz[a[in_range]],
                    xyz[b[in_range]],
                    dists[in_range, None]]).astype('<f8')
The structured array:
[([2479539.0, 7287455.0, 4.9], [2484097.0, 7292784.0, 8.8], 7012.389393067102)
([2479539.0, 7287455.0, 4.9], [2484106.0, 7293079.0, 7.3], 7244.7819152821985)
([2479539.0, 7287455.0, 4.9], [2484095.0, 7292891.0, 11.1], 7092.75912462844)
([2479626.0, 7287458.0, 10.0], [2484097.0, 7292784.0, 8.8], 6953.856268287403)
([2479626.0, 7287458.0, 10.0], [2484106.0, 7293079.0, 7.3], 7187.909362255481)
([2479626.0, 7287458.0, 10.0], [2484095.0, 7292891.0, 11.1], 7034.873843929257)]
The ndarray:
[[2479539.0, 7287455.0, 4.9, 2484097.0, 7292784.0, 8.8, 7012.3893],
[2479539.0, 7287455.0, 4.9, 2484106.0, 7293079.0, 7.3, 7244.7819],
[2479539.0, 7287455.0, 4.9, 2484095.0, 7292891.0, 11.1, 7092.7591],
[2479626.0, 7287458.0, 10.0, 2484097.0, 7292784.0, 8.8, 6953.8562],
[2479626.0, 7287458.0, 10.0, 2484106.0, 7293079.0, 7.3, 7187.9093],
[2479626.0, 7287458.0, 10.0, 2484095.0, 7292891.0, 11.1, 7034.8738]]
You can use np.where to get the coordinates of the distances within a range, then generate a new list in your format, filtering out the duplicate pairs. Like this:
>>> import scipy.spatial.distance
>>> import numpy as np
>>> coords_arr = np.array([['pt1', 2452130.000, 7278106.000, 25.000],
... ['pt2', 2479539.000, 7287455.000, 4.900],
... ['pt3', 2479626.000, 7287458.000, 10.000],
... ['pt4', 2484097.000, 7292784.000, 8.800],
... ['pt5', 2484106.000, 7293079.000, 7.300],
... ['pt6', 2484095.000, 7292891.000, 11.100]])
>>> dists = scipy.spatial.distance.pdist(coords_arr[:,1:3], 'euclidean')
>>> dists = scipy.spatial.distance.squareform(dists)
>>> x, y = np.where((dists >= 8000) & (dists <= 30000))
>>> [(coords_arr[x[i]][0], coords_arr[y[i]][0], dists[y[i]][x[i]]) for i in xrange(len(x)) if x[i] < y[i]]
[('pt1', 'pt2', 28959.576688895162), ('pt1', 'pt3', 29042.897927032005)]

defined function not found

I have a script as follows:
import numpy as np
import pandas as pd
import pdb
# conventions: W = fitness, A = affinity ; sex: 1=M, 0=F; alien: 1=alien,
# 0=native
# pop array order: W, A, sex, alien
def mkpop(n):
    """Create an initial native population of ``n`` individuals.

    Returns an ``(n, 4)`` array whose columns are fitness W (always 1),
    affinity A (normal around 1 with spread 0.1, negatives set to 0),
    sex (random 0 or 1) and alien flag (always 0).
    """
    fitness = np.ones(n, dtype=int)
    affinity = np.random.normal(1, 0.1, size=n)
    affinity[affinity < 0] = 0
    alien_flag = np.zeros(n, dtype=bool)
    sexes = np.random.randint(0, 2, n)
    # Stack the four attribute vectors as rows, then flip to one row per
    # individual.
    return np.array([fitness, affinity, sexes, alien_flag]).T
def migrate(pop, n=10, gParams=(1, 0.1)):
    """Append ``n`` alien immigrants to ``pop``.

    Parameters:
        pop: population array with columns W, A, sex, alien.
        n: number of immigrants to add.
        gParams: ``(shape, scale)`` of the gamma distribution the immigrants'
            fitness W is drawn from; any indexable pair is accepted.

    Returns:
        A new array with the immigrant rows stacked below ``pop``; the
        immigrants have affinity 1, a random sex and alien flag 1.
    """
    shape, scale = gParams[0], gParams[1]
    W = np.random.gamma(shape=shape, scale=scale, size=n)
    A = np.repeat(1, n)
    # 0 is native; 1 is alien
    alien = np.repeat(True, n)
    # 0 is female
    sex = np.random.randint(0, 2, n)
    popAlien = np.transpose(np.array([W, A, sex, alien]))
    return np.vstack((pop, popAlien))
def mate(pop):
    """Pair every female with a randomly chosen male and create offspring.

    Parameters:
        pop: population array with columns W (fitness), A (affinity),
            sex (0 = female, 1 = male) and alien (0 = native, 1 = alien).

    Returns:
        An array with one row per offspring and the same four columns: W and
        A are the values assigned to the parent pair, sex is random and alien
        is 0.
    """
    # split into male and female
    f = pop[pop[:, 2] == 0]
    m = pop[pop[:, 2] == 1]
    # Weight matrices (rows: males, columns: females) for native and alien
    # males, each weighting a female by her affinity.
    naPdMat = np.outer(m[:, 3] == 0, f[:, 1])
    alPdMat = np.outer(m[:, 3], f[:, 1])
    # add transition matrices for probability density matrix
    pdMat = alPdMat + naPdMat
    # transition matrix is equal to the pd matrix / column sums
    pMat = pdMat / np.sum(pdMat, axis=0)

    # select mates: one male index per female, drawn from her column
    def choice(x):
        return np.random.choice(a=range(0, len(x)), p=x)

    mCh = m[np.apply_along_axis(choice, 0, pMat), :]
    WMid = (f[:, 0] + mCh[:, 0]) / 2
    AMid = (f[:, 1] + mCh[:, 1]) / 2
    # Pairs from the same group, and alien females with native males, get
    # fitness 1; the remaining pairs keep the midparent fitness.
    fixed = (f[:, 3] == mCh[:, 3]) | ((f[:, 3] == 1) & (mCh[:, 3] == 0))
    WMid[fixed] = 1
    # number of offspring is a poisson-distributed variable with lambda=2W
    nOff = np.random.poisson(lam=2 * WMid)
    # Repeat each pair's (W, A) once per offspring; this also yields a
    # well-formed empty (0, 2) array when no offspring are produced.
    midVals = np.array([WMid, AMid]).T
    offspring = np.repeat(midVals, nOff, axis=0)
    sex = np.random.randint(0, 2, len(offspring))
    alien = np.repeat(0, len(offspring))
    otherStats = np.array([sex, alien]).T
    return np.hstack([offspring, otherStats])
def sim(nInit, nGen=100, nAlien=10, gParams=(1, 0.1)):
    """Run the migration/mating loop and collect per-generation statistics.

    Parameters:
        nInit: size of the initial native population.
        nGen: number of generations to simulate.
        nAlien: number of immigrants added each generation.
        gParams: gamma parameters forwarded to ``migrate``.

    Returns:
        A DataFrame with one row per generation and the columns gen, W
        (offspring count relative to nInit), WMean, AMean, WVar and AVar.
    """
    # The population has to be created by calling mkpop, not by binding it.
    pop = mkpop(nInit)
    rows = []
    for gen in range(nGen):
        # NOTE(review): the offspring never replace `pop`, so the population
        # only grows through migration. Confirm whether `pop = offspring`
        # was intended.
        pop = migrate(pop, nAlien, gParams)
        offspring = mate(pop)
        var = np.var(offspring, axis=0)
        mean = np.mean(offspring, axis=0)
        N = len(offspring)
        W = float(N) / nInit
        # Columns 0 and 1 of the offspring array are W and A.
        rows.append((gen, W, mean[0], mean[1], var[0], var[1]))
        print(N, gen)
    return pd.DataFrame(
        rows, columns=['gen', 'W', 'WMean', 'AMean', 'WVar', 'AVar'])
# Smoke-test the building blocks, then run the full simulation.
print(mkpop(100))
print(mate(mkpop(100)))
sim(100, 100, 10, [1, 0.1])
Running this script outputs NameError: name 'sim' is not defined. It is apparent from the commands before the final one that all the other functions defined within this script work without a hitch. I'm not sure what is going on here, and there is probably some very easy fix that I'm overlooking. Ctags recognizes this function just fine. It's entirely possible that sim() doesn't actually work yet, as I haven't been able to debug it.
Your sim function is defined inside the scope of the mate function, so it is invisible in the global scope. You need to fix the indentation of the sim function.