I have a table, represented by an np.array like the following:
A = [[12,412,42,54],
[144,2,42,4],
[2,43,22,10]]
And a list that contains the desired starting point of each row in A:
L=[0,2,1]
The desired output would be:
B = [[12,412,42,54],
[42,4,np.nan,np.nan],
[43,22,10,np.nan]]
Edit
I prefer to avoid using a for-loop for obvious reasons.
Try comparing `L` with the column indices, then use boolean-mask indexing to get and set the items:
# Boolean-mask indexing needs an ndarray rather than a nested list.
A = np.array(A)
n_cols = A.shape[1]
# mask[i, j] is True when column j lies at or after row i's starting point.
mask = np.arange(n_cols) >= np.array(L)[:, None]
# Start from an all-NaN table; cells that receive no value stay NaN.
out = np.full(A.shape, np.nan)
# The mirrored mask has the same number of True cells per row as the mask,
# but packed to the left, so each row's kept values land left-aligned.
out[mask[:, ::-1]] = A[mask]
print(out)
Output:
[[ 12. 412. 42. 54.]
[ 42. 4. nan nan]
[ 43. 22. 10. nan]]
My guess would be that a vectorized approach for this would be less efficient than explicit looping, because the result is fundamentally a jagged array, which NumPy does not support well.
However, a loop-based solution is simple and, if needed, can be made faster with Numba's `nb.njit()`:
import numpy as np
import numba as nb
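# @nb.njit  # NOTE(review): this looks like a decorator that was turned into a comment; re-enable it to JIT-compile jag_nb with Numba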
def jag_nb(arr, starts, empty=np.nan):
    """Left-align each row of ``arr`` from its own starting column.

    Args:
        arr: 2-D array whose rows are shifted.
        starts: One starting column per row of ``arr``.
        empty: Fill value for the cells left vacant at the end of a row.

    Returns:
        A new array with the shape of ``arr`` where row ``i`` begins with
        ``arr[i, starts[i]:]`` and is padded with ``empty``.
    """
    shifted = np.full(arr.shape, empty)
    for row, offset in enumerate(starts):
        if offset == 0:
            # ``:-0`` would select nothing, so an unshifted row is copied whole.
            shifted[row, :] = arr[row, :]
            continue
        shifted[row, :-offset] = arr[row, offset:]
    return shifted
# Example input from the question: one starting column per row.
A = np.array([[12, 412, 42, 54], [144, 2, 42, 4], [2, 43, 22, 10]])
L = np.array([0, 2, 1])
# The function defined in this snippet is jag_nb; there is no `jag`.
jag_nb(A, L)
# array([[ 12., 412., 42., 54.],
# [ 42., 4., nan, nan],
# [ 43., 22., 10., nan]])
Compared to the pure NumPy vectorized approach proposed in @QuangHoang's answer:
def jag_np(arr, starts, empty=np.nan):
    """Left-align each row of ``arr`` from its own starting column (vectorized).

    Args:
        arr: 2-D array-like whose rows are shifted.
        starts: One starting column per row of ``arr`` (any array-like).
        empty: Fill value for the cells left vacant at the end of a row.

    Returns:
        A new array with the shape of ``arr`` where row ``i`` begins with
        ``arr[i, starts[i]:]`` and is padded with ``empty``.
    """
    arr = np.asarray(arr)
    starts = np.asarray(starts)
    # The mask spans the columns, so it must be sized by the column count;
    # sizing it by the row count only works for square inputs.
    n_cols = arr.shape[1]
    keep = np.arange(n_cols) >= starts[:, None]
    result = np.full(arr.shape, empty)
    # The mirrored mask has the same number of True cells per row, packed to
    # the left, so the kept values end up left-aligned.
    result[keep[:, ::-1]] = arr[keep]
    return result
The Numba based approach is noticeably faster, as shown with the following benchmarks:
import pandas as pd
import matplotlib.pyplot as plt
def benchmark(
    funcs,
    ii=range(4, 10, 1),
    is_equal=lambda x, y: np.allclose(x, y, equal_nan=True),
    seed=0,
    unit="ms",
    verbose=True,
    use_str=True,
):
    """Time each function in ``funcs`` on random square inputs of growing size.

    For every ``i`` in ``ii`` a ``2**i`` x ``2**i`` random array and a random
    vector of row starts are generated, and every function is called as
    ``func(arr, starts)``. The result of ``funcs[0]`` is the reference that
    the other results are compared against with ``is_equal``.

    Args:
        funcs: Sequence of callables taking ``(arr, starts)``.
        ii: Iterable of exponents; the input side length is ``2**i``.
        is_equal: Predicate comparing the reference result with a result.
        seed: Seed passed to ``np.random.seed`` before generating inputs.
        unit: Time unit used for printing; one of "s", "ms", "µs", "ns".
        verbose: Whether to print one line per function and size.
        use_str: Unused; kept so existing callers keep working.

    Returns:
        ``(timings, labels)``: ``timings`` maps each side length to a list
        with one best per-call time in seconds per function (``None`` when
        the result did not match the reference), and ``labels`` holds the
        function names in the same order.
    """
    # Imported locally so the block does not depend on a file-level import.
    import timeit

    # Same loop and repeat counts as the former ``%timeit -n 64 -r 8``.
    n_loops, n_repeats = 64, 8
    labels = [func.__name__ for func in funcs]
    units = {"s": 0, "ms": 3, "µs": 6, "ns": 9}
    assert unit in units
    np.random.seed(seed)
    timings = {}
    for i in ii:
        m = n = 2 ** i
        if verbose:
            print(f"i={i}, n={n}")
        arr = np.random.random((m, n))
        starts = np.random.randint(0, n, m)
        base = funcs[0](arr, starts)
        timings[n] = []
        # Raw timing of the first function, used for the printed speed ratio
        # even if that entry is stored as None.
        base_timing = None
        for func in funcs:
            res = func(arr, starts)
            is_good = is_equal(base, res)
            # timeit.repeat replaces the IPython-only ``%timeit`` magic; the
            # best run divided by the loop count is the best per-call time.
            runs = timeit.repeat(
                lambda: func(arr, starts), number=n_loops, repeat=n_repeats
            )
            timing = min(runs) / n_loops
            if base_timing is None:
                base_timing = timing
            timings[n].append(timing if is_good else None)
            if verbose:
                print(
                    f"{func.__name__:>24}"
                    f" {is_good!s:5}"
                    f" {timing * (10 ** units[unit]):10.3f} {unit}"
                    f" {base_timing / timing:5.1f}x")
    return timings, labels
def plot(timings, labels, title=None, xlabel="Input Size / #", unit="ms"):
    """Draw three side-by-side charts of benchmark timings.

    Args:
        timings: Mapping from input size to a list of timings, one per label.
        labels: Names of the benchmarked functions; the first is the baseline.
        title: Optional figure title.
        xlabel: Label for the x axis of every chart.
        unit: Time unit for the first chart; one of "s", "ms", "µs", "ns".

    The charts show, left to right: the timings scaled to ``unit``, each
    timing as a percentage of the baseline (log-scaled x axis), and the
    baseline divided by each timing.
    """
    rows, cols = 1, 3
    fig, axs = plt.subplots(rows, cols, figsize=(8 * cols, 6 * rows), squeeze=False)
    exponents = {"s": 0, "ms": 3, "µs": 6, "ns": 9}
    # One row per input size, one column per benchmarked function.
    frame = pd.DataFrame(data=timings, index=labels).transpose()
    # Kept as a 2-D column so it broadcasts across all function columns.
    baseline = frame[[labels[0]]].to_numpy()

    scaled = frame * 10 ** exponents[unit]
    scaled.plot(marker="o", xlabel=xlabel, ylabel=f"Best timing / {unit}", ax=axs[0, 0])

    relative = frame / baseline * 100
    relative.plot(marker='o', xlabel=xlabel, ylabel='Relative speed / %', logx=True, ax=axs[0, 1])

    gain = baseline / frame
    gain.plot(marker='o', xlabel=xlabel, ylabel='Speed Gain / x', ax=axs[0, 2])

    if title:
        fig.suptitle(title)
    fig.patch.set_facecolor('white')
# Compare the vectorized NumPy version with the loop-based version; benchmark()
# uses the first entry (jag_np) as the reference for results and speed ratios.
funcs = jag_np, jag_nb
# Side lengths 2**4 .. 2**10.
timings, labels = benchmark(funcs, ii=range(4, 11))
plot(timings, labels, unit="ms")
I want to get all integer solutions within a limited time; is that possible?
This is a linear, integer constraint satisfaction problem, which can be solved efficiently by OR Tools' CP-SAT. I've modified their example to solve your problem in Python:
from ortools.sat.python import cp_model
class VarArraySolutionPrinter(cp_model.CpSolverSolutionCallback):
    """Solution callback that prints every solution and counts them."""

    def __init__(self, variables):
        """Remember the variables whose values are printed per solution."""
        super().__init__()
        self.__watched = variables
        self.__found = 0

    def on_solution_callback(self):
        """Count the solution and print ``name=value`` for each variable."""
        self.__found += 1
        for var in self.__watched:
            print('%s=%i' % (var, self.Value(var)), end=' ')
        # Terminate the line after the last variable.
        print()

    def solution_count(self):
        """Return how many solutions have been reported so far."""
        return self.__found
def SearchForAllSolutionsSampleSat():
    """Enumerate all integer solutions of a small linear constraint problem.

    Finds every assignment of four integer variables in [0, 100] whose
    weighted sum with coefficients 1, 2, 3, 4 equals 30 and whose plain sum
    is at least 2. Each solution is printed, followed by the solver status
    and the number of solutions found.
    """
    # Problem data.
    coefficients = [1, 2, 3, 4]
    weighted_total = 30
    min_plain_sum = 2

    model = cp_model.CpModel()

    # One integer variable per coefficient.
    unknowns = [
        model.NewIntVar(0, 100, f'x{i}') for i in range(len(coefficients))
    ]

    # Equality on the weighted sum, lower bound on the plain sum.
    weighted_sum = sum(var * coef for var, coef in zip(unknowns, coefficients))
    model.Add(weighted_sum == weighted_total)
    model.Add(sum(unknowns) >= min_plain_sum)

    # Solve, reporting every solution through the printer callback.
    solver = cp_model.CpSolver()
    printer = VarArraySolutionPrinter(unknowns)
    status = solver.SearchForAllSolutions(model, printer)

    print('Status = %s' % solver.StatusName(status))
    print('Number of solutions found: %i' % printer.solution_count())
# Run the example: prints every solution, then the status and solution count.
SearchForAllSolutionsSampleSat()
# I have the recursive relationship of the Hermite Polynomials:
$$H_{n+1}(x) = 2x\,H_n(x) - 2n\,H_{n-1}(x), \quad n \ge 1,$$
$$H_0(x) = 1, \quad H_1(x) = 2x.$$
I need to write def hermite(x,n) for any hermite polynomial Hn(x) using python 2.7
and make a plot of H5(x) on the interval x∈[−1,1].
Recursion is trivial here since the formula gives it directly. There is just one small trap: you compute H_n(x), not H_{n+1}(x), so subtract 1 from every occurrence of n:
def hermite(x, n):
    """Return the Hermite polynomial H_n evaluated at x.

    Uses the recurrence H_n(x) = 2*x*H_{n-1}(x) - 2*(n-1)*H_{n-2}(x) with
    H_0(x) = 1 and H_1(x) = 2*x. The recurrence is evaluated iteratively, so
    the cost is linear in n and no recursion limit is hit for large n.

    Args:
        x: Point at which the polynomial is evaluated.
        n: Non-negative integer degree.

    Returns:
        The value of H_n(x).

    Raises:
        ValueError: If n is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative, got %r" % (n,))
    if n == 0:
        return 1
    # previous holds H_{k-1}(x) and current holds H_k(x), starting at k = 1.
    previous, current = 1, 2 * x
    for k in range(1, n):
        previous, current = current, 2 * x * current - 2 * k * previous
    return current
small test:
# Print H_0(1) through H_4(1), one value per line.
for degree in range(5):
    print(hermite(1, degree))
1
2
2
-4
-20
import math
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import hermite
def HERMITE(X,N):
    """Evaluate the degree-N polynomial built by ``scipy.special.hermite`` at X.

    Args:
        X: Scalar or array of evaluation points.
        N: Degree passed to ``hermite``.

    Returns:
        The polynomial values at X.
    """
    return hermite(N)(X)
# Sample the interval [-1, 1] and draw one dash-dotted curve per degree 0..6.
xvals = np.linspace(-1.0, 1.0, 1000)
for n in np.arange(7):
    sol = HERMITE(xvals, n)
    plt.plot(xvals, sol, "-.", label="n = " + str(n), linewidth=2)

# Axis styling, grid and legend, then display the figure.
tick_style = {"fontsize": 14, "fontweight": "bold"}
plt.xticks(**tick_style)
plt.yticks(**tick_style)
plt.grid()
plt.legend()
plt.show()
import math
import numpy as np
import matplotlib.pyplot as plt
def HER(x, n):
    """Return the Hermite polynomial H_n evaluated at x.

    Uses the recurrence H_n(x) = 2*x*H_{n-1}(x) - 2*(n-1)*H_{n-2}(x) with
    H_0(x) = 1 and H_1(x) = 2*x, evaluated iteratively so the cost is linear
    in n instead of growing exponentially with the double recursion.

    Args:
        x: Scalar or NumPy array of evaluation points.
        n: Non-negative integer degree.

    Returns:
        H_n(x) as floating-point value(s) with the shape of x.

    Raises:
        ValueError: If n is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n!r}")
    # Adding 0.0*x makes H_0 take the shape of x rather than being a bare scalar.
    previous = 1.0 + 0.0 * x
    if n == 0:
        return previous
    current = 2.0 * x
    # previous holds H_{k-1}(x) and current holds H_k(x), starting at k = 1.
    for k in range(1, n):
        previous, current = current, 2.0 * x * current - 2.0 * k * previous
    return current
# Sample the interval [-pi, pi] and draw one curve per degree 0..6.
xvals = np.linspace(-np.pi, np.pi, 1000)
for N in np.arange(7):
    sol = HER(xvals, N)
    plt.plot(xvals, sol, label="n = " + str(N))

# Axis styling, grid and legend, then display the figure.
tick_style = {"fontsize": 14, "fontweight": "bold"}
plt.xticks(**tick_style)
plt.yticks(**tick_style)
plt.grid()
plt.legend()
plt.show()
#Code is working perfectly fine
Feel free to ask any question...
I am running a snippet of code that queries a database and then fills in a pandas DataFrame with a value of 1 wherever a (userid, day) tuple is present in the query result. It does this by running the query and then iterating over the tuples to fill in the DataFrame. However, the query returns almost 8 million rows of data.
My question is if anyone knows how to speed up a process like this. Here is the code below:
# Ages indexed by userid. The builtin `int` replaces the `np.int` alias,
# which has been removed from NumPy.
user_age = pd.read_sql_query(sql_age, datastore, index_col=['userid']).age.astype(int, copy=False)
# One row per user, one column per day value 0..365, initially all zero.
x = pd.DataFrame(0, index=user_age.index, columns=range(366), dtype=np.int8)
# Read the activity query in chunks and flag every (userid, day) pair seen.
for r in pd.read_sql_query(sql_active, datastore, chunksize=50000):
    for userid, day in r.itertuples(index=False):
        x.at[userid, day] = 1
Thank you in advance!
You could save some time by replacing the Python loop
for userid, day in r.itertuples(index=False):
x.at[userid, day] = 1
with a NumPy array assignment using "advanced integer indexing":
x[npidx[r['userid']], r['day']] = 1
On an 80000-row DataFrame, using_numpy (below) is about 6x faster:
In [7]: %timeit orig()
1 loop, best of 3: 984 ms per loop
In [8]: %timeit using_numpy()
10 loops, best of 3: 162 ms per loop
import numpy as np
import pandas as pd
def mock_read_sql_query():
    """Yield mock query chunks as DataFrames with columns userid and day.

    The module-level ``index`` is split into ``N // M`` pieces; each piece
    becomes the ``userid`` column of one chunk, paired with random ``day``
    values in [0, 366). The global NumPy random state is re-seeded on every
    run so repeated runs produce the same days.
    """
    np.random.seed(2016)
    n_chunks = N // M
    for user_ids in np.array_split(index, n_chunks):
        days = np.random.randint(366, size=len(user_ids))
        # Fix the column order explicitly: userid first, then day.
        yield pd.DataFrame(
            {'userid': user_ids, 'day': days}, columns=['userid', 'day']
        )
# N mock user ids in total; mock_read_sql_query splits them into N // M chunks.
N = 8 * 10**4
M = 5 * 10**2
# The user ids 0..N-1 in random order.
index = np.arange(N)
np.random.shuffle(index)
# One column per possible day value (0..365).
columns = range(366)
def using_numpy():
    """Build the user-by-day flag table with NumPy integer-array indexing.

    Returns:
        A DataFrame indexed by the module-level ``index`` with one column per
        entry of ``columns``, holding 1 for every (userid, day) pair yielded
        by ``mock_read_sql_query`` and 0 elsewhere.
    """
    n_users = len(index)
    # Inverse lookup: row_of[userid] is the position of that userid in index.
    row_of = np.empty_like(index)
    row_of[index] = np.arange(n_users)

    flags = np.zeros((n_users, len(columns)), dtype=np.int8)
    for chunk in mock_read_sql_query():
        # Set all of the chunk's (row, day) cells in a single assignment.
        flags[row_of[chunk['userid']], chunk['day']] = 1
    return pd.DataFrame(flags, columns=columns, index=index)
def orig():
    """Build the user-by-day flag table one cell at a time (baseline).

    Returns:
        A DataFrame indexed by the module-level ``index`` with one column per
        entry of ``columns``, holding 1 for every (userid, day) pair yielded
        by ``mock_read_sql_query`` and 0 elsewhere.
    """
    table = pd.DataFrame(0, index=index, columns=columns, dtype=np.int8)
    for chunk in mock_read_sql_query():
        for row in chunk.itertuples(index=False):
            userid, day = row
            table.at[userid, day] = 1
    return table
# Check that both implementations flag exactly the same cells.
expected = orig()
result = using_numpy()
# Positions (row, column) of the non-zero cells in each table.
expected_index, expected_col = np.where(expected)
result_index, result_col = np.where(result)
assert np.equal(expected_index, result_index).all()
assert np.equal(expected_col, result_col).all()
I am writing some code using the OpenCV library in Python. In the process, I need to construct a matrix based on another matrix given. Now my code looks like the following:
# Fill w_mat cell by cell from I_mat using a three-piece rule.
for x in range(width):
    for y in range(height):
        value = I_mat[x][y]
        if 0 <= value <= c_low:
            # Lower band: scale by c_low.
            w_mat[x][y] = float(value) / c_low
        elif c_low < value < c_high:
            # Middle band: constant weight.
            w_mat[x][y] = 1
        else:
            # Everything else: distance to 255 scaled by (255 - c_high).
            w_mat[x][y] = float(255 - value) / float(255 - c_high)
where I_mat is the input matrix and w_mat is the matrix I am going to construct. Since the input matrix is quite large, this algorithm is quite slow. I wonder if there are any other methods to construct w_mat more efficiently. Thanks a lot!
(It is not necessary to show the solution in Python.)
Edit: you might want to use Numba.
import numpy as np
import timeit
from numba import void,jit
# Band limits used by the weighting rule.
c_low = .3
c_high = .6


def func(val):
    """Return the weight for a single value using the three-piece rule.

    Values in [0, c_low] map to ``val / c_low``, values strictly between
    c_low and c_high map to 1, and all other values map to
    ``(255 - val) / (255 - c_high)``.
    """
    if 0 <= val <= c_low:
        return float(val) / c_low
    if c_low < val < c_high:
        return 1.
    return (255. - val) / (255. - c_high)
def npvectorize():
    """Rebind the global w_mat to ``func`` applied element-wise to I_mat."""
    global w_mat
    w_mat = np.vectorize(func)(I_mat)
def orig():
    """Fill the global w_mat from I_mat with explicit nested loops.

    Reads the module-level I_mat, c_low and c_high and writes every cell of
    w_mat in place using the three-piece weighting rule.
    """
    n_rows = I_mat.shape[0]
    n_cols = I_mat.shape[1]
    for x in range(n_rows):
        for y in range(n_cols):
            value = I_mat[x][y]
            if value >= 0 and value <= c_low:
                w_mat[x][y] = float(value) / c_low
            elif value > c_low and value < c_high:
                w_mat[x][y] = 1
            else:
                w_mat[x][y] = float(255 - value) / float(255 - c_high)
# Random 1000 x 1000 test input. The builtin `float` replaces the `np.float`
# alias, which has been removed from NumPy.
I_mat = np.array(np.random.random((1000, 1000)), dtype=float)
# Output buffer with the same shape and dtype as the input.
w_mat = np.empty_like(I_mat)
# Wrap the nested-loop version with Numba's jit in nopython mode.
fast = jit(void(), nopython=True)(orig)
# Time one run of each variant. print() is called as a function so the
# snippet is valid on Python 3 and still works on Python 2.
print(timeit.Timer(fast).timeit(1))
print(timeit.Timer(npvectorize).timeit(1))
print(timeit.Timer(orig).timeit(1))
output:
0.0352660446331
0.472590475098
4.78634474265