numerical integration python - python-2.7

I need to reduce the running time of quad() in Python (I am evaluating several thousand integrals). I found a similar question here where it was suggested to do several integrations and add up the partial values, but that does not improve performance. Any thoughts? Here is a simple example:
import numpy as np
from scipy.integrate import quad
from scipy.stats import norm
import time
funcB = lambda x: norm.pdf(x,0,1)
start = time.time()
good_missclasified,_ = quad(funcB, 0,3.3333)
stop = time.time()
time_elapsed = stop - start
print ('quad : ' + str(time_elapsed))
start = time.time()
num = np.linspace(0,3.3333,10)
Lv = []
last, lastG = 0, 0
for g in num:
    Lval, x = quad(funcB, lastG, g)
    last, lastG = last + Lval, g
    Lv.append(last)
Lv = np.array(Lv)
stop = time.time()
time_elapsed = stop - start
print ('10 int : ' + str(time_elapsed))
print(good_missclasified,Lv[9])

quadpy (a project of mine) is vectorized and can integrate a function over many domains (e.g., intervals) at once. You do have to choose your own integration method though.
import numpy
import quadpy
a = 0.0
b = 1.0
n = 100
start_points = numpy.linspace(a, b, n, endpoint=False)
h = (b-a) / n
end_points = start_points + h
intervals = numpy.array([start_points, end_points])
scheme = quadpy.line_segment.gauss_kronrod(3)
vals = scheme.integrate(numpy.exp, intervals)
print(vals)
[0.10050167 0.10151173 0.10253194 0.1035624 0.10460322 0.1056545
0.10671635 0.10778886 0.10887216 0.10996634 0.11107152 0.11218781
0.11331532 0.11445416 0.11560444 0.11676628 0.1179398 0.11912512
0.12032235 0.12153161 0.12275302 0.12398671 0.12523279 0.1264914
0.12776266 0.1290467 0.13034364 0.13165362 0.13297676 0.1343132
0.13566307 0.1370265 0.13840364 0.13979462 0.14119958 0.14261866
0.144052 0.14549975 0.14696204 0.14843904 0.14993087 0.15143771
0.15295968 0.15449695 0.15604967 0.157618 0.15920208 0.16080209
0.16241818 0.16405051 0.16569924 0.16736455 0.16904659 0.17074554
0.17246156 0.17419482 0.17594551 0.17771379 0.17949985 0.18130385
0.18312598 0.18496643 0.18682537 0.188703 0.1905995 0.19251505
0.19444986 0.19640412 0.19837801 0.20037174 0.20238551 0.20441952
0.20647397 0.20854907 0.21064502 0.21276204 0.21490033 0.21706012
0.21924161 0.22144502 0.22367058 0.22591851 0.22818903 0.23048237
0.23279875 0.23513842 0.2375016 0.23988853 0.24229945 0.2447346
0.24719422 0.24967857 0.25218788 0.25472241 0.25728241 0.25986814
0.26247986 0.26511783 0.2677823 0.27047356]
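Applied back to the original example, the same vectorized call could cover many subintervals of the normal pdf in one go. A minimal sketch, assuming the same quadpy API as above (the number of subintervals is arbitrary here):
import numpy
import quadpy
from scipy.stats import norm
a, b, n = 0.0, 3.3333, 1000
start_points = numpy.linspace(a, b, n, endpoint=False)
h = (b - a) / n
intervals = numpy.array([start_points, start_points + h])
scheme = quadpy.line_segment.gauss_kronrod(3)
# one vectorized call handles all n subintervals of the standard normal pdf
vals = scheme.integrate(lambda x: norm.pdf(x, 0, 1), intervals)
print(vals.sum())  # should be close to quad(funcB, 0, 3.3333)[0]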

Related

Implementation of Karger's Algorithm in Python Taking too Long

Wondering if you can help me understand where the critical flaw may be in my attempt at implementing Karger's algorithm in Python. My program appears to take far too long to run, and my computer struggles when processing large sets of vertices. The purpose of the program is to output the minimum cut of the graph.
from random import choice
from statistics import mode
import math
fhand = open("mincuts.txt", "r")
vertices = fhand.readlines()
d = {}
for index, line in enumerate(vertices):
    d["{0}".format(index+1)] = line.split()
def randy(graph, x):
    y = str(choice(list(graph)))
    if x == y:
        y = randy(graph, x)
    return y
count = 0
def contract(graph):
    global count
    if len(graph) == 2:
        a = list(graph.keys())[0]
        b = list(graph.keys())[1]
        for i in range(1, len(graph[a])):
            if graph[a][i] in graph[b]:
                count = count + 1
        #print(graph)
        return
    x = str(choice(list(graph)))
    y = randy(graph, x)
    #print(x)
    #print(y)
    graph[x] = graph[x] + graph[y]
    graph.pop(y)
    #remove self loops
    for key in graph:
        #method to remove duplicate entries in the arrays of the vertices. Source: www.w3schools.com
        graph[key] = list(dict.fromkeys(graph[key]))
    contract(graph)
N = len(d)
runs = int(N*N*(math.log(N)))
outcomes = []
for i in range(runs):
    e = d.copy()
    count = 0
    contract(e)
    outcomes.append(count)
print(outcomes)
#returns most common minimum cut value
print(mode(outcomes))
Below is a link to the graph I am running in mincuts.txt:
https://github.com/BigSoundCode/Misc-Algorithm-Implementations/blob/main/mincuts.txt

trying to use DiscreteUniform as a numpy index

I am trying to use pymc3.DiscreteUniform as an index for a numpy 1D array.
This worked with pymc (v2), but I am transitioning to pymc3 and code that worked under pymc doesn't work under pymc3.
import pymc3 as pm
d0 = pm.DiscreteUniform('d0', lower=0, upper=nDens - 1, testval = nDens//2)
pred = np.zeros(len(box.match), np.float64)
for iwvl, amatch in enumerate(box.match):
    pred[iwvl] += amatch['intensitySum'][d0]
I get the following error message:
IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices
I have found something that works, but it involves going into theano and theano.tensor.
import numpy as np
import theano
import theano.tensor as tensor
import pymc3 as pm

with pm.Model() as model:
    em0 = pm.Normal('em0', mu=emLog, sigma=0.2)
    d0 = pm.DiscreteUniform('d0', lower=0, upper=nDens - 1, testval=Dindex)
    boundNormal = pm.Bound(pm.Normal, lower=0.0)
    wght = boundNormal('wght', mu=0.2, sigma=0.1)
    pred = np.zeros((nDens, len(box.match)), np.float64)
    for iwvl, amatch in enumerate(box.match):
        pred[0:, iwvl] += amatch['intensitySum']
    xpred = theano.shared(pred, name='p0')
    idx = tensor.as_tensor_variable(d0)
    predicted = xpred[idx]*10.**em0
    nObs = len(box.match)
    intensity = np.zeros(nObs, np.float64)
    for iwvl in range(nObs):
        intensity[iwvl] = box.match[iwvl]['obsIntensity']
    sigma = 0.2
    Y_obs = pm.Normal('Y_obs', mu=predicted, sigma=wght*intensity, observed=intensity)
    trace = pm.sample(tune=20000, draws=100000, target_accept=0.85)
and then you can work with the trace. It is even possible to make sigma a pm variable.
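Stripped down to just the indexing trick, here is a minimal sketch (the array shape, bounds and test value are made up for illustration; it assumes theano, or Theano-PyMC, is importable alongside pymc3):
import numpy as np
import theano
import theano.tensor as tensor
import pymc3 as pm

table = np.random.rand(5, 3)             # hypothetical (nDens, nObs) lookup table
xpred = theano.shared(table, name='p0')  # wrap the numpy array as a theano shared variable
with pm.Model() as model:
    d0 = pm.DiscreteUniform('d0', lower=0, upper=4, testval=2)
    idx = tensor.as_tensor_variable(d0)  # symbolic index instead of a plain integer
    predicted = xpred[idx]               # the row selection now happens inside the graph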

Showing key error

I am trying to use a recommendation engine to predict the top-selling product, but it is showing a KeyError. I am doing it with Python 2 in an Anaconda Jupyter notebook. How can I overcome this error?
import pandas as pd
import numpy as np
import operator
SMOOTHING_WINDOW_FUNCTION = np.hamming
SMOOTHING_WINDOW_SIZE = 7
def train():
    df = pd.read_csv('C:\\Users\SHIVAPRASAD\Desktop\sample-cart-add-data (1).csv')
    df.sort_values(by=['id', 'age'], inplace=True)
    trends = pd.pivot_table(df, values='count', index=['id', 'age'])
    trend_snap = {}
    for i in np.unique(df['id']):
        trend = np.array(trends[i])
        smoothed = smooth(trend, SMOOTHING_WINDOW_SIZE, SMOOTHING_WINDOW_FUNCTION)
        nsmoothed = standardize(smoothed)
        slopes = nsmoothed[1:] - nsmoothed[:-1]
        # I blend in the previous slope as well, to stabilize things a bit
        # give a boost to things that have been trending for more than 1 day
        if len(slopes) > 1:
            trend_snap[i] = slopes[-1] + slopes[-2] * 0.5
    return sorted(trend_snap.items(), key=operator.itemgetter(1), reverse=True)
def smooth(series, window_size, window):
    ext = np.r_[2 * series[0] - series[window_size-1::-1],
                series,
                2 * series[-1] - series[-1:-window_size:-1]]
    weights = window(window_size)
    smoothed = np.convolve(weights / weights.sum(), ext, mode='same')
    return smoothed[window_size:-window_size+1]
def standardize(series):
    iqr = np.percentile(series, 75) - np.percentile(series, 25)
    return (series - np.median(series)) / iqr
trending = train()
print "Top 5 trending products:"
for i, s in trending[:5]:
    print "Product %s (score: %2.2f)" % (i, s)
Instead of
trend = np.array(trends[i])
use
trend = np.array(trends.loc[i])
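The reason is that pd.pivot_table with index=['id', 'age'] returns a frame with a MultiIndex, so trends[i] is treated as a column lookup (hence the KeyError), while trends.loc[i] selects by the first index level. A minimal sketch with made-up data (behaviour may differ slightly across pandas versions):
import numpy as np
import pandas as pd

# hypothetical cart-add data
df = pd.DataFrame({'id': [1, 1, 2, 2],
                   'age': [10, 20, 10, 20],
                   'count': [3, 5, 2, 7]})
trends = pd.pivot_table(df, values='count', index=['id', 'age'])
# trends[1] raises a KeyError because 1 is not a column label
trend = np.array(trends.loc[1])  # the rows where id == 1
print(trend)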

Fastest way to run calculations on a list of lists

I have a list of lists like so:
import numpy as np
import random
import time
import itertools
N = 1000
x =np.random.random((N,N))
y = np.zeros((N,N))
z = np.random.random((N,N))
list_of_lists = [[x, y], [y,z], [z,x]]
and for each sublist I want to calculate the number of non-zero entries, the mean, and the standard deviation.
I have done that like so:
distribution = []
alb_mean = []
alb_std = []
start = time.time()
for i in range(len(list_of_lists)):
    one_mean = []
    non_zero_l = []
    one_list = list_of_lists[i]
    for n in one_list:
        #count non_zeros
        non_zero_count = np.count_nonzero(n)
        non_zero_l.append(non_zero_count)
        #assign nans
        n = n.astype(float)
        n[n == 0.0] = np.nan
        #flatten the matrix
        n = np.array(n.flatten())
        one_mean.append(n)
    #append means and stds
    distribution.append(sum(non_zero_l))
    alb_mean.append(np.nanmean(one_mean))
    alb_std.append(np.nanstd(one_mean))
end = time.time()
print "Loop took {} seconds".format((end - start))
which takes 0.23 seconds.
I tried to make this faster with a second option:
distribution = []
alb_mean = []
alb_std = []
start = time.time()
for i in range(len(list_of_lists)):
    for_mean = []
    #get one list
    one_list = list_of_lists[i]
    #flatten the list
    chain = itertools.chain(*one_list)
    flat = list(chain)
    #count non_zeros
    non_zero_count = np.count_nonzero(flat)
    distribution.append(non_zero_count)
    #remove zeros
    remove_zero = np.setdiff1d(flat, [0.0])
    alb_mean.append(np.nanmean(remove_zero))
    alb_std.append(np.nanstd(remove_zero))
end = time.time()
print "Loop took {} seconds".format((end - start))
which is actually slower and takes 0.88 seconds.
The sheer number of loops has me thinking there is a better way to do this. I have tried numba, but it doesn't seem to like appending in a function.
Version #1
Well, in your sample with the loopy solution, you are looping with two loops: one with 3 iterations and another with 2 iterations. So, it's already close to being a vectorized one; the only bottlenecks are the append steps.
Going fully vectorized, here's one approach -
a = np.array(list_of_lists, dtype=float)
zm = a!=0
avgs = np.einsum('ijkl,ijkl->i',zm,a)/zm.sum(axis=(1,2,3)).astype(float)
a[~zm] = np.nan
stds = np.nanstd(a, axis=(1,2,3))
Using the same setup as in the question, here's what I get on timings -
Loop took 0.150925159454 seconds
Proposed solution took 0.121352910995 seconds
Version #2
We could compute std from the average, re-using avgs for a further boost. Thus, a modified version would be -
a = np.asarray(list_of_lists)
zm = a!=0
N = zm.sum(axis=(1,2,3)).astype(float)
avgs = np.einsum('ijkl,ijkl->i',zm,a)/N
diffs = ((a-avgs[:,None,None,None])**2)
stds = np.sqrt(np.einsum('ijkl,ijkl->i',zm,diffs)/N)
Updated timings -
Loop took 0.155035018921 seconds
Proposed solution took 0.0648851394653 seconds
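If you also need the non-zero counts that the loopy version collects in distribution, they fall out of the same mask; a small follow-up, assuming the setup above:
# zm marks the non-zero entries, so summing it per sublist reproduces the loop's sum(non_zero_l)
distribution = zm.sum(axis=(1, 2, 3))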

generate N random numbers from a skew normal distribution using numpy

I need a function in python to return N random numbers from a skew normal distribution. The skew needs to be taken as a parameter.
e.g. my current use is
x = numpy.random.randn(1000)
and the ideal function would be e.g.
x = randn_skew(1000, skew=0.7)
Solution needs to conform with: python version 2.7, numpy v.1.9
A similar answer is here: skew normal distribution in scipy. However, that generates a PDF, not the random numbers.
I start by generating the PDF curves for reference:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

NUM_SAMPLES = 100000
SKEW_PARAMS = [-3, 0]

def skew_norm_pdf(x, e=0, w=1, a=0):
    # adapted from:
    # http://stackoverflow.com/questions/5884768/skew-normal-distribution-in-scipy
    t = (x-e) / w
    return 2.0 * w * stats.norm.pdf(t) * stats.norm.cdf(a*t)

# generate the skew normal PDF for reference:
location = 0.0
scale = 1.0
x = np.linspace(-5, 5, 100)
plt.subplots(figsize=(12,4))
for alpha_skew in SKEW_PARAMS:
    p = skew_norm_pdf(x, location, scale, alpha_skew)
    # n.b. note that alpha is a parameter that controls skew, but the 'skewness'
    # as measured will be different. see the wikipedia page:
    # https://en.wikipedia.org/wiki/Skew_normal_distribution
    plt.plot(x, p)
Next I found a VB implementation of sampling random numbers from the skew normal distribution and converted it to python:
# literal adaptation from:
# http://stackoverflow.com/questions/4643285/how-to-generate-random-numbers-that-follow-skew-normal-distribution-in-matlab
# original at:
# http://www.ozgrid.com/forum/showthread.php?t=108175
def rand_skew_norm(fAlpha, fLocation, fScale):
    sigma = fAlpha / np.sqrt(1.0 + fAlpha**2)
    afRN = np.random.randn(2)
    u0 = afRN[0]
    v = afRN[1]
    u1 = sigma*u0 + np.sqrt(1.0 - sigma**2) * v
    if u0 >= 0:
        return u1*fScale + fLocation
    return (-u1)*fScale + fLocation

def randn_skew(N, skew=0.0):
    return [rand_skew_norm(skew, 0, 1) for x in range(N)]

# lets check they at least visually match the PDF:
plt.subplots(figsize=(12,4))
for alpha_skew in SKEW_PARAMS:
    p = randn_skew(NUM_SAMPLES, alpha_skew)
    sns.distplot(p)
And then wrote a quick version which (without extensive testing) appears to be correct:
def randn_skew_fast(N, alpha=0.0, loc=0.0, scale=1.0):
    sigma = alpha / np.sqrt(1.0 + alpha**2)
    u0 = np.random.randn(N)
    v = np.random.randn(N)
    u1 = (sigma*u0 + np.sqrt(1.0 - sigma**2)*v) * scale
    u1[u0 < 0] *= -1
    u1 = u1 + loc
    return u1

# lets check again
plt.subplots(figsize=(12,4))
for alpha_skew in SKEW_PARAMS:
    p = randn_skew_fast(NUM_SAMPLES, alpha_skew)
    sns.distplot(p)
from scipy.stats import skewnorm
a=10
data= skewnorm.rvs(a, size=1000)
Here, a is the skewness parameter; for details see:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.skewnorm.html
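For completeness, skewnorm.rvs also accepts loc and scale arguments if you need to shift or stretch the samples; a short example:
from scipy.stats import skewnorm
# a controls the skew; loc and scale shift and stretch the distribution
data = skewnorm.rvs(10, loc=2.0, scale=3.0, size=1000)
print(data.mean())
print(data.std())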
Adapted from the rsnorm function in the fGarch R package:
import numpy

def random_snorm(n, mean=0, sd=1, xi=1.5):
    def random_snorm_aux(n, xi):
        weight = xi/(xi + 1/xi)
        z = numpy.random.uniform(-weight, 1-weight, n)
        xi_ = xi**numpy.sign(z)
        random = -numpy.absolute(numpy.random.normal(0, 1, n))/xi_ * numpy.sign(z)
        m1 = 2/numpy.sqrt(2 * numpy.pi)
        mu = m1 * (xi - 1/xi)
        sigma = numpy.sqrt((1 - m1**2) * (xi**2 + 1/xi**2) + 2 * m1**2 - 1)
        return (random - mu)/sigma
    return random_snorm_aux(n, xi) * sd + mean
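A quick usage example for the function above (the parameter values are just for illustration):
# draw 1000 samples with mean 0, sd 1 and a right skew (xi > 1)
samples = random_snorm(1000, mean=0, sd=1, xi=1.5)
print(numpy.mean(samples))
print(numpy.std(samples))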