Source code for refl1d.lib_numba.convolve

import numba
from math import erf, sqrt, exp

# Numerical constants shared by the convolution kernels in this module.
PI4 = 12.56637061435917295385            # 4*pi
PI_180 = 0.01745329251994329576          # pi/180
LN256 = 5.54517744447956247533           # ln(256)
SQRT2 = 1.41421356237309504880           # sqrt(2)
SQRT2PI = 2.50662827463100050241         # sqrt(2*pi)
# ln(0.001): the gaussian kernel is truncated where it falls to 0.001 of
# its peak, i.e. at |x - xo| = sqrt(-2 * sigma**2 * LOG_RESLIMIT).
LOG_RESLIMIT = -6.90775527898213703123
# Ratio of the half-width of a uniform distribution to its standard
# deviation: sqrt(12)/2, which equals sqrt(3).
root_12_over_2 = sqrt(12.0) / 2.0

@numba.njit('(f8[:], f8[:], f8[:], f8[:], f8[:])', cache=True, parallel=False)
def convolve_uniform(xi, yi, x, dx, y):
    """
    Convolve the piecewise-linear curve (*xi*, *yi*) with a uniform
    (boxcar) resolution function, writing the result into *y*.

    For each output point ``x[k]`` the curve is averaged over the window
    ``x[k] +/- sqrt(3)*dx[k]`` (a box whose 1-sigma width is ``dx[k]``),
    clipped to the data range ``[xi[0], xi[-1]]``.

    * If the clipped window is empty, ``y[k]`` is set to 0.
    * If the window has zero width, ``y[k]`` is the value linearly
      interpolated at that position, or the mean of the two neighbouring
      *yi* when the position sits on a zero-width interval of *xi*.

    The code always reads ``xi[j]`` and ``xi[j+1]``, so *xi* needs at least
    two points.  NOTE(review): *xi* is presumably sorted ascending; the
    index scan below relies on it -- confirm against callers.

    Nothing is returned; *y* is filled in place.
    """
    n_in = len(xi)
    n_out = len(x)

    # Index of the interval containing the left edge of the previous window.
    # It is carried from one output point to the next as a search hint, so
    # the loop is inherently sequential (plain range, matching
    # parallel=False in the decorator).
    lo = 0
    for k in range(n_out):
        center = x[k]
        # Convert the 1-sigma width into the half-width of the box.
        half_width = dx[k] * root_12_over_2

        # Integration limits, bounded by the range of the data.
        left = max(center - half_width, xi[0])
        right = min(center + half_width, xi[-1])
        if right < left:
            # The window does not overlap the data range.
            y[k] = 0.
            continue

        # Locate the interval holding the left edge: scan forward past
        # points below the edge (never beyond the last interval), then
        # scan back in case the hint started too far to the right.
        while lo < n_in - 2 and xi[lo] < left:
            lo += 1
        while lo > 0 and xi[lo] > left:
            lo -= 1

        # Current interval is (xa, ya) -> (xb, yb).
        hi = lo + 1
        xa, ya = xi[lo], yi[lo]
        xb, yb = xi[hi], yi[hi]

        total = 0.

        # Remove the part of the first trapezoid lying before the left
        # edge.  With offset = left - xa and slope of the segment, that
        # area is offset*ya + offset*(slope*offset)/2.
        if xa < left:
            cut = left - xa
            slope = (yb - ya)/(xb - xa)
            total -= cut * (ya + 0.5*slope*cut)

        # Trapezoidal integration over every interval that ends before
        # the right edge; zero-width intervals contribute nothing.
        while hi < n_in - 1 and xb < right:
            if xa != xb:
                total += 0.5*(ya + yb)*(xb - xa)
            hi += 1
            xa, ya = xb, yb
            xb, yb = xi[hi], yi[hi]

        # The interval containing the right edge.
        if xa != xb:
            total += 0.5*(ya + yb)*(xb - xa)

        # Remove the part of the last trapezoid lying beyond the right
        # edge: offset*(yb - slope*offset/2) with offset = xb - right.
        if xb > right:
            cut = xb - right
            slope = (yb - ya)/(xb - xa)
            total -= cut * (yb - 0.5*slope*cut)

        if left < right:
            # Normalize by the window length to get the average.
            y[k] = total / (right - left)
        elif xa < xb:
            # Zero-width window: interpolate at left == right.
            cut = left - xa
            slope = (yb - ya)/(xb - xa)
            y[k] = ya + slope*cut
        else:
            # Zero-width window on a zero-width interval: average the y.
            y[k] = 0.5*(ya + yb)
@numba.njit('f8(f8[:], f8[:], i8, i8, f8, f8, f8)', cache=True, parallel=False, locals={
    "z": numba.float64,
    "Glo": numba.float64,
    "erflo": numba.float64,
    "erfmin": numba.float64,
    "y": numba.float64,
    "zhi": numba.float64,
    "Ghi": numba.float64,
    "erfhi": numba.float64,
    "m": numba.float64,
    "b": numba.float64,
})
def convolve_gaussian_point(xin, yin, k, n, xo, limit, sigma):
    """
    Return the gaussian-weighted average at *xo* of the piecewise-linear
    curve (*xin*, *yin*).

    Integration starts at index *k* and proceeds segment by segment until
    a point with ``xin[k] >= xo + limit`` has been included or index
    ``n - 1`` is reached.  Each linear segment is integrated analytically
    against a gaussian of width *sigma* centred on *xo*, and the sum is
    normalized by the area of the gaussian over the range actually
    integrated.

    :param xin: x coordinates of the input curve.
    :param yin: y coordinates of the input curve.
    :param k: index of the first point to integrate from.
    :param n: number of points in *xin* / *yin*.
    :param xo: centre of the gaussian.
    :param limit: distance past *xo* at which integration stops.
    :param sigma: gaussian width; must be nonzero (it is used as a divisor).
    :return: the normalized convolution value, or 0.0 when the integrated
        range carries no gaussian weight (for example when the window does
        not overlap the data), rather than dividing zero by zero.
    """
    two_sigma_sq = 2. * sigma * sigma

    # Gaussian and error function at the left end of the first segment.
    z = xo - xin[k]
    Glo = exp(-z*z/two_sigma_sq)
    erfmin = erflo = erf(-z/(SQRT2*sigma))
    y = 0.
    while k < n-1:
        k += 1
        # No additional contribution from duplicate points.
        if xin[k] != xin[k-1]:
            # Compute the next endpoint.
            zhi = xo - xin[k]
            Ghi = exp(-zhi*zhi/two_sigma_sq)
            erfhi = erf(-zhi/(SQRT2*sigma))
            # Line through the segment: y = m*x + b.
            m = (yin[k]-yin[k-1])/(xin[k]-xin[k-1])
            b = yin[k] - m * xin[k]
            # Add the integral of (m*x + b) * gaussian over the segment.
            y += 0.5*(m*xo+b)*(erfhi-erflo) - sigma/SQRT2PI*m*(Ghi-Glo)

            # Save the endpoint for the next trapezoid.
            Glo = Ghi
            erflo = erfhi

            # Check if we've calculated far enough.
            if xin[k] >= xo+limit:
                break

    # Normalize by the area of the truncated gaussian; at this point erflo
    # holds the error function at the right end of the integrated range.
    norm = erflo - erfmin
    if norm == 0.:
        # Nothing was integrated (start index already at the last point)
        # or the gaussian has no weight on the integrated range.  Numba's
        # default error model would raise ZeroDivisionError on 0/0, so
        # report no contribution instead, as convolve_uniform does when
        # its window misses the data.
        return 0.
    return 2 * y / norm


# has same performance when using guvectorize instead of njit:
# @numba.guvectorize("(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:])", '(),(m),(m),(),(n),(n)->(n)')
@numba.njit("(f8[:], f8[:], f8[:], f8[:], f8[:])", cache=True, parallel=False, locals={
    "sigma": numba.float64,
    "xo": numba.float64,
    "limit": numba.float64,
    "k_in": numba.int64,
    "k_out": numba.int64,
})
def convolve_gaussian(xin, yin, x, dx, y):
    """
    Convolve the piecewise-linear curve (*xin*, *yin*) with a gaussian
    resolution function, writing the result into *y*.

    For each output point ``x[k]`` with width ``dx[k]``:

    * ``dx[k] > 0``: the gaussian-weighted average computed by
      :func:`convolve_gaussian_point`, with the gaussian truncated where
      it falls to ``exp(LOG_RESLIMIT)`` of its peak.
    * otherwise: the value linearly interpolated (or extrapolated from the
      last interval) at ``x[k]``.  If the interval used has zero width
      (duplicated x values in *xin*), the mean of its two y values is
      used instead of dividing by zero.

    *xin* must be sorted and contain more than one point; with fewer
    points the zero-width case leaves ``y[k]`` untouched.  The routine is
    correct but slower when *x* is not sorted, since the starting index is
    carried from one output point to the next.

    Nothing is returned; *y* is filled in place.
    """
    Nin = len(xin)
    Nout = len(x)

    # FIXME fails if xin are not sorted; slow if x not sorted
    # assert(Nin>1)

    # Scan through all x values to be calculated.  k_in is kept between
    # output points as the starting guess for the next window.
    k_in = 0
    for k_out in range(Nout):
        # width of resolution window for x is w = 2 dx^2.
        sigma = dx[k_out]
        xo = x[k_out]
        limit = sqrt(-2.*sigma*sigma * LOG_RESLIMIT)

        # Line up the left edge of the convolution window.  It is probably
        # forward from the current position, but if the next dx is a lot
        # higher than the current dx or if the x are not sorted, then it
        # may be before the current position.
        # FIXME verify that the convolution window is just right
        while k_in < Nin-1 and xin[k_in] < xo-limit:
            k_in += 1
        while k_in > 0 and xin[k_in] > xo-limit:
            k_in -= 1

        # Special handling to avoid 0/0 for w=0.
        if sigma > 0.:
            y[k_out] = convolve_gaussian_point(
                xin, yin, k_in, Nin, xo, limit, sigma)
        elif k_in < Nin-1:
            # Linear interpolation on the interval [k_in, k_in+1].
            run = xin[k_in+1] - xin[k_in]
            if run != 0.:
                m = (yin[k_in+1]-yin[k_in])/run
                b = yin[k_in] - m*xin[k_in]
                y[k_out] = m*xo + b
            else:
                # Zero-width interval (duplicated x): average the y values
                # rather than dividing by zero.
                y[k_out] = 0.5*(yin[k_in] + yin[k_in+1])
        elif k_in > 0:
            # Linear extrapolation from the interval [k_in-1, k_in].
            run = xin[k_in] - xin[k_in-1]
            if run != 0.:
                m = (yin[k_in]-yin[k_in-1])/run
                b = yin[k_in] - m*xin[k_in]
                y[k_out] = m*xo + b
            else:
                # Zero-width interval (duplicated x): average the y values
                # rather than dividing by zero.
                y[k_out] = 0.5*(yin[k_in-1] + yin[k_in])
        else:
            # Can't happen because there is more than one point in xin.
            # assert(Nin>1)
            pass