# Source code for refl1d.lib_numba.convolve

```python
import numba
from math import erf, sqrt, exp

# Numeric constants written out beyond double precision.
# PI4, PI_180 and LN256 are not referenced by the functions visible in this
# listing.
PI4 = 12.56637061435917295385  # 4*pi
PI_180 = 0.01745329251994329576  # pi/180
LN256 = 5.54517744447956247533  # ln(256)
SQRT2 = 1.41421356237309504880  # sqrt(2)
SQRT2PI = 2.50662827463100050241  # sqrt(2*pi)
# ln(0.001): convolve_gaussian truncates its window where the Gaussian has
# dropped to exp(LOG_RESLIMIT) = 0.001 of its peak height.
LOG_RESLIMIT = -6.90775527898213703123
# sqrt(12)/2 == sqrt(3): multiplies a 1-sigma width to give the half-width of
# the uniform (boxcar) window used by convolve_uniform.
root_12_over_2 = sqrt(3)

[docs]
@numba.njit('(f8[:], f8[:], f8[:], f8[:], f8[:])', cache=True, parallel=False)
def convolve_uniform(xi, yi, x, dx, y):
    """
    Average a piecewise-linear curve over a uniform window at each point.

    For every output point ``x[k]`` the curve through ``(xi, yi)`` is
    integrated by the trapezoid rule over ``x[k] +/- dx[k]*sqrt(3)``, clipped
    to ``[xi[0], xi[-1]]``, and divided by the clipped window length.  The
    result is stored in ``y[k]``; nothing is returned.

    Parameters are 1-D float64 arrays:

    *xi*, *yi* : sample positions and values of the curve.  The scan assumes
    *xi* is in ascending order; repeated positions are tolerated.  At least
    two samples are read unconditionally.

    *x*, *dx* : output positions and their 1-sigma widths.

    *y* : output array, written in place.

    Special cases:

    * window entirely outside the data range -> ``y[k] = 0``
    * zero-length window -> value linearly interpolated at that position
    * zero-length window on a zero-length interval -> mean of the two
      ordinates of that interval
    """
    n_in = len(xi)
    # Index of the interval containing the window's left edge.  It is kept
    # between output points so each search resumes where the last one ended.
    start = 0
    for k in numba.prange(len(x)):
        center = x[k]
        # A uniform distribution with standard deviation s has half-width
        # s*sqrt(12)/2.
        half_width = dx[k] * root_12_over_2
        # Integration limits, clipped to the span of the data.
        lo, hi = max(center - half_width, xi[0]), min(center + half_width, xi[-1])
        if hi < lo:
            # The window lies completely outside the data.
            y[k] = 0.
            continue

        # Walk forward to the first sample at or beyond the left edge, then
        # back to the last sample at or before it.  Stopping the forward
        # walk at n_in-2 guarantees there is always an interval
        # (start, start+1) to work with.
        while start < n_in-2 and xi[start] < lo:
            start += 1
        while start > 0 and xi[start] > lo:
            start -= 1

        # Current interval is (xa, ya)-(xb, yb).
        nxt = start + 1
        xa, ya = xi[start], yi[start]
        xb, yb = xi[nxt], yi[nxt]
        acc = 0.

        # The first interval will be added in full below, so remove in
        # advance the part of it that lies left of the window:
        #     gap * ya + gap * (y(lo) - ya)/2  ==  gap * (ya + grad*gap/2)
        if xa < lo:
            gap = lo - xa
            grad = (yb - ya)/(xb - xa)
            acc -= gap * (ya + 0.5*grad*gap)

        # Trapezoid rule over whole intervals, up to and including the one
        # that contains the right edge (or the last interval of the data).
        while True:
            if xa != xb:  # zero-length intervals contribute nothing
                acc += 0.5*(ya + yb)*(xb - xa)
            if not (nxt < n_in-1 and xb < hi):
                break
            nxt += 1
            xa, ya, xb, yb = xb, yb, xi[nxt], yi[nxt]

        # Remove the part of the last interval that lies right of the window:
        #     gap * y(hi) + gap * (yb - y(hi))/2  ==  gap * (yb - grad*gap/2)
        if xb > hi:
            gap = xb - hi
            grad = (yb - ya)/(xb - xa)
            acc -= gap * (yb - 0.5*grad*gap)

        if lo < hi:
            # Normalize the integral by the window length.
            y[k] = acc / (hi - lo)
        elif xa < xb:
            # Zero-length window (lo == hi): interpolate the curve there.
            grad = (yb - ya)/(xb - xa)
            y[k] = ya + grad*(lo - xa)
        else:
            # Zero-length window on a zero-length interval: average the
            # ordinates.
            y[k] = 0.5*(ya + yb)

@numba.njit('f8(f8[:], f8[:], i8, i8, f8, f8, f8)', cache=True, parallel=False, locals={
    "z": numba.float64,
    "Glo": numba.float64,
    "erflo": numba.float64,
    "erfmin": numba.float64,
    "y": numba.float64,
    "zhi": numba.float64,
    "Ghi": numba.float64,
    "erfhi": numba.float64,
    "m": numba.float64,
    "b": numba.float64,
})
def convolve_gaussian_point(xin, yin, k, n,
                            xo, limit, sigma):
    """
    Gaussian-weighted average of a piecewise-linear curve at one point.

    The curve through ``(xin, yin)`` is integrated against a Gaussian of
    width *sigma* centered on *xo*, interval by interval, starting at input
    index *k* and stopping at the first input point at or beyond
    ``xo + limit`` (or at index ``n - 1``).  The sum is divided by the area of
    the Gaussian over the same range, so the result is a weighted mean.

    Parameters:

    *xin*, *yin* : 1-D float64 arrays holding the curve.  Repeated *xin*
    values are skipped.

    *k* : index of the first input point to use.

    *n* : number of input points.

    *xo* : center of the Gaussian.

    *limit* : distance beyond *xo* at which the integration stops.

    *sigma* : Gaussian standard deviation.  It is used as a divisor, so the
    caller must ensure ``sigma > 0``.

    Returns the weighted mean as a float.  Returns 0.0 when the Gaussian has
    no weight over the scanned range (for example the window lies far
    outside the data, or only repeated points remain).  Without that check
    the normalization would be a division by zero.
    """
    two_sigma_sq = 2. * sigma * sigma

    # Gaussian and error function evaluated at the left end of the range.
    z = xo - xin[k]
    Glo = exp(-z*z/two_sigma_sq)
    erfmin = erflo = erf(-z/(SQRT2*sigma))
    y = 0.
    while k < n-1:
        k += 1
        # No additional contribution from duplicate points.
        if xin[k] != xin[k-1]:
            # Evaluate at the right end of the current interval.
            zhi = xo - xin[k]
            Ghi = exp(-zhi*zhi/two_sigma_sq)
            erfhi = erf(-zhi/(SQRT2*sigma))
            # Line y = m*x + b through the two ends of the interval.
            m = (yin[k]-yin[k-1])/(xin[k]-xin[k-1])
            b = yin[k] - m * xin[k]

            # Integral of (m*x + b) times the unit-area Gaussian over the
            # interval.
            y += 0.5*(m*xo+b)*(erfhi-erflo) - sigma/SQRT2PI*m*(Ghi-Glo)

            # The right end becomes the left end of the next interval.
            Glo = Ghi
            erflo = erfhi

            # Stop once the window's right edge has been reached.
            if xin[k] >= xo+limit:
                break

    # Here erflo holds the error function at the last point used, so
    # (erflo - erfmin)/2 is the area of the truncated Gaussian.
    if erflo == erfmin:
        # Zero weight over the scanned range: return 0 instead of dividing
        # by zero.
        return 0.
    return 2 * y / (erflo - erfmin)

# has same performance when using guvectorize instead of njit:
# @numba.guvectorize("(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:])", '(),(m),(m),(),(n),(n)->(n)')

[docs]
@numba.njit("(f8[:], f8[:], f8[:], f8[:], f8[:])", cache=True, parallel=False, locals={
    "sigma": numba.float64,
    "xo": numba.float64,
    "limit": numba.float64,
    "k_in": numba.int64,
    "k_out": numba.int64,
})
def convolve_gaussian(xin, yin, x, dx, y):
    """
    Convolve a piecewise-linear curve with a per-point Gaussian resolution.

    For each output point ``x[k]`` with width ``sigma = dx[k]`` the curve
    through ``(xin, yin)`` is averaged with a Gaussian weight truncated at
    ``limit = sqrt(-2*sigma**2*LOG_RESLIMIT)`` on either side, i.e. where
    the Gaussian has fallen to ``exp(LOG_RESLIMIT)`` of its peak.  Results
    are written to *y*; nothing is returned.

    Parameters are 1-D float64 arrays:

    *xin*, *yin* : the input curve.  *xin* must be sorted in ascending
    order; the window search fails otherwise.

    *x*, *dx* : output positions and their Gaussian widths.  Unsorted *x*
    is handled but makes the window search slower.

    *y* : output array, written in place.

    When ``dx[k]`` is not positive the curve is linearly interpolated (or
    extrapolated from the last interval) at ``x[k]``.  If the interval used
    has zero width because of repeated *xin* values, the mean of its two
    ordinates is returned rather than dividing by zero.  With fewer than
    two input points and a non-positive width, ``y[k]`` is left unchanged.
    """
    Nin = len(xin)
    Nout = len(x)

    # FIXME fails if xin are not sorted; slow if x not sorted

    # Index of the input point at the left edge of the window.  It is kept
    # between output points: the next window usually starts at or after
    # the current one, so little rescanning is needed.
    k_in = 0

    for k_out in range(Nout):
        sigma = dx[k_out]
        xo = x[k_out]
        limit = sqrt(-2.*sigma*sigma * LOG_RESLIMIT)

        # Line up with the left edge of the convolution window.  It is
        # probably forward of the current position, but if this dx is much
        # larger than the previous one, or x is not sorted, it may be
        # behind it.
        # FIXME verify that the convolution window is just right
        while k_in < Nin-1 and xin[k_in] < xo-limit:
            k_in += 1
        while k_in > 0 and xin[k_in] > xo-limit:
            k_in -= 1

        if sigma > 0.:
            y[k_out] = convolve_gaussian_point(
                xin, yin, k_in, Nin, xo, limit, sigma)
        elif Nin > 1:
            # Zero width: evaluate the curve itself at xo.  Use the interval
            # to the right of k_in, or the last interval when k_in is the
            # final point (linear extrapolation).
            if k_in < Nin-1:
                lo, hi = k_in, k_in+1
            else:
                lo, hi = k_in-1, k_in
            width = xin[hi] - xin[lo]
            if width != 0.:
                m = (yin[hi]-yin[lo])/width
                b = yin[k_in] - m*xin[k_in]
                y[k_out] = m*xo + b
            else:
                # Repeated abscissa: the slope is undefined, so average the
                # two ordinates, as convolve_uniform does for an empty
                # interval.
                y[k_out] = 0.5*(yin[lo] + yin[hi])
        # else: fewer than two input points, so there is no interval to
        # interpolate on and y[k_out] is left untouched.

```