Low-level function tests & benchmarks

22/02/20

Aims:

  • Test and improve speed for BLM calculations, currently pure python & looped in ep.mfblm().
  • Develop faster routines & indexing methods.
  • Employ Numba for compilation & parallelism (already employed by Moble’s Spherical Functions library, which is currently used for Wigner3j and WignerD functions in ePSproc - see previous tests and benchmarks for more details).
  • Any other algorithm & method developments…

Setup

Import required libraries. Bring in xyzpy for benchmarking utils & high-level parallelism.

[1]:
# Imports
import numpy as np
import pandas as pd
import xarray as xr
from functools import lru_cache  # For function result caching

# Special functions
# from scipy.special import sph_harm
import spherical_functions as sf
import quaternion

# Performance & benchmarking libraries
from joblib import Memory
import xyzpy as xyz
import numba as nb

# Timings with ttictoc
# https://github.com/hector-sab/ttictoc
from ttictoc import TicToc

# Package fns.
# For module testing, include path to module here
import sys
import os
modPath = r'D:\code\github\ePSproc'
sys.path.append(modPath)
import epsproc as ep
# TODO: tidy this up!
from epsproc.util import matEleSelector
* pyevtk not found, VTK export not available.

3j function parallelization

On the backend, this currently uses sf.Wigner3j (based on sympy.physics.wigner). This is pretty fast, using vector look-ups and @njit compilation, but it is not parallelised over quantum number inputs (in contrast to the WignerD implementation, which is parallelised over either rotation angles/quaternions or QNs).

For photoionization calculations, products and sums over multiple 3j symbols are the norm, and may be repeated many times (see here), so it’s useful to make things as fast and efficient as possible, and to consider the low-level details carefully: what can be parallelised, what can be done via precalculation and fast lookup? How best to index over the ND vector space?
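
As a minimal sketch of the precalculation & fast lookup idea (illustrative only, not used in the benchmarks below): the 3j values depend only on the integer QNs, so they can be cached in a plain dict keyed on QN tuples, or via functools.lru_cache on a scalar-argument wrapper (lru_cache needs hashable args, so it can't take the np.array QN sets used below directly). The function and cache names here are just placeholders.

# Sketch: cache 3j values keyed on (l, lp, L, m, mp, M) tuples to avoid recomputing repeated terms
w3jCache = {}

def w3jLookup(l, lp, L, m, mp, M):
    key = (l, lp, L, m, mp, M)
    if key not in w3jCache:
        w3jCache[key] = sf.Wigner3j(l, lp, L, m, mp, M)
    return w3jCache[key]

# Equivalent using functools.lru_cache (scalar args only)
@lru_cache(maxsize = None)
def w3jCachedScalar(l, lp, L, m, mp, M):
    return sf.Wigner3j(l, lp, L, m, mp, M)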

[5]:
# Define sets of QNs (l, lp, L, m, mp, M) to use for calculations
# Returns an array of QNs, plus a zero array to hold the corresponding 3j values
def setQNs(Lmin = 0, Lmax = 4, mFlag = True):
    # Set QNs for calculation, (l, lp, L, m, mp, M)
    QNs = []
    for l in np.arange(Lmin, Lmax+1):
        for lp in np.arange(Lmin, Lmax+1):

            if mFlag:
                mMax = l
                mpMax = lp
            else:
                mMax = 0
                mpMax = 0

            for m in np.arange(-mMax, mMax+1):
                for mp in np.arange(-mpMax, mpMax+1):
                    for L in np.arange(np.abs(l-lp), l+lp+1):
                        M = -(m+mp)
                        QNs.append([l, lp, L, m, mp, M])

    QNs = np.array(QNs)
    w3j_QNs = np.zeros(QNs.shape[0])  # Array to hold results - now returned along with QNs for consistency over functions
#     QNs = np.c_[QNs, np.zeros(QNs.shape[0])]  # Use final col. to hold results

    return QNs, w3j_QNs

# Wrappers for function tests
# NOTE: For consistent benchmarking with xyzpy, set to single input to map. There's probably a better way to do this!

# Bare function with loop, pass w3j_QNs
def Wigner3jQNs(QNs,w3j_QNs):
    for n in range(QNs.shape[0]):
        w3j_QNs[n] = sf.Wigner3j(QNs[n,0], QNs[n,1], QNs[n,2], QNs[n,3], QNs[n,4], QNs[n,5])

    return w3j_QNs


# Parallelised 3j using Numba prange, see sf_function_tests_110220.py
@nb.njit(parallel = True)
def w3jprange(QNs, w3j_QNs):
#     w3j_QNs = np.zeros(QNs.shape[0])
    for n in nb.prange(0, QNs.shape[0]):
        w3j_QNs[n] = sf.Wigner3j(QNs[n,0], QNs[n,1], QNs[n,2], QNs[n,3], QNs[n,4], QNs[n,5])
    return w3j_QNs

# Vectorized 3j, see sf_function_tests_110220.py
# Use Numba to compile the function, vectorised over input sets of QNs; doesn't return a value, but writes directly to the passed 1D array of corresponding 3j terms
@nb.guvectorize(["void(int32[:,:], float64[:])"], '(n,m)->(n)', target = 'parallel')
def w3jguVecCPU(QNs,w3j_QNs):
    for n in range(QNs.shape[0]):
        w3j_QNs[n] = sf.Wigner3j(QNs[n,0], QNs[n,1], QNs[n,2], QNs[n,3], QNs[n,4], QNs[n,5])
# @nb.guvectorize(["void(float64[:,:])"], '(n,m)->(n,m)', target = 'parallel')
# def w3jguVecCPU(QNs):
#     for n in range(QNs.shape[0]):
#         QNs[n,QNs.shape[1]-1] = sf.Wigner3j(QNs[n,0], QNs[n,1], QNs[n,2], QNs[n,3], QNs[n,4], QNs[n,5])

# Try wrapping vec version for caching too...
# Doesn't work in this form - lru_cache requires hashable args, so it won't accept np.array inputs
# @lru_cache(maxsize = None)
# def Wigner3jguVecCPULRUCached(QNs,w3j_QNs):
#     w3jguVecCPU(QNs,w3j_QNs)

cachedir = r'E:\temp\memCache'  # SSD cache - is it possible to set RAM cache here?
memory = Memory(cachedir, verbose=0)
@memory.cache
def Wigner3jguVecCPUMemory(QNs,w3j_QNs):
    w3jguVecCPU(QNs,w3j_QNs)
# def Wigner3jguVecCPUMemory(QNs):
#     w3jguVecCPU(QNs,w3j_QNs)

# To try - other high-level parallelisation methods, e.g. xyzpy parallel, joblib
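
# Hedged sketch of a joblib version (as suggested above, not benchmarked below): chunk the QN sets
# and run the plain loop per chunk via the default process-based (loky) backend.
# Names (w3jChunk, Wigner3jJoblib) and the chunking parameters are illustrative only.
from joblib import Parallel, delayed

def w3jChunk(QNchunk):
    # Plain loop over one block of QNs, as per Wigner3jQNs()
    return [sf.Wigner3j(*row) for row in QNchunk]

def Wigner3jJoblib(QNs, w3j_QNs, nJobs = -1, nChunks = 16):
    # Split QNs into blocks, compute blocks in parallel, then reassemble into w3j_QNs
    results = Parallel(n_jobs = nJobs)(delayed(w3jChunk)(chunk) for chunk in np.array_split(QNs, nChunks))
    w3j_QNs[:] = np.concatenate(results)
    return w3j_QNs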

Benchmark for a range of QN arrays with xyzpy.Benchmarker

Results shown here for benchmarking on an AMD Threadripper 1950X (16 core) system, Feb. 2020.

[7]:
# Benchmarks with xyzpy
# https://xyzpy.readthedocs.io/en/latest/utilities.html

# Set test kernels
kernels = [
    Wigner3jQNs,
    w3jprange,
    w3jguVecCPU,
    Wigner3jguVecCPUMemory,
]

# Set benchmarker object
benchmarker = xyz.Benchmarker(
    kernels,
#     setup=setQNs,
    setup=lambda n: setQNs(Lmax = n),  # Use lambda fn to ensure correct input here, see also Perfplot https://github.com/nschloe/perfplot
    benchmark_opts={'min_t': 0.01, 'starmap': True}   # Use starmap = True to pass all args, i.e. foo(*setup(n))
)

# Run benchmarks for all kernels
Lrange = [i for i in range(0, 15)]
benchmarker.run(Lrange, verbosity=2)
{'n': 14, 'kernel': 'Wigner3jguVecCPUMemory'}: 100%|###################################| 60/60 [01:25<00:00,  1.43s/it]
[8]:
benchmarker.ilineplot()
[Interactive Bokeh line plot: kernel run times vs. n.]

The prange case wins pretty much all the way!

[9]:
# As above, but m=0 terms only
benchmarker = xyz.Benchmarker(
    kernels,
#     setup=setQNs,
    setup=lambda n: setQNs(Lmax = n, mFlag = False),  # Use lambda fn to ensure correct input here, and allow for other options, see also Perfplot https://github.com/nschloe/perfplot
    benchmark_opts={'min_t': 0.01, 'starmap': True}   # Use starmap = True to pass all args, i.e. foo(*setup(n))
)

# Run benchmarks for all kernels
# Additional args are passed to the kernels via starmap = True in benchmark_opts above
Lrange = [i for i in range(0, 15)]
benchmarker.run(Lrange, verbosity=2)   # Note this will fail upon repeat with different Lrange unless benchmarker object is reset

benchmarker.ilineplot()
{'n': 14, 'kernel': 'Wigner3jguVecCPUMemory'}: 100%|###################################| 60/60 [00:04<00:00, 13.58it/s]
[Interactive Bokeh line plot: kernel run times vs. n, m=0 case.]

In this case the vectorized version is fastest for n<7.
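
Combining the two benchmark sets suggests a simple size-based dispatch: prange in general, the vectorized version for small m=0 problems. The sketch below is illustrative only - the n<7 crossover is specific to this machine and these QN set sizes, and the wrapper name and arguments are not part of ePSproc.

# Sketch: choose a 3j backend from the benchmark results above (machine-specific threshold)
def w3jCalcAuto(QNs, w3j_QNs, Lmax = None, mFlag = True):
    if (not mFlag) and (Lmax is not None) and (Lmax < 7):
        return w3jguVecCPU(QNs, w3j_QNs)   # Vectorized version fastest for small, m=0 cases
    return w3jprange(QNs, w3j_QNs)         # prange version fastest otherwise

# Usage, e.g. m=0 terms only up to Lmax = 4:
# QNs, w3j_QNs = setQNs(Lmax = 4, mFlag = False)
# w3j = w3jCalcAuto(QNs, w3j_QNs, Lmax = 4, mFlag = False)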