Source code for epsproc.util.misc

# -*- coding: utf-8 -*-
"""
ePSproc convenience functions

Collection of small functions for sorting etc.


"""

# import numpy as np
import re
import itertools
import os
from datetime import datetime
# import scipy.constants
#
# # Package fns.
# from epsproc.basicPlotters import molPlot

# natsort is an optional dependency. natsortFlag records whether it could be
# imported, so sorting code can fall back to the built-in sorted() without it.
try:
    from natsort import natsorted  # For natural sorting
    natsortFlag = True

except ImportError as e:
    # Only the "natsort itself is missing" case is tolerated; any other
    # ImportError (different message) is re-raised unchanged.
    if e.msg != "No module named 'natsort'":
        raise
    print('* natsort not found, some sorting functions not available. ')
    natsortFlag = False

#***************** Convenience functions...

# Multistring replace
# See https://gist.github.com/bgusach/a967e0587d6e01e889fd1d776c5f3729
# https://stackoverflow.com/questions/6116978/how-to-replace-multiple-substrings-of-a-string
def stringRepMap(string, replacements, ignore_case=False):
    """
    Given a string and a replacement map, it returns the replaced string.

    All replacements are applied in a single regex pass, so the output of one
    replacement is never re-scanned by another.

    :param str string: string to execute replacements on
    :param dict replacements: replacement dictionary {value to find: value to replace}
    :param bool ignore_case: whether the match should be case insensitive
    :rtype: str

    If `replacements` is empty the input string is returned unchanged.

    CODE from:
    https://gist.github.com/bgusach/a967e0587d6e01e889fd1d776c5f3729
    https://stackoverflow.com/questions/6116978/how-to-replace-multiple-substrings-of-a-string
    ... more or less verbatim.

    Thanks to *bgusach* for the Gist.

    """
    # Edge case: an empty map would build an empty regex, which matches at
    # every position and then fails the lookup below with KeyError('').
    if not replacements:
        return string

    # If case insensitive, we need to normalize the old string so that later a replacement
    # can be found. For instance with {"HEY": "lol"} we should match and find a replacement for "hey",
    # "HEY", "hEy", etc.
    if ignore_case:
        def normalize_old(s):
            return s.lower()

        re_mode = re.IGNORECASE

    else:
        def normalize_old(s):
            return s

        re_mode = 0

    replacements = {normalize_old(key): val for key, val in replacements.items()}

    # Place longer ones first to keep shorter substrings from matching where the longer ones should take place
    # For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should produce
    # 'hey ABC' and not 'hey ABc'
    rep_sorted = sorted(replacements, key=len, reverse=True)

    # Create a big OR regex that matches any of the (escaped, literal) substrings to replace
    pattern = re.compile("|".join(re.escape(key) for key in rep_sorted), re_mode)

    # For each match, look up the new string in the replacements, being the key the normalized old string
    return pattern.sub(lambda match: replacements[normalize_old(match.group(0))], string)

# Sort a 2D numpy array.
def arraySort2D(a, col):
    """
    Return the rows of 2D np.array `a`, reordered by the values in column `col`.

    The row order is taken from an argsort of the selected column.

    From https://thispointer.com/sorting-2d-numpy-array-by-column-or-row-in-python/
    """
    rowOrder = a[:, col].argsort()  # Row indices that sort the chosen column
    return a[rowOrder]


# Set up lambdas for itertools groupby use in fileListSort (below)
# Quick hack to get this working for different file-naming conventions.
# TODO: make nicer & generalise.
# TODO: consider cases with/without prefix str for single and multiple dirs - that's the main difference with prefixStr...?
# 28/04/21 - currently broken for wavefn files, must have changed this for other purposes AFTER https://epsproc.readthedocs.io/en/dev/demos/ePSproc_wfPlot_tests_150720-110820-CH3I-tidy_Stimpy.html ?
#           Quick fix by also matching by file type for orb data files (.dat)
#           Should really use regex here!
def sortGroupFn(fListSorted, prefixStr):
    """
    Return the key function used to group file names (e.g. with itertools.groupby).

    The naming convention is detected from the first item of `fListSorted`
    only, after removing every occurrence of `prefixStr` from it.

    Parameters
    ----------
    fListSorted : list of str
        File names; only item [0] is inspected here.
    prefixStr : str
        String removed from file names (via str.replace) before splitting.

    Returns
    -------
    function
        Callable taking a file name and returning its group key.

    """
    firstName = fListSorted[0].replace(prefixStr, '')
    nParts = len(firstName.split('_'))
    isDatFile = firstName.endswith('.dat')

    # (2) Case for multi-E ePS job output files.
    #  <job>.<orb>_<Sym>_<Eke>.out, e.g. N2O_wf.orb1_S_E1.0_6.0_97.0eV.inp.out
    # In this case, group by everything before the first '_E' in the full
    # name, so no additional prefixStr handling is required.
    # Should likely also check for file extension or other here?
    if nParts >= 2 and not isDatFile:
        return lambda x: x.split('_E')[0]

    # (1) Original case, works for wavefunction files with naming convention
    #  <jobSym>_<Eke>_Orb.dat, e.g. CH3ISA1CA1_1.0eV_Orb.dat
    #  In this case, split and group on first part of file name.
    #  Also used when the name has no '_' at all.
    return lambda x: x.replace(prefixStr, '').split('_')[0]



# Sort & group filenames
def fileListSort(fList, groupByPrefix=True, prefixStr = None, verbose=1):
    """
    Sort a list of file names, and group by prefix.

    Note: this currently assumes a file name schema whereby split('_')[0] picks the grouping string
    (see sortGroupFn for the conventions handled).

    Note: os.path.commonprefix() is used for determining prefix, this may fail in some cases (e.g. for cases where a single file is passed, or files from different dirs).
    Pass prefix manually in these cases.

    Parameters
    ----------
    fList : list of str
        File names to sort. Natural sorting is used if natsort is available,
        otherwise the built-in sorted().
    groupByPrefix : bool, default True
        If True, group the sorted names with the key function from sortGroupFn.
        If False, no grouping is applied and all files are placed in a single group.
    prefixStr : str, optional
        Prefix passed to sortGroupFn. If None (and grouping is on), the common
        prefix of the sorted list is used.
    verbose : int, default 1
        If > 0 print a summary; if > 1 also print the grouped list.

    Returns
    -------
    fListSorted, groupedList, prefixStr
        For inputs with more than one item.
    fList, fList, None
        For inputs with zero or one item (input is passed back unchanged).

    """

    if natsortFlag:
        fListSorted = natsorted(fList)
    else:
        fListSorted = sorted(fList)

    if not fListSorted:
        # Nothing to group; also avoids sortGroupFn indexing into an empty list.
        groupedList = []

    elif groupByPrefix:
        if prefixStr is None:
            prefixStr = os.path.commonprefix(fListSorted)  # Find common prefix if not passed.

        # Solution with itertools groupby
        # Adapted from https://stackoverflow.com/a/13368753
        # groupby only merges consecutive items with equal keys, hence grouping the sorted list.
        groupKey = sortGroupFn(fListSorted, prefixStr)
        groupedList = [list(v) for k, v in itertools.groupby(fListSorted, key=groupKey)]

    else:
        # No grouping requested: keep groupedList defined, as a single group.
        groupedList = [list(fListSorted)]

    if verbose:
        print(f"\n*** FileListSort \n  Prefix: {prefixStr} \n  {len(groupedList)} groups.")

        if verbose > 1:
            print("\n  Grouped list:")
            print(*groupedList, sep = '\n')

    if len(fList) > 1:
        return fListSorted, groupedList, prefixStr
    else:
        return fList, fList, None


# Return a time-string for setting unique file names
# May already have this elsewhere...?
def timeStamp():
    """Return the current local time as a "%d-%m-%y_%H-%M-%S" formatted string, for time-stamping file names."""
    return datetime.now().strftime("%d-%m-%y_%H-%M-%S")


def checkDims(data, refDims=None):
    """
    Check dimensions for a data array (Xarray) vs. a reference list (or dict).

    Returns dictionary of dims, intersection and differences.

    Parameters
    ----------
    data : Xarray
        Object providing `.dims` and `.unstack().dims`.
    refDims : list or dict, optional
        Reference dim names to test against the unstacked dims of `data`.
        If a dict is passed, its keys are used. Default is no reference dims.

    Returns
    -------
    dict
        With keys:

        - 'dataDims' : `data.dims` as-is.
        - 'dataDimsUS' : dims after `data.unstack()`.
        - 'refDims' : the reference dims as passed (empty list if not passed).
        - 'shared' : unstacked dims also found in refDims.
        - 'extra' : unstacked dims not found in refDims.
        - 'stacked' : dims in `data.dims` which are absent after unstacking.
        - 'invalid' : refDims not found in the unstacked dims.

        The 'shared', 'extra' and 'stacked' lists follow the order of the data
        dims, and 'invalid' follows the order of `refDims`; all are free of
        duplicates.

    11/05/21 Added handling for stacked dims.

    """
    if refDims is None:
        refDims = []

    dims = data.dims  # Set dim list - stacked dims appear here under their stacked name
    dimsUS = data.unstack().dims  # Set unstacked (full) dim list

    # Sets are used for membership tests only; output order is taken from the
    # source sequences so results do not depend on set iteration order.
    dimsUSSet = set(dimsUS)
    refUnique = list(dict.fromkeys(refDims))  # De-duplicate, keep order (dict -> keys)
    refSet = set(refUnique)

    stackedDims = [d for d in dict.fromkeys(dims) if d not in dimsUSSet]

    # Check ref vs. full dim list
    dimsUSUnique = list(dict.fromkeys(dimsUS))
    sharedDims = [d for d in dimsUSUnique if d in refSet]  # Intersection
    extraDims = [d for d in dimsUSUnique if d not in refSet]  # Difference
    invalidDims = [d for d in refUnique if d not in dimsUSSet]

    return {'dataDims':dims, 'dataDimsUS':dimsUS, 'refDims':refDims, 'shared':sharedDims,
            'extra':extraDims, 'stacked':stackedDims, 'invalid':invalidDims}


# Subselect from sharedDims
def subselectDims(data, refDims=None):
    """
    Subselect dims from shared dim dict.
    Check dimensions for a data array (Xarray) vs. a reference list (or dict),
    and keep only the reference entries which exist in the data.

    Used to set safe selection criteria in matEleSelector.

    Parameters
    ----------
    data : Xarray
        Data to check, passed to checkDims.
    refDims : list or dict, optional
        Reference dims. Default is no reference dims.

    Returns
    -------
    dict
        If `refDims` is a dict: the items of `refDims` whose key is a shared dim.
    list
        Otherwise: the list of shared dims, as returned by checkDims.

    """
    if refDims is None:
        refDims = []

    # Check dims
    dimSets = checkDims(data, refDims)

    # Subselect
    if isinstance(refDims, dict):
        # Return dict with only subselected keys, i.e. existing dims.
        return {k: v for k, v in refDims.items() if k in dimSets['shared']}

    return dimSets['shared']  # Return shared dim list only.