Source code for SOAPify.saponify

"""Submodule that contains the workhorse routines to apply the SOAP calculations
"""
import time
from typing import Iterable
import h5py
import numpy

from .HDF5er import (
    HDF52AseAtomsChunckedwithSymbols as HDF2ase,
    isTrajectoryGroup,
)
from .engine import SOAPengineContainer, getSoapEngine, KNOWNSOAPENGINES


def _saponifyWorker(
    trajGroup: h5py.Group,
    SOAPoutDataset: h5py.Dataset,
    soapEngine: SOAPengineContainer,
    SOAPOutputChunkDim: int = 100,
    SOAPnJobs: int = 1,
    verbose: bool = True,
):
    """Calculates the soap descriptor and store the result in the given dataset

    Args:
        trajGroup (h5py.Group):
            the group that contains the trajectory (must contain "Box",
            "Trajectory" and "Types" datasets)
        SOAPoutDataset (h5py.Dataset):
            The preformed dataset for storing the SOAP results
        soapEngine (SOAPengineContainer):
            The soap engine already set up
        SOAPOutputChunkDim (int, optional):
            The dimension of the chunck of data in the SOAP results dataset.
            Defaults to 100.
        SOAPnJobs (int, optional):
            the number of concurrent SOAP calculations (option passed to the
            desired SOAP engine). Defaults to 1.
        verbose (bool, optional):
            regulates the verbosity of the step by step operations.
            Defaults to True.
    """
    symbols = trajGroup["Types"].asstr()[:]
    SOAPoutDataset.attrs["SOAPengine"] = soapEngine.SOAPenginekind
    SOAPoutDataset.attrs["l_max"] = soapEngine.lmax
    SOAPoutDataset.attrs["n_max"] = soapEngine.nmax
    SOAPoutDataset.attrs["r_cut"] = soapEngine.rcut
    SOAPoutDataset.attrs["species"] = soapEngine.species
    # this should not be needed, given how the preparation of the dataset works
    # if soapEngine.centersMask is None:
    #     if "centersIndexes" in SOAPoutDataset.attrs:
    #         del SOAPoutDataset.attrs["centersIndexes"]
    # else:
    if soapEngine.centersMask is not None:
        SOAPoutDataset.attrs.create("centersIndexes", soapEngine.centersMask)
        # print(centersMask)
        # print(SOAPoutDataset.attrs["centersIndexes"])
        # print(type(SOAPoutDataset.attrs["centersIndexes"]))

    nspecies = len(soapEngine.species)
    for i in range(nspecies):
        for j in range(nspecies):
            if soapEngine.crossover or (i == j):
                temp = soapEngine.getLocation(
                    soapEngine.species[i], soapEngine.species[j]
                )
                SOAPoutDataset.attrs[
                    f"species_location_{soapEngine.species[i]}-{soapEngine.species[j]}"
                ] = (temp.start, temp.stop)

    for chunkTraj in trajGroup["Trajectory"].iter_chunks():
        chunkBox = (chunkTraj[0], slice(0, 6, 1))
        if verbose:
            print(f'working on trajectory chunk "{chunkTraj}"')
            print(f'   and working on box chunk "{repr(chunkBox)}"')
        # load in memory a chunk of data
        atoms = HDF2ase(trajGroup, chunkTraj, chunkBox, symbols)
        jobchunk = min(SOAPOutputChunkDim, len(atoms))
        jobStart = 0
        jobEnd = jobStart + jobchunk
        while jobStart < len(atoms):
            tStart = time.time()
            frameStart = jobStart + chunkTraj[0].start
            frameEnd = jobEnd + chunkTraj[0].start
            if verbose:
                print(f"working on frames: [{frameStart}:{frameEnd}]")
            # TODO: dscribe1.2.1 return (nat,nsoap) instead of (1,nat,nsoap) with 1 frame input!
            SOAPoutDataset[frameStart:frameEnd] = soapEngine(
                atoms[jobStart:jobEnd],
                positions=[soapEngine.centersMask] * jobchunk,
                n_jobs=SOAPnJobs,
            )
            tStop = time.time()
            jobchunk = min(SOAPOutputChunkDim, len(atoms) - jobEnd)
            jobStart = jobEnd
            jobEnd = jobStart + jobchunk
            if verbose:
                print(f"delta create= {tStop-tStart}")


def _applySOAP(
    trajContainer: h5py.Group,
    SOAPoutContainer: h5py.Group,
    key: str,
    soapEngine: SOAPengineContainer,
    SOAPOutputChunkDim: int = 100,
    SOAPnJobs: int = 1,
    doOverride: bool = False,
    verbose: bool = True,
    useType="float64",
):
    """helper function: applies the soap engine to the given trajectory within the trajContainer

    Args:
        trajContainer (h5py.Group):
            The group or the file that contains the trajectory, must have the
            following dataset in '/': "Box", "Trajectory" and "Types"
        SOAPoutContainer (h5py.Group)
             The group where the dataset with the  SOAP fingerprints will be saved
        key (str):
            the name of the dataset to be saved, if exist will be overidden
        soapEngine (SOAPengineContainer):
            the contained of the soap engine
        SOAPOutputChunkDim (int, optional):
            The chunk of trajectory that will be loaded in memory to be calculated,
            if key is a new dataset will also be the size of the main chunck of
            data of the SOAP dataset . Defaults to 100.
        SOAPnJobs (int, optional):
            Number of concurrent SOAP calculations. Defaults to 1.
        doOverride (bool, optional):
            if False will raise and exception if the user ask to override an
            already existing DataSet. Defaults to False.
        verbose (bool, optional):
            regulates the verbosity of the step by step operations.
            Defaults to True.
        useType (str,optional):
            The precision used to store the data. Defaults to "float64".
    """
    useType = numpy.dtype(useType)
    nOfFeatures = soapEngine.features
    symbols = trajContainer["Types"].asstr()[:]
    nCenters = (
        len(symbols) if soapEngine.centersMask is None else len(soapEngine.centersMask)
    )

    if key in SOAPoutContainer.keys():
        if doOverride is False:
            raise ValueError(
                f"Are you sure that you want to override {SOAPoutContainer[key].name}?"
            )
        # doOverride is True and key in SOAPoutContainer.keys():
        # check if deleting the dataset is necessary:
        oldshape = SOAPoutContainer[key].shape
        if oldshape[1] != nCenters or oldshape[2] != nOfFeatures:
            del SOAPoutContainer[key]
    if key not in SOAPoutContainer.keys():
        SOAPoutContainer.create_dataset(
            key,
            (0, nCenters, nOfFeatures),
            compression="gzip",
            compression_opts=9,
            chunks=(SOAPOutputChunkDim, nCenters, nOfFeatures),
            maxshape=(None, nCenters, nOfFeatures),
            dtype=useType,
        )
    SOAPout = SOAPoutContainer[key]
    SOAPout.resize((len(trajContainer["Trajectory"]), nCenters, nOfFeatures))
    _saponifyWorker(
        trajContainer,
        SOAPout,
        soapEngine,
        SOAPOutputChunkDim,
        SOAPnJobs,
        verbose=verbose,
    )


[docs]def saponifyMultipleTrajectories( trajContainers: "h5py.Group|h5py.File", SOAPoutContainers: "h5py.Group|h5py.File", SOAPrcut: float, SOAPnmax: int, SOAPlmax: int, SOAPOutputChunkDim: int = 100, SOAPnJobs: int = 1, SOAPatomMask: "list[str]" = None, centersMask: Iterable = None, SOAP_respectPBC: bool = True, SOAPkwargs: dict = None, useSoapFrom: KNOWNSOAPENGINES = "dscribe", doOverride: bool = False, verbose: bool = True, useType="float64", ): """Calculates and stores the SOAP descriptor for all of the trajectories in the given group/file `saponifyMultipleTrajectories` checks if any of the group contained in `trajContainers` is a "trajectory group" (see :func:`SOAPify.HDF5er.HDF5erUtils.isTrajectoryGroup`) and then calculates the soap fingerprints for that trajectory and saves the result in a dataset within the `SOAPoutContainers` group or file `SOAPatomMask` and `centersMask` are mutually exclusive (see :func:`SOAPify.engine.getSoapEngine`) Args: trajContainers (h5py.Group|h5py.File): The file/group that contains the trajectories SOAPoutContainers (h5py.Group|h5py.File) The file/group that will store the SOAP results SOAPrcut (float) The cutoff for local region in angstroms. Should be bigger than 1 angstrom (option passed to the desired SOAP engine). Defaults to 8.0. SOAPnmax (int) The number of radial basis functions (option passed to the desired SOAP engine). Defaults to 8. SOAPlmax (int) The maximum degree of spherical harmonics (option passed to the desired SOAP engine). Defaults to 8. SOAPOutputChunkDim (int, optional) The dimension of the chunck of data in the SOAP results dataset. Defaults to 100. SOAPnJobs (int, optional) the number of concurrent SOAP calculations (option passed to the desired SOAP engine). Defaults to 1. SOAPatomMask (list[str], optional) the symbols of the atoms whose SOAP fingerprint will be calculated (option passed to getSoapEngine). Defaults to None. centersMask (Iterable, optional) the indexes of the atoms whose SOAP fingerprint will be calculated (option passed getSoapEngine). Defaults to None. SOAP_respectPBC (bool, optional) Determines whether the system is considered to be periodic (option passed to the desired SOAP engine). Defaults to True. SOAPkwargs (dict, optional) additional keyword arguments to be passed to the selected SOAP engine. Defaults to {}. useSoapFrom (KNOWNSOAPENGINES, optional) This string determines the selected SOAP engine for the calculations. Defaults to "dscribe". doOverride (bool, optional) if False will raise and exception if the user ask to override an already existing DataSet. Defaults to False. verbose (bool, optional): regulates the verbosity of the step by step operations. Defaults to True. useType (str,optional): The precision used to store the data. Defaults to "float64". """ for key in trajContainers.keys(): if isTrajectoryGroup(trajContainers[key]): saponifyTrajectory( trajContainer=trajContainers[key], SOAPoutContainer=SOAPoutContainers, SOAPrcut=SOAPrcut, SOAPnmax=SOAPnmax, SOAPlmax=SOAPlmax, SOAPOutputChunkDim=SOAPOutputChunkDim, SOAPnJobs=SOAPnJobs, SOAPatomMask=SOAPatomMask, centersMask=centersMask, SOAP_respectPBC=SOAP_respectPBC, SOAPkwargs=SOAPkwargs, useSoapFrom=useSoapFrom, doOverride=doOverride, verbose=verbose, useType=useType, )
[docs]def saponifyTrajectory( trajContainer: "h5py.Group|h5py.File", SOAPoutContainer: "h5py.Group|h5py.File", SOAPrcut: float, SOAPnmax: int, SOAPlmax: int, SOAPOutputChunkDim: int = 100, SOAPnJobs: int = 1, SOAPatomMask: str = None, centersMask: Iterable = None, SOAP_respectPBC: bool = True, SOAPkwargs: dict = None, useSoapFrom: KNOWNSOAPENGINES = "dscribe", doOverride: bool = False, verbose: bool = True, useType="float64", ): """Calculates the SOAP fingerprints for each atom in a given hdf5 trajectory Works exaclty as :func:`saponifyMultipleTrajectories` except for that it calculates the fingerprints only for the passed trajectory group (see :func:`SOAPify.HDF5er.HDF5erUtils.isTrajectoryGroup`). `SOAPatomMask` and `centersMask` are mutually exclusive (see :func:`SOAPify.engine.getSoapEngine`) Args: trajFname (str): The name of the hdf5 file in wich the trajectory is stored trajectoryGroupPath (str): the path of the group that contains the trajectory in trajFname outputFname (str): the name of the hdf5 file that will contain the ouput or the SOAP analysis exportDatasetName (str): the name of the dataset that will contain the SOAP results, it will be saved in the group called "SOAP" SOAPOutputChunkDim (int, optional): The dimension of the chunck of data in the SOAP results dataset. Defaults to 100. SOAPnJobs (int, optional): the number of concurrent SOAP calculations (option passed to the desired SOAP engine). Defaults to 1. SOAPatomMask (str, optional): the symbols of the atoms whose SOAP fingerprint will be calculated (option passed to the desired SOAP engine). Defaults to None. SOAPrcut (float, optional): The cutoff for local region in angstroms. Should be bigger than 1 angstrom (option passed to the desired SOAP engine). Defaults to 8.0. SOAPnmax (int, optional): The number of radial basis functions (option passed to the desired SOAP engine). Defaults to 8. SOAPlmax (int, optional): The maximum degree of spherical harmonics (option passed to the desired SOAP engine). Defaults to 8. SOAP_respectPBC (bool, optional): Determines whether the system is considered to be periodic (option passed to the desired SOAP engine). Defaults to True. SOAPkwargs (dict, optional): additional keyword arguments to be passed to the SOAP engine. Defaults to {}. useSoapFrom (KNOWNSOAPENGINES, optional): This string determines the selected SOAP engine for the calculations. Defaults to "dscribe". doOverride (bool, optional): if False will raise and exception if the user ask to override an already existing DataSet. Defaults to False. verbose (bool, optional): regulates the verbosity of the step by step operations. Defaults to True. useType (str,optional): The precision used to store the data. Defaults to "float64". """ if isTrajectoryGroup(trajContainer): print(f'using "{useSoapFrom}" to calculate SOAP for "{trajContainer.name}"') print("extra SOAP arguments:", SOAPkwargs) symbols = trajContainer["Types"].asstr()[:] soapEngine = getSoapEngine( atomNames=symbols, SOAPrcut=SOAPrcut, SOAPnmax=SOAPnmax, SOAPlmax=SOAPlmax, SOAPatomMask=SOAPatomMask, centersMask=centersMask, SOAP_respectPBC=SOAP_respectPBC, SOAPkwargs=SOAPkwargs, useSoapFrom=useSoapFrom, ) exportDatasetName = trajContainer.name.split("/")[-1] _applySOAP( trajContainer, SOAPoutContainer, exportDatasetName, soapEngine, SOAPOutputChunkDim, SOAPnJobs, doOverride=doOverride, verbose=verbose, useType=useType, ) else: raise ValueError("saponify: The input object is not a trajectory group.")