Source code for SOAPify.HDF5er.HDF5To

"""This submodule gives the user some function to extract data from the hdf5 files"""
from typing import IO, List
import re
import MDAnalysis
import h5py
from ase import Atoms as aseAtoms
import numpy
from MDAnalysis.lib.mdamath import triclinic_vectors

__all__ = [
    "getXYZfromTrajGroup",
    "saveXYZfromTrajGroup",
    "HDF52AseAtomsChunckedwithSymbols",
    "getXYZfromMDA",
    "createUniverseFromSlice",
]


# TODO: using slices is not the best compromise here
# TODO: maybe it is better to make this and iterator/generator
[docs]def HDF52AseAtomsChunckedwithSymbols(
    groupTraj: h5py.Group,
    chunkTraj: "tuple[slice]",
    chunkBox: "tuple[slice]",
    symbols: "list[str]",
) -> "list[aseAtoms]":
    """generates an ase trajectory from an hdf5 trajectory

    Args:
        groupTraj (h5py.Group):
            the group within the hdf5 file where the trajectroy is stored
        chunkTraj (tuple[slice]):
            the list of the chunks of the trajectory within the given group
        chunkBox (tuple[slice]):
            the list of the chunks of the frame boxes within the given group
        symbols (list[str]):
            the list of the name of the atoms


    Returns:
        list[ase.Atoms]: the trajectory stored in the given group
    """
    atoms = []

    for frame, box in zip(
        groupTraj["Trajectory"][chunkTraj], groupTraj["Box"][chunkBox]
    ):
        # theBox = [[box[0], 0, 0], [0, box[1], 0], [0, 0, box[2]]]
        # celldisp = -box[0:3] / 2
        atoms.append(
            aseAtoms(
                symbols=symbols,
                positions=frame,
                cell=box,
                pbc=True,
                # celldisp=celldisp,
            )
        )
    return atoms


def __prepareHeaders(
    additionalColumns: dict,
    nframes: int,
    nat: int,
    allFramesProperty: str = None,
    perFrameProperties: "list[str]" = None,
) -> str:
    """Generates the static part of the header and perform sanity checks on optional data

    Args:
        additionalColumns (dict):
            the dictionary of the values of the additional columns
        nframes (int):
            the total number of frame, used for sanity checks
        nat (int):
            the total number of atoms, used for sanity checks (n is fixed)
        allFramesProperty (str, optional):
            the property common to all frames.
            Defaults to None.
        perFrameProperties (list[str], optional):
            list of properties that changes frame per frame.
            Defaults to None.

    Raises:
        ValueError:
            if additionalColumns is ill-formed
        ValueError:
            if perFrameProperties is ill-formed

    Returns:
        str: the static part of the header
    """
    additional = ""
    for key in additionalColumns:
        shapeOfData = additionalColumns[key].shape
        dim = shapeOfData[2] if len(shapeOfData) == 3 else 1
        if (  # wrong shape of the array
            (len(shapeOfData) != 2 and len(shapeOfData) != 3)
            # different data from number of frames
            or shapeOfData[0] != nframes
            # wrong number of atoms
            or shapeOfData[1] != nat
        ):
            raise ValueError(
                'Extra data passed to "getXYZfromTrajGroup" do not has the right dimensions'
                + f"\n(Trajectory shape:{(nframes,nat)}, data {key} shape:{shapeOfData})"
            )
        additional += ":" + key + ":R:" + str(dim)
    if perFrameProperties is not None:
        if len(perFrameProperties) != nframes:
            raise ValueError(
                "perFrameProperties do not have the same lenght of the trajectory"
            )
    return f"{nat}\nProperties=species:S:1:pos:R:3{additional} {allFramesProperty}"


[docs]def getXYZfromTrajGroup(
    filelike: IO,
    group: h5py.Group,
    framesToExport: "List | slice" = slice(None),
    allFramesProperty: str = "",
    perFrameProperties: "list[str]" = None,
    **additionalColumns,
) -> None:
    """generate an xyz-file in a IO object from a trajectory group in an hdf5

        the additionalColumns arguments are added as extra columns to the file,
        they must be numpy.ndarray with shape (nofFrames,NofAtoms) for 1D data
        or (nofFrames,NofAtoms,Nvalues) for multidimensional data
        this will add one or more columns to the xyz file, named after the keyword
        argument

    Args:
        filelike (IO):
            the IO destination, can be a file
        group (h5py.Group):
            the trajectory group
        frames (List or slice or None, optional):
            the frames to export. Defaults to None.
        allFramesProperty (str, optional):
            A comment string that will be present in all of the frames.
            Defaults to "".
        perFrameProperties (list[str], optional):
            A list of comment.
            Defaults to None.
        additionalColumns():
            the additional columns to add to the file: each new keyword arguments
            will add a column to the xyz file
    """

    atomtypes = group["Types"].asstr()
    boxes: h5py.Dataset = group["Box"][framesToExport]
    coordData: h5py.Dataset = group["Trajectory"][framesToExport]

    trajlen: int = coordData.shape[0]
    nat: int = coordData.shape[1]

    header: str = __prepareHeaders(
        additionalColumns, nframes=trajlen, nat=nat, allFramesProperty=allFramesProperty
    )

    for frameIndex in range(trajlen):
        coord = coordData[frameIndex, :]
        data = __writeAframe(
            header,
            nat,
            atomtypes,
            coord,
            boxes[frameIndex],
            perFrameProperties[frameIndex] if perFrameProperties is not None else None,
            # this may create a bottleneck
            **{k: additionalColumns[k][frameIndex] for k in additionalColumns},
        )
        filelike.write(data)


# TODO: it is slow with huge files
[docs]def saveXYZfromTrajGroup(
    filename: str,
    group: h5py.Group,
    framesToExport: "List | slice" = slice(None),
    allFramesProperty: str = "",
    perFrameProperties: "list[str]" = None,
    **additionalColumns,
) -> None:
    """Saves "filename" as an xyz file

    see saveXYZfromTrajGroup this calls getXYZfromTrajGroup and treats the inputs in the same way

    Args:
        filename (str):
            name of the file
        group (h5py.Group):
            the trajectory group
        additionalColumsn():
            the additional columns to add to the file
        allFramesProperty (str, optional):
            A comment string that will be present in all of the frames.
            Defaults to "".
        perFrameProperties (list[str], optional):
            A list of comment.
            Defaults to None.
    """
    with open(filename, "w") as file:
        getXYZfromTrajGroup(
            file,
            group,
            framesToExport,
            allFramesProperty,
            perFrameProperties,
            **additionalColumns,
        )


[docs]def getXYZfromMDA(
    filelike: IO,
    trajToExport: "MDAnalysis.Universe | MDAnalysis.AtomGroup",
    framesToExport: "List | slice" = slice(None),
    allFramesProperty: str = "",
    perFrameProperties: "list[str]" = None,
    **additionalColumns,
) -> None:
    """generate an xyz-file in a IO object from an MDA trajectory

        the additionalColumns arguments are added as extra columns to the file,
        they must be numpy.ndarray with shape (nofFrames,NofAtoms) for 1D data
        or (nofFrames,NofAtoms,Nvalues) for multidimensional data
        this will add one or more columns to the xyz file

    Args:
        filelike (IO):
            the IO destination, can be a file
        group (MDAnalysis.Universe | MDAnalysis.AtomGroup):
            the universe or the selection of atoms to export
        frames (List or slice or None, optional):
            the frames to export.
            Defaults to None.
        allFramesProperty (str, optional):
            A comment string that will be present in all of the frames.
            Defaults to "".
        perFrameProperties (list[str], optional):
            A list of comment.
            Defaults to None.
        additionalColumns():
            the additional columns to add to the file: each new keyword arguments
            will add a column to the xyz file
    """

    atoms = trajToExport.atoms
    universe = trajToExport.universe
    atomtypes = atoms.types
    coordData: "MDAnalysis.Universe | MDAnalysis.AtomGroup" = universe.trajectory[
        framesToExport
    ]

    trajlen: int = len(coordData)
    nat: int = len(atoms)

    header: str = __prepareHeaders(
        additionalColumns, nframes=trajlen, nat=nat, allFramesProperty=allFramesProperty
    )
    for frameIndex, _ in enumerate(universe.trajectory[framesToExport]):
        coord = atoms.positions
        data = __writeAframe(
            header,
            nat,
            atomtypes,
            coord,
            universe.dimensions,
            perFrameProperties[frameIndex] if perFrameProperties is not None else None,
            # this may create a bottleneck
            **{k: additionalColumns[k][frameIndex] for k in additionalColumns},
        )
        filelike.write(data)


def __writeAframe(
    header,
    nat,
    atomtypes,
    coord,
    boxDimensions,
    perFrameProperty: str = None,
    **additionalColumns,
) -> str:
    data = f"{header}"
    data += f" {perFrameProperty}" if perFrameProperty is not None else ""
    theBox = triclinic_vectors(boxDimensions)
    data += f' Lattice="{theBox[0][0]} {theBox[0][1]} {theBox[0][2]} '
    data += f"{theBox[1][0]} {theBox[1][1]} {theBox[1][2]} "
    data += f'{theBox[2][0]} {theBox[2][1]} {theBox[2][2]}"'
    data += "\n"

    for atomID in range(nat):
        data += (
            f"{atomtypes[atomID]} {coord[atomID,0]} {coord[atomID,1]} {coord[atomID,2]}"
        )
        for key in additionalColumns:
            # this removes the brackets from the data if the dimensions are >1
            data += " " + re.sub("( \[|\[|\])", "", str(additionalColumns[key][atomID]))
        data += "\n"
    return data


[docs]def createUniverseFromSlice(
    trajectoryGroup: h5py.Group, useSlice=slice(None)
) -> MDAnalysis.Universe:
    """Creates a MDanalysis.Universe from a trajectory group

    Args:
        trajectoryGroup (h5py.Group):
            the given trajectory group
        useSlice (_type_, optional):
            the asked slice from wich create an universe.
            Defaults to slice(None).

    Returns:
        MDAnalysis.Universe:
            an universe containing the wnated part of the trajectory
    """
    # TODO: also add a slice for the atoms
    traj = trajectoryGroup["Trajectory"]
    box = trajectoryGroup["Box"]
    atomNames = trajectoryGroup["Types"].asstr()
    nAt = traj.shape[1]
    # TODO add names
    toRet = MDAnalysis.Universe.empty(
        n_atoms=nAt,
        n_residues=nAt,
        n_segments=1,
        atom_resindex=numpy.arange(nAt),
        residue_segindex=[1] * nAt,
        trajectory=True,
    )
    toRet.add_TopologyAttr("type", atomNames)
    toRet.load_new(
        traj[useSlice],
        format=MDAnalysis.coordinates.memory.MemoryReader,
        dimensions=box[useSlice],
    )

    return toRet