timeSOAP analysis

an example for a timeSOAP analysis as in the original paper

[1]:
import SOAPify.HDF5er as HDF5er
from SOAPify import (
    saponifyTrajectory,
    fillSOAPVectorFromdscribe,
    normalizeArray,
    getSOAPSettings,
)
from SOAPify.analysis import timeSOAPsimple
import h5py
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from matplotlib.patches import Circle
from matplotlib.colors import ListedColormap, BoundaryNorm
from scipy.signal import savgol_filter
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from multiprocessing.pool import ThreadPool as Pool
from seaborn import kdeplot

SOAPrcut = 4.48023312
SOAPnmax = 8
SOAPlmax = 8

Load data

Let’s get the data:

wget https://github.com/GMPavanLab/dynNP/releases/download/V1.0-trajectories/ico309.hdf5

We’ll start by caclulating the SOAP fingerprints of the 500 K simulation

using lMax=8, nMax=8, and cutoff=4.48023312 that is 10% more than the Au cell

[2]:
trajFileName = "ico309.hdf5"
trajAddress = "ico309-SV_18631-SL_31922-T_500"

[3]:
def prepareSOAP(trajFileName, trajAddress):
    soapFileName = trajFileName.split(".")[0] + "soap.hdf5"
    print(trajFileName, soapFileName)
    with h5py.File(trajFileName, "r") as workFile, h5py.File(
        soapFileName, "a"
    ) as soapFile:
        soapFile.require_group("SOAP")
        # skips if the soap trajectory is already present
        if trajAddress not in soapFile["SOAP"]:
            saponifyTrajectory(
                trajContainer=workFile[f"Trajectories/{trajAddress}"],
                SOAPoutContainer=soapFile["SOAP"],
                SOAPOutputChunkDim=1000,
                SOAPnJobs=32,
                SOAPrcut=SOAPrcut,
                SOAPnmax=SOAPnmax,
                SOAPlmax=SOAPlmax,
            )
    return soapFileName


soapFileName = prepareSOAP(trajFileName, trajAddress)

ico309.hdf5 ico309soap.hdf5

Let’s get the raw tempSOAP array without clogging up the memory

There is an improved version of the action in shis cell in the analysis submodule: analysis.getTimeSOAPSimple

[4]:
def getTimeSOAP(soapFileName, trajAddress):
    with h5py.File(soapFileName, "r") as f:
        ds = f[f"/SOAP/{trajAddress}"]
        fillSettings = getSOAPSettings(ds)
        print(fillSettings)
        print(ds.shape)
        nAt = ds.shape[1]

        timedSOAP = np.zeros((ds.shape[0] - 1, ds.shape[1]))

        print(timedSOAP.shape)
        slide = 0
        # this looks a lot convoluted, but it is way faster than working one atom
        # at a time
        for c in ds.iter_chunks():
            theSlice = slice(c[0].start - slide, c[0].stop, c[0].step)
            outSlice = slice(c[0].start - slide, c[0].stop - 1, c[0].step)
            print(c[0], theSlice, outSlice)
            timedSOAP[outSlice], _ = timeSOAPsimple(
                normalizeArray(fillSOAPVectorFromdscribe(ds[theSlice], **fillSettings))
            )
            slide = 1

        return nAt, timedSOAP, np.diff(timedSOAP.T, axis=-1)


nAtoms, tSOAP, dtSOAP = getTimeSOAP(soapFileName, trajAddress)

{'nMax': 8, 'lMax': 8, 'atomTypes': array(['Au'], dtype=object), 'atomicSlices': {'AuAu': slice(0, 324, None)}}
(20000, 309, 324)
(19999, 309)
slice(0, 1000, 1) slice(0, 1000, 1) slice(0, 999, 1)
slice(1000, 2000, 1) slice(999, 2000, 1) slice(999, 1999, 1)
slice(2000, 3000, 1) slice(1999, 3000, 1) slice(1999, 2999, 1)
slice(3000, 4000, 1) slice(2999, 4000, 1) slice(2999, 3999, 1)
slice(4000, 5000, 1) slice(3999, 5000, 1) slice(3999, 4999, 1)
slice(5000, 6000, 1) slice(4999, 6000, 1) slice(4999, 5999, 1)
slice(6000, 7000, 1) slice(5999, 7000, 1) slice(5999, 6999, 1)
slice(7000, 8000, 1) slice(6999, 8000, 1) slice(6999, 7999, 1)
slice(8000, 9000, 1) slice(7999, 9000, 1) slice(7999, 8999, 1)
slice(9000, 10000, 1) slice(8999, 10000, 1) slice(8999, 9999, 1)
slice(10000, 11000, 1) slice(9999, 11000, 1) slice(9999, 10999, 1)
slice(11000, 12000, 1) slice(10999, 12000, 1) slice(10999, 11999, 1)
slice(12000, 13000, 1) slice(11999, 13000, 1) slice(11999, 12999, 1)
slice(13000, 14000, 1) slice(12999, 14000, 1) slice(12999, 13999, 1)
slice(14000, 15000, 1) slice(13999, 15000, 1) slice(13999, 14999, 1)
slice(15000, 16000, 1) slice(14999, 16000, 1) slice(14999, 15999, 1)
slice(16000, 17000, 1) slice(15999, 17000, 1) slice(15999, 16999, 1)
slice(17000, 18000, 1) slice(16999, 18000, 1) slice(16999, 17999, 1)
slice(18000, 19000, 1) slice(17999, 19000, 1) slice(17999, 18999, 1)
slice(19000, 20000, 1) slice(18999, 20000, 1) slice(18999, 19999, 1)
[5]:
print(tSOAP.shape)
print(dtSOAP.shape)

(19999, 309)
(309, 19998)
[6]:
fig, axes = plt.subplots(2, sharey=True)
for i in range(4):
    axes[0].plot(tSOAP[:, i], label=f"Atom {i}")
    axes[1].plot(tSOAP[:, nAtoms - 1 - i], label=f"Atom {nAtoms-1-i}")

for ax in axes:
    ax.legend()

_images/timeSOAP_8_0.png
[7]:
fig, axes = plt.subplots(2, sharey=True)
for i in range(4):
    axes[0].plot(dtSOAP[i], label=f"Atom {i}")
    axes[1].plot(dtSOAP[308 - i], label=f"Atom {308-i}")

for ax in axes:
    ax.legend()

_images/timeSOAP_9_0.png

Filtering

Let’s chose an atom try to apply a filter on it. We want try to reduce the signal to noise ratio, so we calculate the mean of the s/n for all atoms:

[8]:
def signaltonoise(a: np.array, axis, ddof):
    """Given an array, retunrs its signal to noise value of its compontens"""
    m = a.mean(axis)
    sd = a.std(axis=axis, ddof=ddof)
    return 20 * np.log10(abs(np.where(sd == 0, 0, m / sd)))

Here we show the effect of the savitszky-golay filtering on the last atom. In the legend we show the mean (on all the atoms) signal to noise ration (s/n).

[9]:
atom = nAtoms - 1
savGolPrint = np.arange(tSOAP.T[atom].shape[0])
windows = [100, 200, 360]
polyorders = [2, 4, 6]
fig, axes = plt.subplots(len(windows), sharey=True)

for ax, window_length in zip(axes, windows):
    windowToPlot = slice(window_length // 2, -window_length // 2)
    ax.plot(tSOAP.T[atom], label=f"Atom {atom}")
    for polyorder in polyorders:
        savgol = savgol_filter(
            tSOAP.T, window_length=window_length, polyorder=polyorder, axis=-1
        )

        sr_ratio = signaltonoise(savgol[:, windowToPlot], axis=-1, ddof=0)

        ax.plot(
            savGolPrint[windowToPlot],
            savgol[atom, windowToPlot],
            label=f"Filtered w={window_length}, p={polyorder} s/n={np.mean(sr_ratio):.2f}",
        )
    ax.legend(loc="upper left", bbox_to_anchor=(1, 1))

_images/timeSOAP_13_0.png

Here we show the evolution of the mean s/n ratio with the change in the parameters of the savitszky-golay for these signals

[10]:
windows = np.arange(10, 600, 20)
polyorders = [2, 4, 6]
fig, ax = plt.subplots(1)


for polyorder in polyorders:
    meansrRatios = np.empty((len(windows), 2))
    for i, window_length in enumerate(windows):
        windowToPlot = slice(window_length // 2, -window_length // 2)
        savgol = savgol_filter(
            tSOAP.T, window_length=window_length, polyorder=polyorder, axis=-1
        )
        sr_ratio = signaltonoise(savgol[:, windowToPlot], axis=-1, ddof=0)
        meansrRatios[i] = [window_length, np.mean(sr_ratio)]
    ax.plot(meansrRatios[:, 0], meansrRatios[:, 1], label=f"p={polyorder}")
ax.set_xlabel("Window Size", fontsize=20)
ax.set_ylabel("S/N [dB]", fontsize=20)

_ = ax.legend()

_images/timeSOAP_15_0.png

We proceed with window_length=360 and polyorder=2, it as a 8.99 dB signal to noise ratio, that is quite accettable,

with these parameters we have a good compromise between the window and the s/n ration

[11]:
window_length = 360
polyorder = 2
windowToUSE = slice(window_length // 2, -window_length // 2)
filteredtSOAP = savgol_filter(
    tSOAP.T, window_length=window_length, polyorder=polyorder, axis=-1
)

We chose to execute a K-Means clustering oh the values of the signals, using 5 as the number of clusters

[12]:
bestnclusters = 5
clusters = KMeans(bestnclusters, random_state=12345).fit_predict(
    filteredtSOAP.reshape(-1, 1)
)
data_KM = {}
for i in range(bestnclusters):
    data_KM[i] = {}
    data_KM[i]["elements"] = filteredtSOAP.reshape((-1))[clusters == i]
    data_KM[i]["min"] = np.min(data_KM[i]["elements"])
    data_KM[i]["max"] = np.max(data_KM[i]["elements"])

[13]:
fig, axes = plt.subplots(
    1, 2, figsize=(6, 3), dpi=200, width_ratios=[3, 1.2], sharey=True
)

for flns in filteredtSOAP:
    axes[0].plot(
        savGolPrint[windowToUSE],
        flns[windowToUSE],
        color="k",
        linewidth=0.01,
        # alpha=0.9,
    )
hist = axes[1].hist(
    filteredtSOAP.reshape((-1)),
    alpha=0.7,
    color="gray",
    bins="doane",
    density=True,
    orientation="horizontal",
)
kde = kdeplot(
    y=filteredtSOAP.reshape((-1)),
    bw_adjust=1.5,
    linewidth=2,
    color="black",
    # gridsize=500,
    gridsize=2 * (hist[1].shape[0] - 1),
    ax=axes[1],
)
axes[0].set_title('tSOAP "trajectories"')
axes[0].set_xlim(0, len(savGolPrint))
axes[1].set_title("Density of\ntSOAP values")
height = np.max(hist[0])
for cname, data in data_KM.items():
    # center = delta+data['min']
    bin_size = data["max"] - data["min"]
    for i, ax in enumerate(axes):
        ax.barh(
            y=data["min"],
            width=len(savGolPrint) if i == 0 else height,
            height=bin_size,
            align="edge",
            alpha=0.25,
            # color=colors[idx_m],
            linewidth=2,
        )
        ax.axhline(y=data["min"], c="red", linewidth=2, linestyle=":")
        if i == 1:
            ax.text(
                -0.25,
                data["min"] + bin_size / 2.0,
                f"c{cname}",
                va="center",
                ha="right",
                fontsize=18,
            )

_images/timeSOAP_20_0.png

The idea is to play with the clustering until you are satisfied, then to visualize and try to give an interpretation (and then change agani the method of clustering until you are satisfied):

Visualization: ovito-compatible

Let’s define some helper functions and then export the data in a xyz that is compatible with ovito

[14]:
def prepareData(x, /):
    """prepares an array from shape (atom,frames) to  (frames,atom)"""
    shape = x.shape
    toret = np.empty((shape[1], shape[0]), dtype=x.dtype)
    for i, atomvalues in enumerate(x):
        toret[:, i] = atomvalues
    return toret


# classyfing by knoing the min/max of the clusters
def classifying(x, classDict):
    toret = np.ones_like(x, dtype=int) * (len(classDict) - 1)
    # todo: sort  by max and then classify
    minmax = [[cname, data["max"]] for cname, data in data_KM.items()]
    minmax = sorted(minmax, key=lambda x: -x[1])
    # print(minmax)
    for cname, myMax in minmax:
        toret[x < myMax] = int(cname)
    return toret


classifiedFilteredtSOAP = classifying(filteredtSOAP, data_KM)

[15]:
def export(trajFileName, trajAddress):
    # as a function, so the universe should be garbage collected correctly
    with h5py.File(trajFileName, "r") as trajFile, open(
        "outClassifiedTimeSOAP.xyz", "w"
    ) as xyzFile:
        from MDAnalysis.transformations import fit_rot_trans

        wantedTrajectory = slice(-1)

        tgroup = trajFile[f"/Trajectories/{trajAddress}"]
        universe = HDF5er.createUniverseFromSlice(tgroup, wantedTrajectory)
        nAt= len(universe.atoms)
        universe.add_TopologyAttr("mass", [1] * nAt)
        ref = HDF5er.createUniverseFromSlice(tgroup, [0])
        ref.add_TopologyAttr("mass", [1] * nAt)
        universe.trajectory.add_transformations(fit_rot_trans(universe, ref))

        HDF5er.getXYZfromMDA(
            xyzFile,
            universe,
            allFramesProperty='Origin="-40 -40 -40"',
            tSOAP=tSOAP,
            filteredtSOAP=prepareData(filteredtSOAP),
            proposedClassification=prepareData(classifiedFilteredtSOAP),
        )


export(trajFileName, trajAddress)

Let’s visualize a few tSOAP trajectories colored by the cluster in time

Time analysis and visualizations

Let’s calculate the transition matrix for this tSOAP analysis

[16]:
minmax = [[cname, data["max"]] for cname, data in data_KM.items()]
minmax = sorted(minmax, key=lambda x: x[1])
cmap = ListedColormap([f"C{m[0]}" for m in minmax])
norm = BoundaryNorm([0] + [m[1] for m in minmax], cmap.N)

[17]:
from SOAPify import SOAPclassification, calculateTransitionMatrix, normalizeMatrixByRow
from seaborn import heatmap

classifications = SOAPclassification(
    [], prepareData(classifiedFilteredtSOAP), [f"C{m[0]}" for m in minmax]
)
tmat = calculateTransitionMatrix(classifications)
tmat = normalizeMatrixByRow(tmat)
_, ax = plt.subplots(1)
heatmap(
    tmat,
    vmax=1,
    vmin=0,
    cmap="rocket_r",
    annot=True,
    xticklabels=classifications.legend,
    yticklabels=classifications.legend,
    ax=ax,
)

for i, label in enumerate(classifications.legend):
    ax.add_patch(
        Circle(
            (0.5 + i, len(classifications.legend) + 0.25),
            facecolor=label,
            lw=1.5,
            edgecolor="black",
            radius=0.2,
            clip_on=False,
        )
    )
    ax.add_patch(
        Circle(
            (-0.25, 0.5 + i),
            facecolor=label,
            lw=1.5,
            edgecolor="black",
            radius=0.2,
            clip_on=False,
        )
    )

_images/timeSOAP_27_0.png
[18]:
chosenAtoms=[0, nAtoms //3,nAtoms//2,int(nAtoms//3)*2, nAtoms - 1]
fig, axes = plt.subplots(len(chosenAtoms),figsize=(8,1.5*len(chosenAtoms)), sharey=True, sharex=True)

for atom, ax in zip(chosenAtoms, axes):
    points = np.array([savGolPrint, filteredtSOAP[atom]]).T.reshape(-1, 1, 2)
    segments = np.concatenate([points[:-1], points[1:]], axis=1)
    lc = LineCollection(segments, cmap=cmap, norm=norm)
    lc.set_array(filteredtSOAP[atom])

    line = ax.add_collection(lc)
    ax.set_title(f"Atom {atom}")

axes[0].set_xlim(savGolPrint[0] - 1, savGolPrint[-1] + 1)
_ = axes[0].set_ylim(-0.02, np.max(filteredtSOAP) + 0.02)

_images/timeSOAP_28_0.png