LENS analysis

an example for a LENS analysis as in the original paper

extra requirements:

matplotlib
sklearn
scipy
seaborn

[1]:

import SOAPify.HDF5er as HDF5er
import SOAPify
import h5py
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from matplotlib.patches import Circle
from matplotlib.colors import ListedColormap, BoundaryNorm
from scipy.signal import savgol_filter
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from multiprocessing.pool import ThreadPool as Pool
from seaborn import kdeplot

cutoff = 2.88 * 1.1

Load data

Let’s get the data:

wget https://github.com/GMPavanLab/dynNP/releases/download/V1.0-trajectories/ico309.hdf5

We’ll start by caclulating the neighbours and the LENS parameters

using cutoff=2.88*1.1 that is 10% more than the Au radius

[2]:

wantedTrajectory = slice(0, None, 10)
trajFileName = "ico309.hdf5"
trajAddress = "/Trajectories/ico309-SV_18631-SL_31922-T_500"
with h5py.File(trajFileName, "r") as trajFile:
    tgroup = trajFile[trajAddress]
    universe = HDF5er.createUniverseFromSlice(tgroup, wantedTrajectory)

[3]:

nAtoms = len(universe.atoms)

[4]:

neigCounts = SOAPify.analysis.listNeighboursAlongTrajectory(universe, cutOff=cutoff)
LENS, nn, *_ = SOAPify.analysis.neighbourChangeInTime(neigCounts)

[5]:

fig, axes = plt.subplots(2, sharey=True)
for i in range(4):
    axes[0].plot(LENS[i], label=f"Atom {i}")
    axes[1].plot(LENS[nAtoms - 1 - i], label=f"Atom {nAtoms-1-i}")

for ax in axes:
    ax.legend()

[6]:

fig, axes = plt.subplots(2, sharey=True)
for i in range(4):
    axes[0].plot(nn[i], label=f"Atom {i}")
    axes[1].plot(nn[nAtoms - 1 - i], label=f"Atom {nAtoms-1-i}")

for ax in axes:
    ax.legend()

Filtering

Let’s chose an atom try to apply a filter on it. We want try to reduce the signal to noise ratio, so we calculate the mean of the s/n for all atoms:

[7]:

def signaltonoise(a: np.array, axis, ddof):
    """Given an array, retunrs its signal to noise value of its compontens"""
    m = a.mean(axis)
    sd = a.std(axis=axis, ddof=ddof)
    return 20 * np.log10(abs(np.where(sd == 0, 0, m / sd)))

[8]:

atom = nAtoms - 1
savGolPrint = np.arange(LENS[atom].shape[0])
windows = [10, 50, 100]
polyorders = [2, 4, 6]
fig, axes = plt.subplots(len(windows), sharey=True)

for ax, window_length in zip(axes, windows):
    windowToPlot = slice(window_length // 2, -window_length // 2)
    ax.plot(LENS[atom], label=f"Atom {atom}")
    for polyorder in polyorders:
        savgol = savgol_filter(
            LENS, window_length=window_length, polyorder=polyorder, axis=-1
        )

        sr_ratio = signaltonoise(savgol[:, windowToPlot], axis=-1, ddof=0)

        ax.plot(
            savGolPrint[windowToPlot],
            savgol[atom, windowToPlot],
            label=f"Filtered w={window_length}, p={polyorder} s/n={np.mean(sr_ratio):.2f}",
        )
    ax.legend(loc="upper left", bbox_to_anchor=(1, 1))

[9]:

windows = np.arange(10, 200, 10)
polyorders = [2, 4, 6]
fig, ax = plt.subplots(1)


for polyorder in polyorders:
    meansrRatios = np.empty((len(windows), 2))
    for i, window_length in enumerate(windows):
        windowToPlot = slice(window_length // 2, -window_length // 2)
        savgol = savgol_filter(
            LENS, window_length=window_length, polyorder=polyorder, axis=-1
        )
        sr_ratio = signaltonoise(savgol[:, windowToPlot], axis=-1, ddof=0)
        meansrRatios[i] = [window_length, np.mean(sr_ratio)]
    ax.plot(meansrRatios[:, 0], meansrRatios[:, 1], label=f"p={polyorder}")
ax.set_xlabel("Window Size", fontsize=20)
ax.set_ylabel("S/N [dB]", fontsize=20)

_ = ax.legend()

Like in the paper we will chose window_length=100 and polyorder=2, it as a 9.63 dB signal to noise ratio, that is quite accettable, and apply the filter to all of the LENS trajectories

[10]:

window_length = 100
polyorder = 2
windowToUSE = slice(window_length // 2, -window_length // 2)
filteredLENS = savgol_filter(
    LENS, window_length=window_length, polyorder=polyorder, axis=-1
)

We proceed with the clustering: we try to group the frames by their values using KMeans

silhouette_score is O(n^2), so we need a way to reduce its time

[11]:

def transposeAndFlatten(x, /):
    trans = np.transpose(x)
    trans_fl = np.reshape(trans, np.shape(trans)[0] * np.shape(trans)[1])
    return trans_fl


dataToFit = transposeAndFlatten(filteredLENS[:, windowToUSE])
print("minmax: ", np.min(dataToFit), np.max(dataToFit))

minmax:  -0.018211861305947333 0.6840905246350124

[12]:

dontSkipThisCell = False


# It is nont easy: to make Pool work in a interactive session: this cell should be
# executed as a standalone script
def createSilhouetteGraph():
    scores = np.zeros(10)

    def calculateScores(nclusters):
        clusters = KMeans(nclusters, random_state=12345).fit_predict(
            dataToFit.reshape(-1, 1)
        )
        score = silhouette_score(
            dataToFit.reshape(-1, 1),
            clusters,
            sample_size=np.sqrt(len(clusters)),
            random_state=12345,
        )
        print(nclusters, score)
        return score

    with Pool(10) as p:
        scores = p.map(calculateScores, list(range(2, 12)))

    fig, ax = plt.subplots(1, figsize=(5, 5), dpi=200)
    ax.plot(range(2, 12), scores)


if dontSkipThisCell:
    createSilhouetteGraph()

bestnclusters should be 2 in our case, but we still go with 4.

We store some of the informations from the clusters, we then proceed with some visualizations to understand if the number of cluster is ok

[13]:

bestnclusters = 4
# fixing the random state for reproducibility
clusters = KMeans(bestnclusters, random_state=12345).fit_predict(
    dataToFit.reshape(-1, 1)
)
data_KM = {}
for i in range(bestnclusters):
    data_KM[i] = {}
    data_KM[i]["elements"] = dataToFit[clusters == i]
    data_KM[i]["min"] = np.min(data_KM[i]["elements"])
    data_KM[i]["max"] = np.max(data_KM[i]["elements"])

[14]:

fig, axes = plt.subplots(
    1, 2, figsize=(6, 3), dpi=200, width_ratios=[3, 1.2], sharey=True
)

for flns in filteredLENS:
    axes[0].plot(
        savGolPrint[windowToUSE],
        flns[windowToUSE],
        color="k",
        linewidth=0.01,
        # alpha=0.9,
    )
hist = axes[1].hist(
    dataToFit,
    alpha=0.7,
    color="gray",
    bins="doane",
    density=True,
    orientation="horizontal",
)
kde = kdeplot(
    y=dataToFit,
    bw_adjust=1.5,
    linewidth=2,
    color="black",
    # gridsize=500,
    gridsize=2 * (hist[1].shape[0] - 1),
    ax=axes[1],
)
axes[0].set_title('LENS "trajectories"')
axes[0].set_xlim(0, len(savGolPrint))
axes[1].set_title("Density of\nLENS values")
height = np.max(hist[0])
for cname, data in data_KM.items():
    # center = delta+data['min']
    bin_size = data["max"] - data["min"]
    for i, ax in enumerate(axes):
        ax.barh(
            y=data["min"],
            width=len(savGolPrint) if i == 0 else height,
            height=bin_size,
            align="edge",
            alpha=0.25,
            # color=colors[idx_m],
            linewidth=2,
        )
        ax.axhline(y=data["min"], c="red", linewidth=2, linestyle=":")
        if i == 1:
            ax.text(
                -0.25,
                data["min"] + bin_size / 2.0,
                f"c{cname}",
                va="center",
                ha="right",
                fontsize=18,
            )

The idea is to play with the clustering until you are satisfied, then to visualize and try to give an interpretation (and then change agani the method of clustering until you are satisfied):

Visualization: ovito-compatible

Let’s define some helper functions and then export the data in a xyz that is compatible with ovito

[15]:

def prepareData(x, /):
    """prepares an array from shape (atom,frames) to  (frames,atom)"""
    shape = x.shape
    toret = np.empty((shape[1], shape[0]), dtype=x.dtype)
    for i, atomvalues in enumerate(x):
        toret[:, i] = atomvalues
    return toret


# classyfing by knoing the min/max of the clusters
def classifying(x, classDict):
    toret = np.ones_like(x, dtype=int) * (len(classDict) - 1)
    # todo: sort  by max and then classify
    minmax = [[cname, data["max"]] for cname, data in data_KM.items()]
    minmax = sorted(minmax, key=lambda x: -x[1])
    # print(minmax)
    for cname, myMax in minmax:
        toret[x < myMax] = int(cname)
    return toret


# classifying(filteredLENS,data_KM)
classifiedFilteredLENS = classifying(filteredLENS, data_KM)

[16]:

def export(wantedTrajectory):
    # as a function, so the ref universe should be garbage collected correctly
    with h5py.File(trajFileName, "r") as trajFile, open(
        "outClassifiedLENS.xyz", "w"
    ) as xyzFile:
        from MDAnalysis.transformations import fit_rot_trans

        tgroup = trajFile[trajAddress]
        ref = HDF5er.createUniverseFromSlice(tgroup, [0])
        nAt= len(ref.atoms)
        ref.add_TopologyAttr("mass", [1] * nAt)
        # load antohter univer to avoid conatmination in the main one
        exportuniverse = HDF5er.createUniverseFromSlice(tgroup, wantedTrajectory)
        exportuniverse.add_TopologyAttr("mass", [1] * nAt)
        exportuniverse.trajectory.add_transformations(fit_rot_trans(exportuniverse, ref))
        HDF5er.getXYZfromMDA(
            xyzFile,
            exportuniverse,
            # framesToExport=wantedTrajectory,
            allFramesProperty='Origin="-40 -40 -40"',
            LENS=prepareData(LENS),
            filteredLENS=prepareData(filteredLENS),
            proposedClassification=prepareData(classifiedFilteredLENS),
        )
        universe.trajectory


export(wantedTrajectory)

Let’s visualize a few LENS trajectories colored by the cluster in time

Time analysis and visualizations

Let’s calculate the transition matrix for this LENS analysis

[17]:

minmax = [[cname, data["max"]] for cname, data in data_KM.items()]
minmax = sorted(minmax, key=lambda x: x[1])
cmap = ListedColormap([f"C{m[0]}" for m in minmax])
norm = BoundaryNorm([0] + [m[1] for m in minmax], cmap.N)

[18]:

from SOAPify import SOAPclassification, calculateTransitionMatrix, normalizeMatrixByRow
from seaborn import heatmap

classifications = SOAPclassification(
    [], prepareData(classifiedFilteredLENS), [f"C{m[0]}" for m in minmax]
)
tmat = calculateTransitionMatrix(classifications)
tmat = normalizeMatrixByRow(tmat)
_, ax = plt.subplots(1)
heatmap(
    tmat,
    vmax=1,
    vmin=0,
    cmap="rocket_r",
    annot=True,
    xticklabels=classifications.legend,
    yticklabels=classifications.legend,
    ax=ax,
)

for i, label in enumerate(classifications.legend):
    ax.add_patch(
        Circle(
            (0.5 + i, len(classifications.legend) + 0.25),
            facecolor=label,
            lw=1.5,
            edgecolor="black",
            radius=0.2,
            clip_on=False,
        )
    )
    ax.add_patch(
        Circle(
            (-0.25, 0.5 + i),
            facecolor=label,
            lw=1.5,
            edgecolor="black",
            radius=0.2,
            clip_on=False,
        )
    )

[19]:

chosenAtoms=[0, nAtoms //3,nAtoms//2,int(nAtoms//3)*2, nAtoms - 1]
fig, axes = plt.subplots(len(chosenAtoms),figsize=(8,1.5*len(chosenAtoms)), sharey=True, sharex=True)

for atom, ax in zip(chosenAtoms, axes):
    points = np.array([savGolPrint, filteredLENS[atom]]).T.reshape(-1, 1, 2)
    segments = np.concatenate([points[:-1], points[1:]], axis=1)
    lc = LineCollection(segments, cmap=cmap, norm=norm)
    lc.set_array(filteredLENS[atom])

    line = ax.add_collection(lc)
    # ax.scatter(savGolPrint,filteredLENS[atom],c=classifiedFilteredLENS[atom])
    ax.set_title(f"Atom {atom}")

axes[0].set_xlim(savGolPrint[0] - 1, savGolPrint[-1] + 1)
_ = axes[0].set_ylim(-0.02, np.max(filteredLENS) + 0.02)