Source code for ai4materials.descriptors.prdf

# coding=utf-8
# Copyright 2016-2018 Angelo Ziletti
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

__author__ = "Angelo Ziletti"
__copyright__ = "Copyright 2018, Angelo Ziletti"
__maintainer__ = "Angelo Ziletti"
__email__ = "ziletti@fhi-berlin.mpg.de"
__date__ = "23/09/18"

from ase.data import chemical_symbols
from future.utils import viewitems
import logging
import math
from ai4materials.descriptors.base_descriptor import Descriptor
from ai4materials.descriptors.base_descriptor import is_descriptor_consistent
import numpy as np
import os
import scipy.sparse
logger = logging.getLogger('ai4materials')


[docs]class PRDF(Descriptor):
    """Compute the partial radial distribution of a given crystal structure.

    Cell vectors v1,v2,v3 with values in the columns: [[v1x,v2x,v3x],[v1y,v2y,v3x],[v1z,v2z,v3z]]

    Parameters:

    cutoff_radius: float, optional (default=20)
        Atoms within a sphere of cut-off radius (in Angstrom) are considered.

    rdf_only: bool, optional (defaults=`False`)
        If `False` calculates partial radial distribution function.
        If `True` calculates radial distribution function (all atom types are considered as the same)

    .. codeauthor:: Fawzi Mohamed <mohamed@fhi-berlin.mpg.de> and Angelo Ziletti <angelo.ziletti@gmail.com>

    """

    def __init__(self, configs=None, cutoff_radius=20, rdf_only=False):
        super(PRDF, self).__init__(configs=configs)

        self.cutoff_radius = cutoff_radius
        self.rdf_only = rdf_only

        if not self.rdf_only:
            logger.info("Calculating partial radial distribution function.")
        else:
            logger.info("Calculating radial distribution function; all atomic species are considered as the same.")

[docs]    def calculate(self, structure, **kwargs):
        """Calculate the descriptor for the given ASE structure.

        Parameters:

        structure: `ase.Atoms` object
            Atomic structure.

        .. codeauthor:: Fawzi Mohamed <mohamed@fhi-berlin.mpg.de>

        """

        if not np.all(structure.pbc):
            raise NotImplementedError("At present, the partial radial distribution function is implemented only for "
                                      "three-dimensional periodic structures.")
        else:
            cell = structure.get_cell()
            cutoff_radius2 = self.cutoff_radius * self.cutoff_radius

            radD = {}
            for ii, a1 in enumerate(structure):
                for jj, a2 in enumerate(structure[ii:]):

                    # write atomic numbers if prdf, otherwise set them to zero
                    if not self.rdf_only:
                        n1 = a1.number
                        n2 = a2.number
                    else:
                        n1 = 0
                        n2 = 0

                    label = "%d_%d" % (min(n1, n2), max(n1, n2))
                    if label not in radD:
                        radD[label] = {"particle_atom_number_1": n1, "particle_atom_number_2": n2, "arr": []}
                    arr = radD[label]["arr"]
                    r0 = np.array([a1.x - a2.x, a1.y - a2.y, a1.z - a2.z])
                    r02 = np.dot(r0, r0)
                    m = np.dot(cell.transpose(), cell)
                    l = np.dot(r0, cell)
                    r = np.dot(r0, r0) - cutoff_radius2
                    mii = m[0, 0]
                    mij = m[0, 1]
                    mik = m[0, 2]
                    mjj = m[1, 1]
                    mjk = m[1, 2]
                    mkk = m[2, 2]
                    li = l[0]
                    lj = l[1]
                    lk = l[2]

                    c = (
                        mjj ** 2 * mkk ** 3 * r - 2 * mjj * mjk ** 2 * mkk ** 2 * r + mjk ** 4 * mkk * r - lj ** 2 * mjj * mkk ** 3 + lj ** 2 * mjk ** 2 * mkk ** 2 + 2 * lj * lk * mjj * mjk * mkk ** 2 - lk ** 2 * mjj ** 2 * mkk ** 2 - 2 * lj * lk * mjk ** 3 * mkk + lk ** 2 * mjj * mjk ** 2 * mkk)

                    a = (
                        mii * mjj ** 2 * mkk ** 3 - mij ** 2 * mjj * mkk ** 3 - 2 * mii * mjj * mjk ** 2 * mkk ** 2 + mij ** 2 * mjk ** 2 * mkk ** 2 + 2 * mij * mik * mjj * mjk * mkk ** 2 - mik ** 2 * mjj ** 2 * mkk ** 2 + mii * mjk ** 4 * mkk - 2 * mij * mik * mjk ** 3 * mkk + mik ** 2 * mjj * mjk ** 2 * mkk)

                    b = (
                        2 * li * mjj ** 2 * mkk ** 3 - 2 * lj * mij * mjj * mkk ** 3 - 4 * li * mjj * mjk ** 2 * mkk ** 2 + 2 * lj * mij * mjk ** 2 * mkk ** 2 + 2 * lj * mik * mjj * mjk * mkk ** 2 + 2 * lk * mij * mjj * mjk * mkk ** 2 - 2 * lk * mik * mjj ** 2 * mkk ** 2 + 2 * li * mjk ** 4 * mkk - 2 * lj * mik * mjk ** 3 * mkk - 2 * lk * mij * mjk ** 3 * mkk + 2 * lk * mik * mjj * mjk ** 2 * mkk)

                    delta = b * b - 4 * a * c

                    if a == 0 or delta < 0:
                        continue
                    sDelta = math.sqrt(delta)
                    imin = int(math.ceil((-b - sDelta) / (2 * a)))
                    imax = int(math.floor((-b + sDelta) / (2 * a)))
                    for i in range(imin, imax + 1):
                        cj = (
                            mkk * r + i ** 2 * mii * mkk + 2 * i * li * mkk - i ** 2 * mik ** 2 - 2 * i * lk * mik - lk ** 2)
                        aj = (mjj * mkk - mjk ** 2)
                        bj = (2 * i * mij * mkk + 2 * lj * mkk - 2 * i * mik * mjk - 2 * lk * mjk)
                        deltaj = bj * bj - 4 * aj * cj
                        if aj == 0 or deltaj < 0:
                            continue
                        sDeltaj = math.sqrt(deltaj)
                        jmin = int(math.ceil((-bj - sDeltaj) / (2 * aj)))
                        jmax = int(math.floor((-bj + sDeltaj) / (2 * aj)))
                        for j in range(jmin, jmax + 1):
                            ck = r + j ** 2 * mjj + 2 * i * j * mij + i ** 2 * mii + 2 * j * lj + 2 * i * li
                            ak = mkk
                            bk = (2 * j * mjk + 2 * i * mik + 2 * lk)
                            deltak = bk * bk - 4 * ak * ck
                            if ak == 0 or deltak < 0:
                                continue
                            sDeltak = math.sqrt(deltak)
                            kmin = int(math.ceil((-bk - sDeltak) / (2 * ak)))
                            kmax = int(math.floor((-bk + sDeltak) / (2 * ak)))
                            for k in range(kmin, kmax + 1):
                                if jj != 0 or i != 0 or j != 0 or k != 0:
                                    rr = r02 + k ** 2 * mkk + k * (
                                        2 * j * mjk + 2 * i * mik + 2 * lk) + j ** 2 * mjj + 2 * i * j * mij + i ** 2 * mii + 2 * j * lj + 2 * i * li
                                    arr.append(math.sqrt(rr))
            wFact = 4 * math.pi * len(structure) / abs(np.linalg.det(cell))
            for k, v in radD.items():
                v["arr"].sort()
                v["weights"] = map(lambda r: 1.0 / (wFact * r * r), v["arr"])

        # add results in ASE structure info
        if self.rdf_only:
            descriptor_data = dict(descriptor_name=self.name, descriptor_info=str(self), rdf=radD)
        else:
            descriptor_data = dict(descriptor_name=self.name, descriptor_info=str(self), prdf=radD)

        structure.info['descriptor'] = descriptor_data

        return structure

[docs]    def write(self, structure, tar, op_id=0, write_geo=True, format_geometry='aims'):
        """Write the descriptor to file.

        Parameters:

        structure: `ase.Atoms` object
            Atomic structure.

        tar: TarFile object
            TarFile archive where the descriptor is added. This is created internally with `tarfile.open`.

        op_id: int, optional (default=0)
            Number of the applied operation to the descriptor. At present always set to zero in the code.

        write_geo: bool, optional (default=`True`)
            If `True`, write a coordinate file of the structure for which the diffraction pattern is calculated.


        .. codeauthor:: Angelo Ziletti <angelo.ziletti@gmail.com>

        """

        if not is_descriptor_consistent(structure, self):
            raise Exception('Descriptor not consistent. Aborting.')

        desc_folder = self.configs['io']['desc_folder']

        if self.rdf_only:
            rdf = structure.info['descriptor']['rdf']

            rdf_filename_npy = os.path.abspath(os.path.normpath(
                os.path.join(desc_folder, structure.info['label'] + self.desc_metadata.ix['rdf']['file_ending'])))
            np.save(rdf_filename_npy, rdf)
            structure.info['rdf_filename_npy'] = rdf_filename_npy
            tar.add(structure.info['rdf_filename_npy'])

        else:
            prdf = structure.info['descriptor']['prdf']

            prdf_filename_npy = os.path.abspath(os.path.normpath(
                os.path.join(desc_folder, structure.info['label'] + self.desc_metadata.ix['prdf']['file_ending'])))
            np.save(prdf_filename_npy, prdf)
            structure.info['prdf_filename_npy'] = prdf_filename_npy
            tar.add(structure.info['prdf_filename_npy'])

        if write_geo:
            # to have the file accessible by the Beaker notebook image we need to put them
            # in a special folder ('/user/tmp')
            if self.configs['runtime']['isBeaker']:
                # only for Beaker Notebook
                coord_filename_in = os.path.abspath(os.path.normpath(os.path.join('/user/tmp/',
                                                                                  structure.info['label'] +
                                                                                  self.desc_metadata.ix[
                                                                                      'prdf_coordinates'][
                                                                                      'file_ending'])))
            else:
                coord_filename_in = os.path.abspath(os.path.normpath(os.path.join(desc_folder, structure.info['label'] +
                                                                                  self.desc_metadata.ix[
                                                                                      'prdf_coordinates'][
                                                                                      'file_ending'])))

            structure.write(coord_filename_in, format=format_geometry)
            structure.info['prdf_coord_filename_in'] = coord_filename_in
            tar.add(structure.info['prdf_coord_filename_in'])


[docs]def get_unique_chemical_species(structures):
    """Get the set of unique chemical species from a list of atomic structures.

    The list of structures must contain the calculated :py:class:`ai4materials.descriptors.prdf.PRDF`.

    Parameters:

    structures: ``ase.Atoms`` object or list of ``ase.Atoms`` objects
        Atomic structure or list of atomic structure.

    .. codeauthor:: Angelo Ziletti <angelo.ziletti@gmail.com>

    """

    atomic_nbs_set = set()
    for structure in structures:
        prdfs = structure.info['descriptor']['prdf']

        # find the set of unique number of chemical species in the partial radial distribution functions
        for (key, value) in viewitems(prdfs):
            atom_type_1 = value['particle_atom_number_1']
            atom_type_2 = value['particle_atom_number_2']
            atomic_nbs_set.add(atom_type_1)
            atomic_nbs_set.add(atom_type_2)

        unique_chem_species = [chemical_symbols[item] for item in atomic_nbs_set]

        max_rdf_length = 0
        for name, rdf in viewitems(prdfs):
            rdf_length = len(rdf['weights'])
            if max_rdf_length < rdf_length:
                max_rdf_length = rdf_length

            assert len(rdf['arr']) == len(rdf['weights']), "Wrong number of distances at c={}.".format(name)

    largest_atomic_nb = max(atomic_nbs_set) + 1

    logger.debug("Setting up dictionary of chemical species.")
    logger.debug("Longest rdf list: {0}".format(max_rdf_length))
    logger.debug("Number of different chemical species in the set: {0}".format(len(atomic_nbs_set)))
    logger.debug("Highest atomic number in the set: {0}".format(largest_atomic_nb))
    logger.info("Actual chemical species set: {0}".format(unique_chem_species))

    return largest_atomic_nb, unique_chem_species


[docs]def get_design_matrix(structures, total_bins=50, max_dist=25):
    """Starting from atomic structures calculate the design matrix for the partial radial distribution function.

    The list of structures must contain the calculated :py:class:`ai4materials.descriptors.prdf.PRDF`.
    The discretization is performed using a logarithmic grid as follows:
        bins = np.logspace(0, np.log10(max_dist), num=total_bins + 1) - 1

    Parameters:

    structures: ``ase.Atoms`` object or list of ``ase.Atoms`` object
        Atomic structure or list of atomic structure.

    total_bins: int, optional (default=50)
        Total number of bins to be used in the discretization of the partial radial distribution function.

    max_dist: float, optional (default=25)
        Maximum distance to consider in the partial radial distribution function when the design matrix is
        calculated. Unit in Angstrom.
        The unit of measure is the same as :py:class:`ai4materials.descriptors.prdf.PRDF`.

    Return:

    scipy.sparse.csr.csr_matrix, shape [n_samples, largest_atomic_nb * largest_atomic_nb * total_bins]
        Returns a sparse row-compressed matrix.

    .. codeauthor:: Angelo Ziletti <angelo.ziletti@gmail.com>

    """

    largest_atomic_nb, _ = get_unique_chemical_species(structures)

    design_matrix = scipy.sparse.lil_matrix((len(structures), largest_atomic_nb * largest_atomic_nb * total_bins))
    logger.debug("Feature matrix shape: {0}".format(design_matrix.shape))

    bins = np.logspace(0, np.log10(max_dist), num=total_bins + 1) - 1

    for idx_structure, structure in enumerate(structures):
        prdfs = structure.info['descriptor']['prdf']

        def chem_species_sorted(partial_rdfs):
            atom_nbs = [x.split("_") for x in list(partial_rdfs)]
            sorted_atom_nbs = sorted([(int(a), int(b)) for a, b in atom_nbs])
            return ["{0}_{1}".format(a, b) for a, b in sorted_atom_nbs]

        # indices must increase monotonically, other lil_matrix has bad performance
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.lil_matrix.html
        for name in chem_species_sorted(prdfs):
            prdf = prdfs[name]
            i_atom = prdf['particle_atom_number_1']
            j_atom = prdf['particle_atom_number_2']

            if i_atom < j_atom:
                i_atom, j_atom = j_atom, i_atom

            arr = np.array(prdf['arr'], dtype=np.float64).reshape((-1,))
            weights = np.array(prdf['weights'], dtype=np.float64).reshape((-1,))

            binned, foo = np.histogram(arr, bins=bins, weights=weights, density=True)

            idx = (i_atom * largest_atomic_nb * total_bins +
                   j_atom * total_bins)

            for i in range(total_bins):
                design_matrix[idx_structure, idx] = binned[i]
                idx += 1

    design_matrix = design_matrix.asformat("csr")
    logger.debug("Sparse feature matrix needs {0} bytes".format(design_matrix.data.nbytes))
    logger.debug("Feature matrix shape: {0}".format(design_matrix.shape))

    return design_matrix