Source code for ai4materials.descriptors.prdf

# coding=utf-8
# Copyright 2016-2018 Angelo Ziletti
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

__author__ = "Angelo Ziletti"
__copyright__ = "Copyright 2018, Angelo Ziletti"
__maintainer__ = "Angelo Ziletti"
__email__ = "ziletti@fhi-berlin.mpg.de"
__date__ = "23/09/18"

from ase.data import chemical_symbols
from future.utils import viewitems
import logging
import math
from ai4materials.descriptors.base_descriptor import Descriptor
from ai4materials.descriptors.base_descriptor import is_descriptor_consistent
import numpy as np
import os
import scipy.sparse
logger = logging.getLogger('ai4materials')


[docs]class PRDF(Descriptor): """Compute the partial radial distribution of a given crystal structure. Cell vectors v1,v2,v3 with values in the columns: [[v1x,v2x,v3x],[v1y,v2y,v3x],[v1z,v2z,v3z]] Parameters: cutoff_radius: float, optional (default=20) Atoms within a sphere of cut-off radius (in Angstrom) are considered. rdf_only: bool, optional (defaults=`False`) If `False` calculates partial radial distribution function. If `True` calculates radial distribution function (all atom types are considered as the same) .. codeauthor:: Fawzi Mohamed <mohamed@fhi-berlin.mpg.de> and Angelo Ziletti <angelo.ziletti@gmail.com> """ def __init__(self, configs=None, cutoff_radius=20, rdf_only=False): super(PRDF, self).__init__(configs=configs) self.cutoff_radius = cutoff_radius self.rdf_only = rdf_only if not self.rdf_only: logger.info("Calculating partial radial distribution function.") else: logger.info("Calculating radial distribution function; all atomic species are considered as the same.")
[docs] def calculate(self, structure, **kwargs): """Calculate the descriptor for the given ASE structure. Parameters: structure: `ase.Atoms` object Atomic structure. .. codeauthor:: Fawzi Mohamed <mohamed@fhi-berlin.mpg.de> """ if not np.all(structure.pbc): raise NotImplementedError("At present, the partial radial distribution function is implemented only for " "three-dimensional periodic structures.") else: cell = structure.get_cell() cutoff_radius2 = self.cutoff_radius * self.cutoff_radius radD = {} for ii, a1 in enumerate(structure): for jj, a2 in enumerate(structure[ii:]): # write atomic numbers if prdf, otherwise set them to zero if not self.rdf_only: n1 = a1.number n2 = a2.number else: n1 = 0 n2 = 0 label = "%d_%d" % (min(n1, n2), max(n1, n2)) if label not in radD: radD[label] = {"particle_atom_number_1": n1, "particle_atom_number_2": n2, "arr": []} arr = radD[label]["arr"] r0 = np.array([a1.x - a2.x, a1.y - a2.y, a1.z - a2.z]) r02 = np.dot(r0, r0) m = np.dot(cell.transpose(), cell) l = np.dot(r0, cell) r = np.dot(r0, r0) - cutoff_radius2 mii = m[0, 0] mij = m[0, 1] mik = m[0, 2] mjj = m[1, 1] mjk = m[1, 2] mkk = m[2, 2] li = l[0] lj = l[1] lk = l[2] c = ( mjj ** 2 * mkk ** 3 * r - 2 * mjj * mjk ** 2 * mkk ** 2 * r + mjk ** 4 * mkk * r - lj ** 2 * mjj * mkk ** 3 + lj ** 2 * mjk ** 2 * mkk ** 2 + 2 * lj * lk * mjj * mjk * mkk ** 2 - lk ** 2 * mjj ** 2 * mkk ** 2 - 2 * lj * lk * mjk ** 3 * mkk + lk ** 2 * mjj * mjk ** 2 * mkk) a = ( mii * mjj ** 2 * mkk ** 3 - mij ** 2 * mjj * mkk ** 3 - 2 * mii * mjj * mjk ** 2 * mkk ** 2 + mij ** 2 * mjk ** 2 * mkk ** 2 + 2 * mij * mik * mjj * mjk * mkk ** 2 - mik ** 2 * mjj ** 2 * mkk ** 2 + mii * mjk ** 4 * mkk - 2 * mij * mik * mjk ** 3 * mkk + mik ** 2 * mjj * mjk ** 2 * mkk) b = ( 2 * li * mjj ** 2 * mkk ** 3 - 2 * lj * mij * mjj * mkk ** 3 - 4 * li * mjj * mjk ** 2 * mkk ** 2 + 2 * lj * mij * mjk ** 2 * mkk ** 2 + 2 * lj * mik * mjj * mjk * mkk ** 2 + 2 * lk * mij * mjj * mjk * mkk ** 2 - 2 * lk * mik * mjj ** 2 * mkk ** 2 + 2 * li * mjk ** 4 * mkk - 2 * lj * mik * mjk ** 3 * mkk - 2 * lk * mij * mjk ** 3 * mkk + 2 * lk * mik * mjj * mjk ** 2 * mkk) delta = b * b - 4 * a * c if a == 0 or delta < 0: continue sDelta = math.sqrt(delta) imin = int(math.ceil((-b - sDelta) / (2 * a))) imax = int(math.floor((-b + sDelta) / (2 * a))) for i in range(imin, imax + 1): cj = ( mkk * r + i ** 2 * mii * mkk + 2 * i * li * mkk - i ** 2 * mik ** 2 - 2 * i * lk * mik - lk ** 2) aj = (mjj * mkk - mjk ** 2) bj = (2 * i * mij * mkk + 2 * lj * mkk - 2 * i * mik * mjk - 2 * lk * mjk) deltaj = bj * bj - 4 * aj * cj if aj == 0 or deltaj < 0: continue sDeltaj = math.sqrt(deltaj) jmin = int(math.ceil((-bj - sDeltaj) / (2 * aj))) jmax = int(math.floor((-bj + sDeltaj) / (2 * aj))) for j in range(jmin, jmax + 1): ck = r + j ** 2 * mjj + 2 * i * j * mij + i ** 2 * mii + 2 * j * lj + 2 * i * li ak = mkk bk = (2 * j * mjk + 2 * i * mik + 2 * lk) deltak = bk * bk - 4 * ak * ck if ak == 0 or deltak < 0: continue sDeltak = math.sqrt(deltak) kmin = int(math.ceil((-bk - sDeltak) / (2 * ak))) kmax = int(math.floor((-bk + sDeltak) / (2 * ak))) for k in range(kmin, kmax + 1): if jj != 0 or i != 0 or j != 0 or k != 0: rr = r02 + k ** 2 * mkk + k * ( 2 * j * mjk + 2 * i * mik + 2 * lk) + j ** 2 * mjj + 2 * i * j * mij + i ** 2 * mii + 2 * j * lj + 2 * i * li arr.append(math.sqrt(rr)) wFact = 4 * math.pi * len(structure) / abs(np.linalg.det(cell)) for k, v in radD.items(): v["arr"].sort() v["weights"] = map(lambda r: 1.0 / (wFact * r * r), v["arr"]) # add results in ASE structure info if self.rdf_only: descriptor_data = dict(descriptor_name=self.name, descriptor_info=str(self), rdf=radD) else: descriptor_data = dict(descriptor_name=self.name, descriptor_info=str(self), prdf=radD) structure.info['descriptor'] = descriptor_data return structure
[docs] def write(self, structure, tar, op_id=0, write_geo=True, format_geometry='aims'): """Write the descriptor to file. Parameters: structure: `ase.Atoms` object Atomic structure. tar: TarFile object TarFile archive where the descriptor is added. This is created internally with `tarfile.open`. op_id: int, optional (default=0) Number of the applied operation to the descriptor. At present always set to zero in the code. write_geo: bool, optional (default=`True`) If `True`, write a coordinate file of the structure for which the diffraction pattern is calculated. .. codeauthor:: Angelo Ziletti <angelo.ziletti@gmail.com> """ if not is_descriptor_consistent(structure, self): raise Exception('Descriptor not consistent. Aborting.') desc_folder = self.configs['io']['desc_folder'] if self.rdf_only: rdf = structure.info['descriptor']['rdf'] rdf_filename_npy = os.path.abspath(os.path.normpath( os.path.join(desc_folder, structure.info['label'] + self.desc_metadata.ix['rdf']['file_ending']))) np.save(rdf_filename_npy, rdf) structure.info['rdf_filename_npy'] = rdf_filename_npy tar.add(structure.info['rdf_filename_npy']) else: prdf = structure.info['descriptor']['prdf'] prdf_filename_npy = os.path.abspath(os.path.normpath( os.path.join(desc_folder, structure.info['label'] + self.desc_metadata.ix['prdf']['file_ending']))) np.save(prdf_filename_npy, prdf) structure.info['prdf_filename_npy'] = prdf_filename_npy tar.add(structure.info['prdf_filename_npy']) if write_geo: # to have the file accessible by the Beaker notebook image we need to put them # in a special folder ('/user/tmp') if self.configs['runtime']['isBeaker']: # only for Beaker Notebook coord_filename_in = os.path.abspath(os.path.normpath(os.path.join('/user/tmp/', structure.info['label'] + self.desc_metadata.ix[ 'prdf_coordinates'][ 'file_ending']))) else: coord_filename_in = os.path.abspath(os.path.normpath(os.path.join(desc_folder, structure.info['label'] + self.desc_metadata.ix[ 'prdf_coordinates'][ 'file_ending']))) structure.write(coord_filename_in, format=format_geometry) structure.info['prdf_coord_filename_in'] = coord_filename_in tar.add(structure.info['prdf_coord_filename_in'])
[docs]def get_unique_chemical_species(structures): """Get the set of unique chemical species from a list of atomic structures. The list of structures must contain the calculated :py:class:`ai4materials.descriptors.prdf.PRDF`. Parameters: structures: ``ase.Atoms`` object or list of ``ase.Atoms`` objects Atomic structure or list of atomic structure. .. codeauthor:: Angelo Ziletti <angelo.ziletti@gmail.com> """ atomic_nbs_set = set() for structure in structures: prdfs = structure.info['descriptor']['prdf'] # find the set of unique number of chemical species in the partial radial distribution functions for (key, value) in viewitems(prdfs): atom_type_1 = value['particle_atom_number_1'] atom_type_2 = value['particle_atom_number_2'] atomic_nbs_set.add(atom_type_1) atomic_nbs_set.add(atom_type_2) unique_chem_species = [chemical_symbols[item] for item in atomic_nbs_set] max_rdf_length = 0 for name, rdf in viewitems(prdfs): rdf_length = len(rdf['weights']) if max_rdf_length < rdf_length: max_rdf_length = rdf_length assert len(rdf['arr']) == len(rdf['weights']), "Wrong number of distances at c={}.".format(name) largest_atomic_nb = max(atomic_nbs_set) + 1 logger.debug("Setting up dictionary of chemical species.") logger.debug("Longest rdf list: {0}".format(max_rdf_length)) logger.debug("Number of different chemical species in the set: {0}".format(len(atomic_nbs_set))) logger.debug("Highest atomic number in the set: {0}".format(largest_atomic_nb)) logger.info("Actual chemical species set: {0}".format(unique_chem_species)) return largest_atomic_nb, unique_chem_species
[docs]def get_design_matrix(structures, total_bins=50, max_dist=25): """Starting from atomic structures calculate the design matrix for the partial radial distribution function. The list of structures must contain the calculated :py:class:`ai4materials.descriptors.prdf.PRDF`. The discretization is performed using a logarithmic grid as follows: bins = np.logspace(0, np.log10(max_dist), num=total_bins + 1) - 1 Parameters: structures: ``ase.Atoms`` object or list of ``ase.Atoms`` object Atomic structure or list of atomic structure. total_bins: int, optional (default=50) Total number of bins to be used in the discretization of the partial radial distribution function. max_dist: float, optional (default=25) Maximum distance to consider in the partial radial distribution function when the design matrix is calculated. Unit in Angstrom. The unit of measure is the same as :py:class:`ai4materials.descriptors.prdf.PRDF`. Return: scipy.sparse.csr.csr_matrix, shape [n_samples, largest_atomic_nb * largest_atomic_nb * total_bins] Returns a sparse row-compressed matrix. .. codeauthor:: Angelo Ziletti <angelo.ziletti@gmail.com> """ largest_atomic_nb, _ = get_unique_chemical_species(structures) design_matrix = scipy.sparse.lil_matrix((len(structures), largest_atomic_nb * largest_atomic_nb * total_bins)) logger.debug("Feature matrix shape: {0}".format(design_matrix.shape)) bins = np.logspace(0, np.log10(max_dist), num=total_bins + 1) - 1 for idx_structure, structure in enumerate(structures): prdfs = structure.info['descriptor']['prdf'] def chem_species_sorted(partial_rdfs): atom_nbs = [x.split("_") for x in list(partial_rdfs)] sorted_atom_nbs = sorted([(int(a), int(b)) for a, b in atom_nbs]) return ["{0}_{1}".format(a, b) for a, b in sorted_atom_nbs] # indices must increase monotonically, other lil_matrix has bad performance # https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.lil_matrix.html for name in chem_species_sorted(prdfs): prdf = prdfs[name] i_atom = prdf['particle_atom_number_1'] j_atom = prdf['particle_atom_number_2'] if i_atom < j_atom: i_atom, j_atom = j_atom, i_atom arr = np.array(prdf['arr'], dtype=np.float64).reshape((-1,)) weights = np.array(prdf['weights'], dtype=np.float64).reshape((-1,)) binned, foo = np.histogram(arr, bins=bins, weights=weights, density=True) idx = (i_atom * largest_atomic_nb * total_bins + j_atom * total_bins) for i in range(total_bins): design_matrix[idx_structure, idx] = binned[i] idx += 1 design_matrix = design_matrix.asformat("csr") logger.debug("Sparse feature matrix needs {0} bytes".format(design_matrix.data.nbytes)) logger.debug("Feature matrix shape: {0}".format(design_matrix.shape)) return design_matrix