# Source code for ai4materials.utils.utils_binaries

from ase.data import chemical_symbols
from ase.spacegroup import get_spacegroup
import logging
import numpy as np
import pandas as pd
from itertools import permutations
logger = logging.getLogger('ai4materials')


def get_target_diff_dic(df, sample_key=None, energy=None, spacegroup=None):
    """Get a dictionary of dictionaries: samples -> space group tuples -> energy differences.

    All rows which do not correspond to the minimum energy per sample AND
    space group are dropped, then a new data frame with the space groups as
    columns is made. Finally the dictionary of dictionaries is constructed
    from the rows of that table.

    Parameters:

    df: pandas data frame
        Must contain the columns named by `sample_key`, `energy` and
        `spacegroup`.

    sample_key: string
        Needs to be column title of samples of input df

    energy: string
        Needs to be column title of energies of input df

    spacegroup: string
        Needs to be column title of space groups of input df

    Returns:

    dic_out: dictionary of dictionaries
        In the form::

            {
                sample_a: {(SG_1, SG_2): E_diff_a12, (SG_1, SG_3): E_diff_a13, ...},
                sample_b: {(SG_1, SG_2): E_diff_b12, (SG_1, SG_3): E_diff_b13, ...},
                ...
            }

        E_diff_a12 = energy_SG_1 - energy_SG_2 of sample a.
        Both (SG_1, SG_2) and (SG_2, SG_1) are considered.
        If SG_1 or SG_2 is NaN, energy difference to it is ignored.
        A sample for which fewer than two space groups have an energy is
        mapped to ``None`` instead of a dictionary.

    """
    # Use only rows with minimum energies. The string alias 'min' selects the
    # pandas group-wise minimum; handing over the builtin ``min`` is deprecated
    # in recent pandas versions and triggers a FutureWarning.
    group_minimum = df.groupby([sample_key, spacegroup])[energy].transform('min')
    df = df[group_minimum == df[energy]].drop_duplicates()

    # New table: one row per sample, one column per space group. A cell is NaN
    # when the sample has no entry for that space group.
    table = df.pivot_table(values=energy, index=[sample_key], columns=spacegroup)

    sg_labels = table.columns.values
    samples = table.index.values
    energies = np.array(table)

    # Every sample starts out as None and only receives a dictionary when at
    # least one ordered pair of space groups is available for it.
    dic_out = dict.fromkeys(samples)
    for sample, row in zip(samples, energies):
        available = np.flatnonzero(~np.isnan(row))
        sample_dic = {
            (sg_labels[j_1], sg_labels[j_2]): row[j_1] - row[j_2]
            for j_1, j_2 in permutations(available, 2)
        }
        if sample_dic:
            dic_out[sample] = sample_dic
    return dic_out
def select_diff_from_dic(dic, spacegroup_tuples, sample_key='Mat', drop_nan=None):
    """Get data frame of selected spacegroup_tuples from dictionary of dictionaries.

    Creates a pandas data frame with one column for the samples and one
    column per selected space group tuple (energy differences).

    Parameters:

    dic: dict
        {samples -> space group tuples -> energy differences.}

    spacegroup_tuples: tuple, list of tuples, tuples of tuples
        Each tuple has to contain two space groups numbers, to be looked up
        in the input dic.

    sample_key: string
        Will be the column title of the samples of the created data frame

    drop_nan: string, optional {'rows', 'SG_tuples'}
        Drops all rows or columns (SG_tuples) containing NaN.

    Returns:

    df_out: pandas data frame
        One row per sample; the sample names are stored in the column
        `sample_key`.

    Raises:

    ValueError
        If `drop_nan` is neither ``None``, ``'rows'`` nor ``'SG_tuples'``.

    SystemExit
        With exit code 1 if dropping NaNs leaves no rows or no columns.

    """
    # A single (SG_1, SG_2) pair is wrapped into a list so that it is used as
    # one label and not as two separate labels.
    if isinstance(spacegroup_tuples, tuple) and all(
            isinstance(item, (float, int)) for item in spacegroup_tuples):
        spacegroup_tuples = [spacegroup_tuples]

    # Samples become rows and the selected space group tuples become columns.
    df_out = pd.DataFrame(dic, index=spacegroup_tuples).T

    if drop_nan is not None:
        if drop_nan == 'rows':
            df_out = df_out.dropna(axis=0)
        elif drop_nan == 'SG_tuples':
            df_out = df_out.dropna(axis=1)
        else:
            raise ValueError("Argument 'drop_nan' has to be 'None', 'rows' or 'SG_tuples'.")

        # check if df_out is empty
        no_rows = len(df_out.index) == 0
        no_columns = len(df_out.columns) == 0
        if no_rows or no_columns:
            emptied = 'rows' if no_rows else 'spacegroup_tuples'
            logger.error('Dropping {0} with NaNs leads to empty data frame.'.format(emptied))
            logger.error('Hint: Select different spacegroup_tuples or set drop_nan=None')
            # Equivalent to sys.exit(1), but does not rely on `sys` being
            # imported by this module.
            raise SystemExit(1)

    # Move the sample names out of the index into a regular column.
    df_out = df_out.reset_index()
    df_out = df_out.rename(columns={'index': sample_key})
    return df_out
def get_chemical_formula_binaries(atoms):
    """Return a formula string that lists every element of `atoms` once.

    The element symbols are sorted alphabetically; afterwards 'H' (if
    present) is moved to the front and then 'C' (if present) is moved to the
    front, so that 'C' precedes 'H' and both precede all other symbols.
    Atom counts are not part of the string, except that a structure with a
    single element gets a trailing '2' (e.g. 'Si2').

    Parameters:

    atoms: object providing ``get_atomic_numbers()``
        Structure whose atomic numbers are looked up in
        ``ase.data.chemical_symbols``.

    Returns:

    formula: string
        Concatenated element symbols in the order described above.

    """
    unique_numbers = np.unique(atoms.get_atomic_numbers())
    ordered = sorted(chemical_symbols[number] for number in unique_numbers)

    # 'H' is promoted first and 'C' second, which leaves 'C' in front of 'H'.
    for leading in ('H', 'C'):
        if leading in ordered:
            ordered.remove(leading)
            ordered.insert(0, leading)

    formula = "".join(ordered)
    if len(ordered) == 1:
        formula += '2'
    return formula
def get_binaries_dict_delta_e(chemical_formula_list, energy_list, label_list, equiv_spgroups):
    """Collect energy differences between space groups, keyed by chemical formula.

    The three input lists are combined entry by entry into a data frame with
    the columns 'chemical_formula', 'energy_total' and 'spacegroup'. Every
    energy is multiplied by 0.5 beforehand (presumably a conversion to an
    energy per atom for two-atom cells -- TODO confirm). Space group labels
    are then unified according to `equiv_spgroups`, the energy differences
    are computed with `get_target_diff_dic`, the requested ones are selected
    with `select_diff_from_dic` (without dropping NaNs) and merged back onto
    the data frame.

    Parameters:

    chemical_formula_list: list
        Chemical formula of each structure.

    energy_list: list
        Energy of each structure.

    label_list: list
        Space group label of each structure.

    equiv_spgroups: iterable of pairs
        Each pair is ``(selected, replaced)``. Labels equal to ``replaced``
        are rewritten to ``selected``; the ``selected`` entries are also the
        keys passed on to `select_diff_from_dic` and the column keys of the
        result.

    Returns:

    dict_delta_e: dict
        ``DataFrame.to_dict()`` of the selected column(s) of the merged data
        frame, indexed by chemical formula.

    """
    formula_col = 'chemical_formula'
    energy_col = 'energy_total'
    spacegroup_col = 'spacegroup'

    scaled_energies = [value * 0.5 for value in energy_list]

    # make dataframe with chemical formula, energy, and labels
    records = zip(chemical_formula_list, scaled_energies, label_list)
    df_energy_diff = pd.DataFrame.from_records(
        records, columns=[formula_col, energy_col, spacegroup_col])

    selected_spacegroup_tuples, spacegroups_replace = zip(*equiv_spgroups)

    # replace space groups such that only one space group per structure is present
    unified_labels = df_energy_diff[spacegroup_col].replace(
        spacegroups_replace, selected_spacegroup_tuples)
    df_energy_diff[spacegroup_col] = unified_labels

    target_diff_dic = get_target_diff_dic(
        df_energy_diff,
        sample_key=formula_col,
        energy=energy_col,
        spacegroup=spacegroup_col)

    # NaNs are kept here (drop_nan=None); 'rows' or 'SG_tuples' would drop them.
    target_df = select_diff_from_dic(
        target_diff_dic,
        selected_spacegroup_tuples,
        sample_key=formula_col,
        drop_nan=None)

    df_with_e_diff = df_energy_diff.merge(target_df, left_on=formula_col, right_on=formula_col)
    indexed = df_with_e_diff.set_index(formula_col)
    return indexed[selected_spacegroup_tuples].to_dict()
def get_energy_diff_by_spacegroup(ase_atoms_list, target='energy_total', equiv_spgroups=None):
    """Compute energy differences between space groups for a list of structures.

    For every structure the target value, the space group number and the
    chemical formula (Hill notation) are collected; the three lists are then
    handed to `get_binaries_dict_delta_e`.

    Parameters:

    ase_atoms_list: list of ASE atoms objects
        Each structure must provide ``info[target]``.

    target: string, optional (default='energy_total')
        Key of ``ase_atoms.info`` holding the energy of the structure.

    equiv_spgroups: iterable of pairs, optional
        Forwarded unchanged to `get_binaries_dict_delta_e`.
        NOTE(review): that function unpacks the argument with
        ``zip(*equiv_spgroups)``, so the default ``None`` does not look
        usable as-is -- confirm the intended default.

    Returns:

    dict_delta_e: dict
        Return value of `get_binaries_dict_delta_e`.

    """
    # Log through the module logger; the module-level logging.debug() goes to
    # the root logger and may implicitly configure it.
    logger.debug("Using %s as target.", target)

    chemical_formula_list = []
    energy_list = []
    label_list = []

    # get chemical_formula, energy, classification (space group) for binaries
    for ase_atoms in ase_atoms_list:
        energy = ase_atoms.info[target]
        label = get_spacegroup(ase_atoms).no
        chemical_formula = ase_atoms.get_chemical_formula(mode='hill')

        chemical_formula_list.append(chemical_formula)
        label_list.append(label)
        energy_list.append(energy)

    # with all structures collected, calculate the energy differences between
    # space groups
    dict_delta_e = get_binaries_dict_delta_e(
        chemical_formula_list, energy_list, label_list, equiv_spgroups=equiv_spgroups)

    return dict_delta_e