Source code for ai4materials.models.l1_l0

# coding=utf-8
# Copyright 2016-2018 Angelo Ziletti
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

__author__ = "Angelo Ziletti"
__copyright__ = "Copyright 2018, Angelo Ziletti"
__maintainer__ = "Angelo Ziletti"
__email__ = "ziletti@fhi-berlin.mpg.de"
__date__ = "23/09/18"

import logging
import os
import json
import sys
import pandas as pd
import numpy as np
import random
import math
import itertools
import scipy.stats
from sklearn import linear_model
from math import exp, sqrt
import ai4materials.utils.unit_conversion as uc
logger = logging.getLogger('ai4materials')


def choose_atomic_features(selected_feature_list=None, atomic_data_file=None, binary_data_file=None):
    """Choose primary features for the extended lasso procedure."""
    df1 = pd.read_csv(atomic_data_file, index_col=False)
    df2 = pd.read_csv(binary_data_file, index_col=False)

    # merge the two dataframes on material
    df = pd.merge(df1, df2, on='Mat')

    # calculate r_sigma and r_pi [Phys. Rev. Lett. 33, 1095 (1974)]
    radii_s_p = ['rp(A)', 'rs(A)', 'rp(B)', 'rs(B)']
    df['r_sigma'] = df[radii_s_p].apply(r_sigma, axis=1)
    df['r_pi'] = df[radii_s_p].apply(r_pi, axis=1)

    # calculate Es/sqrt(Zval) and Ep/sqrt(Zval)
    e_val_z = ['Es(A)', 'val(A)']
    df['Es(A)/sqrt(Zval(A))'] = df[e_val_z].apply(e_sqrt_z, axis=1)
    e_val_z = ['Es(B)', 'val(B)']
    df['Es(B)/sqrt(Zval(B))'] = df[e_val_z].apply(e_sqrt_z, axis=1)

    e_val_z = ['Ep(A)', 'val(A)']
    df['Ep(A)/sqrt(Zval(A))'] = df[e_val_z].apply(e_sqrt_z, axis=1)
    e_val_z = ['Ep(B)', 'val(B)']
    df['Ep(B)/sqrt(Zval(B))'] = df[e_val_z].apply(e_sqrt_z, axis=1)

    feature_list = df.columns.tolist()
    if 'Mat' in feature_list:
        feature_list.remove('Mat')
    if 'Edim' in feature_list:
        feature_list.remove('Edim')
    logger.debug("Available features: \n {}".format(feature_list))

    # check the selection before indexing, so the error branch is reachable
    if selected_feature_list:
        logger.info("Primary features selected: \n {}".format(selected_feature_list))
    else:
        logger.error("No selected features.")
        sys.exit(1)

    df_selected = df[selected_feature_list]
    df_selected.insert(0, 'Mat', df['Mat'])

    return df_selected
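# Illustrative usage sketch (not part of the original module): the file names
# and selected columns below are hypothetical; the two CSV files are assumed
# to share a 'Mat' column and to contain the atomic/binary columns used above.
#
# >>> df_sel = choose_atomic_features(
# ...     selected_feature_list=['r_sigma', 'r_pi', 'Es(A)/sqrt(Zval(A))'],
# ...     atomic_data_file='atomic_data.csv',
# ...     binary_data_file='binary_data.csv')
# >>> df_sel.head()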
def classify_rs_zb(structure):
    """Classify whether a structure is rocksalt or zincblende from a NOMAD structure (one json file).

    Supports multiple frames (.. todo:: check that). The reference fractional
    positions are hard-coded:

    rocksalt:
        atom_frac1 0.0 0.0 0.0
        atom_frac2 0.5 0.5 0.5

    zincblende:
        atom_frac1 0.0 0.0 0.0
        atom_frac2 0.25 0.25 0.25

    zincblende --> label=0
    rocksalt --> label=1

    """
    energy = {}
    chemical_formula = {}
    label = {}

    for (gIndexRun, gIndexDesc), atoms in structure.atoms.items():
        if atoms is not None:
            energy[gIndexRun, gIndexDesc] = structure.energy_eV[(gIndexRun, gIndexDesc)]
            chemical_formula[gIndexRun, gIndexDesc] = structure.chemical_formula[(gIndexRun, gIndexDesc)]

            # get labels; works only for the RS/ZB dataset
            pos_atom_2 = np.asarray(list(structure.scaled_positions.values())).reshape(2, 3)[1, :]
            if all(i < 0.375 for i in pos_atom_2):
                # zincblende
                label[gIndexRun, gIndexDesc] = 0
            else:
                # rocksalt
                label[gIndexRun, gIndexDesc] = 1
            break

    return chemical_formula, energy, label
def get_energy_diff(chemical_formula_list, energy_list, label_list):
    """Obtain the energy difference (eV) between rocksalt and zincblende structures of a given binary.

    From lists of chemical formulas, energies, and labels, return a dictionary
    {material: delta_e}, where delta_e is the difference between the energy
    with label 1 and the energy with label 0, grouped by material.
    Each element of these lists corresponds to a json file.
    delta_e is exactly what is reported in Phys. Rev. Lett. 114, 105503 (2015).

    .. todo:: Check if it works for multiple frames.

    """
    energy_ = []
    chemical_formula_ = []
    label_ = []

    # energy and chemical formula are lists even if only one frame is present
    for energy_i in energy_list:
        energy_.append(energy_i.values())
    for chemical_formula_i in chemical_formula_list:
        chemical_formula_.append(chemical_formula_i.values())
    for label_i in label_list:
        label_.append(label_i.values())

    # flatten the lists
    energy = list(itertools.chain(*energy_))
    chemical_formula = list(itertools.chain(*chemical_formula_))
    label = list(itertools.chain(*label_))

    df = pd.DataFrame()
    df['Mat'] = chemical_formula
    df['Energy'] = energy
    df['Label'] = label

    # generate a summary dataframe with the lowest zincblende and rocksalt energies
    # zincblende --> label=0; rocksalt --> label=1
    df_summary = df.sort_values(by='Energy').groupby(['Mat', 'Label'], as_index=False).first()

    groupby_mat = df_summary.groupby('Mat')
    dict_delta_e = {}

    for mat, df_mat in groupby_mat:
        # calculate delta_e (E_RS - E_ZB)
        energy_label_1 = df_mat.loc[df_mat['Label'] == 1].Energy.values
        energy_label_0 = df_mat.loc[df_mat['Label'] == 0].Energy.values

        # if energy_diff > 0 --> rs; if energy_diff < 0 --> zb
        if energy_label_0.size and energy_label_1.size:
            # single-element numpy array --> convert to scalar
            energy_diff = (energy_label_1 - energy_label_0).item(0)
            # divide by 2 because it is the energy difference per atom
            energy_diff = energy_diff / 2.0
        else:
            logger.error("Could not find all the energies needed to calculate the required property "
                         "for material '{0}'".format(mat))
            sys.exit(1)

        dict_delta_e.update({mat: (energy_diff, energy_label_0, energy_label_1)})

    return dict_delta_e
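# Minimal sketch of the expected inputs (an assumption read off the code: each
# list element is a dict keyed by (gIndexRun, gIndexDesc), as returned by
# classify_rs_zb; the energies below are illustrative, not real DFT values):
#
# >>> chem = [{(0, 1): 'NaCl'}, {(0, 1): 'NaCl'}]
# >>> ener = [{(0, 1): -8.0}, {(0, 1): -8.5}]
# >>> labl = [{(0, 1): 1}, {(0, 1): 0}]    # 1 = rocksalt, 0 = zincblende
# >>> get_energy_diff(chem, ener, labl)['NaCl'][0]
# 0.25    # (E_RS - E_ZB) / 2 = (-8.0 - (-8.5)) / 2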
def get_lowest_energy_structures(structure, dict_delta_e):
    """Get the lowest-energy structure for each material and label type.

    Works only with two possible labels for a given material.

    .. todo:: Check if it works for multiple frames.

    """
    energy = {}
    chemical_formula = {}
    is_lowest_energy = {}

    for (gIndexRun, gIndexDesc), atoms in structure.atoms.items():
        if atoms is not None:
            energy[gIndexRun, gIndexDesc] = structure.energy_eV[gIndexRun, gIndexDesc]
            chemical_formula[gIndexRun, gIndexDesc] = structure.chemical_formula[gIndexRun, gIndexDesc]

            lowest_energy_label_0 = dict_delta_e.get(chemical_formula[gIndexRun, gIndexDesc])[1]
            lowest_energy_label_1 = dict_delta_e.get(chemical_formula[gIndexRun, gIndexDesc])[2]

            if lowest_energy_label_0 > lowest_energy_label_1:
                lowest_energy_label_01 = lowest_energy_label_1
            else:
                lowest_energy_label_01 = lowest_energy_label_0

            is_lowest_energy[gIndexRun, gIndexDesc] = (
                energy[gIndexRun, gIndexDesc] == lowest_energy_label_01)

    return is_lowest_energy
def write_atomic_features(structure, selected_feature_list, df, dict_delta_e=None, path=None,
                          filename_suffix='.json', json_file=None):
    """Given the chemical composition, build the descriptor made of atomic features only.

    Includes all the frames in the same json file.

    .. todo:: Check if it works for multiple frames.

    """
    # make a dictionary {primary_feature: value} for each structure
    # (a dictionary of dictionaries; key: Mat, value: atomic_features)
    dict_features = df.set_index('chemical_formula').T.to_dict()

    # label=0: zincblende, label=1: rocksalt (see classify_rs_zb)
    chemical_formula_, energy_, label_ = classify_rs_zb(structure)
    # is_lowest_energy_ = get_lowest_energy_structures(structure, dict_delta_e)

    if structure.isPeriodic:
        for (gIndexRun, gIndexDesc), atoms in structure.atoms.items():
            if atoms is not None:
                # filename is the normalized absolute path
                filename = os.path.abspath(os.path.normpath(
                    os.path.join(path, '{0}{1}'.format(structure.name, filename_suffix))))
                outF = open(filename, 'w')
                outF.write("""
{
    "data":[""")

                cell = structure.atoms[gIndexRun, gIndexDesc].get_cell()
                cell = np.transpose(cell)
                atoms = structure.atoms[gIndexRun, gIndexDesc]
                chemical_formula = structure.chemical_formula[gIndexRun, gIndexDesc]
                energy = structure.energy_eV[gIndexRun, gIndexDesc]
                label = label_[gIndexRun, gIndexDesc]
                # target = dict_delta_e.get(chemical_formula_[gIndexRun, gIndexDesc])[0]
                target = dict_delta_e.get(chemical_formula)
                atomic_features = dict_features[structure.chemical_formula[gIndexRun, gIndexDesc]]
                # is_lowest_energy = is_lowest_energy_[gIndexRun, gIndexDesc]

                res = {
                    "checksum": structure.name,
                    "label": label,
                    "energy": energy,
                    # "is_lowest_energy": is_lowest_energy,
                    "delta_e_rs_zb": target,
                    "chemical_formula": chemical_formula,
                    "gIndexRun": gIndexRun,
                    "gIndexDesc": gIndexDesc,
                    "cell": cell.tolist(),
                    "particle_atom_number": [atom.number for atom in atoms],
                    "particle_position": [[atom.x, atom.y, atom.z] for atom in atoms],
                    "atomic_features": atomic_features,
                    "main_json_file_name": json_file,
                }

                json.dump(res, outF, indent=2)
                outF.write("""
] }""")
                outF.flush()

    return filename
def r_sigma(row):
    """Calculate r_sigma.

    John-Bloch indicator 1: |rp(A) + rs(A) - rp(B) - rs(B)|
    from Phys. Rev. Lett. 33, 1095 (1974).
    Input: rp(A), rs(A), rp(B), rs(B); they need to be given in this order.

    """
    return abs(row[0] + row[1] - row[2] - row[3])
def r_pi(row):
    """Calculate r_pi.

    John-Bloch indicator 2: |rp(A) - rs(A)| + |rp(B) - rs(B)|
    from Phys. Rev. Lett. 33, 1095 (1974).
    Input: rp(A), rs(A), rp(B), rs(B); they need to be given in this order.

    """
    return abs(row[0] - row[1]) + abs(row[2] - row[3])
def e_sqrt_z(row):
    """Calculate e/sqrt(val_Z).

    Es/sqrt(Zval) and Ep/sqrt(Zval) from Phys. Rev. B 85, 104104 (2012).
    Input: Es(A) or Ep(A), val(A) (A --> B); they need to be given in this order.

    """
    return row[0] / math.sqrt(row[1])
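# Worked example for the three helpers above (the numbers are illustrative,
# not tabulated radii or energies):
#
# >>> r_sigma([1.2, 0.9, 1.0, 0.8])    # |1.2 + 0.9 - 1.0 - 0.8| = 0.3
# >>> r_pi([1.2, 0.9, 1.0, 0.8])       # |1.2 - 0.9| + |1.0 - 0.8| = 0.5
# >>> e_sqrt_z([-4.0, 4.0])            # -4.0 / sqrt(4.0) = -2.0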
def _get_scaling_factors(columns, metadata_info, energy_unit, length_unit):
    """Calculate characteristic energy and length, given atomic metadata."""
    scaling_factor = []

    if columns is not None:
        for col in columns:
            try:
                col_unit = metadata_info[col.split('(', 1)[0]]['units']

                # check allowed values, to avoid problems with substance - NOT IDEAL
                if col_unit == 'J':
                    scaling_factor.append(uc.convert_unit(1, energy_unit, target_unit='eV'))
                elif col_unit == 'm':
                    scaling_factor.append(uc.convert_unit(1, length_unit, target_unit='angstrom'))
                else:
                    scaling_factor.append(1.0)
                    logger.debug("Feature units are neither energies nor lengths. "
                                 "No scaling to characteristic units.")
            except BaseException:
                scaling_factor.append(1.0)
                logger.debug("Feature units not included in metadata")

    return scaling_factor


def _my_power_2(row):
    return pow(row[0], 2)


def _my_power_3(row):
    return pow(row[0], 3)


def _my_power_m1(row):
    return pow(row[0], -1)


def _my_power_m2(row):
    return pow(row[0], -2)


def _my_power_m3(row):
    return pow(row[0], -3)


def _my_abs_sqrt(row):
    return math.sqrt(abs(row[0]))


def _my_exp(row):
    return exp(row[0])


def _my_exp_power_2(row):
    return exp(pow(row[0], 2))


def _my_exp_power_3(row):
    return exp(pow(row[0], 3))


def _my_sum(row):
    return row[0] + row[1]


def _my_abs_sum(row):
    return abs(row[0] + row[1])


def _my_abs_diff(row):
    return abs(row[0] - row[1])


def _my_diff(row):
    return row[0] - row[1]


def _my_div(row):
    return row[0] / row[1]


def _my_sum_power_2(row):
    return pow((row[0] + row[1]), 2)


def _my_sum_power_3(row):
    return pow((row[0] + row[1]), 3)


def _my_sum_exp(row):
    return exp(row[0] + row[1])


def _my_sum_exp_power_2(row):
    return exp(pow(row[0] + row[1], 2))


def _my_sum_exp_power_3(row):
    return exp(pow(row[0] + row[1], 3))
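# Sketch of the metadata layout _get_scaling_factors expects (hypothetical
# entries; 'J' and 'm' are the unit strings the function checks for):
#
# >>> metadata_info = {'atomic_homo': {'units': 'J'},
# ...                  'atomic_rs_max': {'units': 'm'}}
# >>> _get_scaling_factors(['atomic_homo(A)', 'atomic_rs_max(B)'],
# ...                      metadata_info, energy_unit='hartree', length_unit='angstrom')
#
# This should return one multiplicative factor per column: 1 hartree expressed
# in eV for the energy-like column, and 1.0 for the length-like column
# (angstrom to angstrom), assuming uc.convert_unit converts a value from the
# given unit to target_unit.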
def combine_features(df=None, energy_unit=None, length_unit=None, metadata_info=None,
                     allowed_operations=None, derived_features=None):
    """Generate combinations of features, given a dataframe and a list of allowed operations.

    For the exponentials, a characteristic energy/length is introduced to make the
    argument of the exponential dimensionless.

    .. todo:: Fix under/overflow errors, and introduce handling of exceptions.

    """
    if allowed_operations:
        logger.info('Selected operations:\n {0}'.format(allowed_operations))
    else:
        logger.warning('No allowed operations selected.')

    # make derived features
    if derived_features is not None:
        if 'r_sigma' in derived_features:
            # calculate r_sigma and r_pi [Phys. Rev. Lett. 33, 1095 (1974)]
            logger.info('Including rs and rp to allow r_sigma calculation')
            radii_s_p = ['atomic_rp_max(A)', 'atomic_rs_max(A)', 'atomic_rp_max(B)', 'atomic_rs_max(B)']
            df['r_sigma'] = df[radii_s_p].apply(r_sigma, axis=1)

        if 'r_pi' in derived_features:
            logger.info('Including rs and rp to allow r_pi calculation')
            radii_s_p = ['atomic_rp_max(A)', 'atomic_rs_max(A)', 'atomic_rp_max(B)', 'atomic_rs_max(B)']
            df['r_pi'] = df[radii_s_p].apply(r_pi, axis=1)

    columns_ = df.columns.tolist()

    # define subclasses of features (see Phys. Rev. Lett. 114, 105503 (2015), Supp. Info. p. 1)
    # as a dictionary {feature: subgroup};
    # features belonging to a0 will not be combined, just added at the end
    dict_features = {
        u'period': 'a0', u'atomic_number': 'a0', u'group': 'a0',

        u'atomic_ionization_potential': 'a1', u'atomic_electron_affinity': 'a1',
        u'atomic_homo_lumo_diff': 'a1', u'atomic_electronic_binding_energy': 'a1',

        u'atomic_homo': 'a2', u'atomic_lumo': 'a2',

        u'atomic_rs_max': 'a3', u'atomic_rp_max': 'a3', u'atomic_rd_max': 'a3',
        u'atomic_r_by_2_dimer': 'a3', u'r_sigma': 'a3', u'r_pi': 'a3'}

    # note: the data is deliberately NOT standardized here;
    # the PRL results cannot be reproduced if the data is standardized

    df_a0 = df[[col for col in columns_ if dict_features.get(col.split('(', 1)[0]) == 'a0']].astype('float32')
    df_a1 = df[[col for col in columns_ if dict_features.get(col.split('(', 1)[0]) == 'a1']].astype('float32')
    df_a2 = df[[col for col in columns_ if dict_features.get(col.split('(', 1)[0]) == 'a2']].astype('float32')
    df_a3 = df[[col for col in columns_ if dict_features.get(col.split('(', 1)[0]) == 'a3']].astype('float32')
    df_a4 = df[[col for col in columns_ if dict_features.get(col.split('(', 1)[0]) == 'a4']].astype('float32')

    col_a0 = df_a0.columns.tolist()
    col_a1 = df_a1.columns.tolist()
    col_a2 = df_a2.columns.tolist()
    col_a3 = df_a3.columns.tolist()
    col_a4 = df_a4.columns.tolist()

    # this list will contain, at the end, all the dataframes created
    df_list = []

    df_b0_list = []
    df_b1_list = []
    df_b2_list = []
    df_b3_list = []
    df_c3_list = []
    df_d3_list = []
    df_e3_list = []
    df_f1_list = []
    df_f2_list = []
    df_f3_list = []
    df_x1_list = []
    df_x2_list = []
    df_x_list = []

    # create b0: absolute differences and sums of a0
    # (this is not in the PRL)
    for subset in itertools.combinations(col_a0, 2):
        if '+' in allowed_operations:
            cols = ['(' + subset[0] + '+' + subset[1] + ')']
            data = df_a0[list(subset)].apply(_my_sum, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if '-' in allowed_operations:
            cols = ['(' + subset[0] + '-' + subset[1] + ')']
            data = df_a0[list(subset)].apply(_my_diff, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

            # reversed column order gives the difference with opposite sign
            cols = ['(' + subset[1] + '-' + subset[0] + ')']
            data = df_a0[[subset[1], subset[0]]].apply(_my_diff, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if '|+|' in allowed_operations:
            cols = ['|' + subset[0] + '+' + subset[1] + '|']
            data = df_a0[list(subset)].apply(_my_abs_sum, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if '|-|' in allowed_operations:
            cols = ['|' + subset[0] + '-' + subset[1] + '|']
            data = df_a0[list(subset)].apply(_my_abs_diff, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if '/' in allowed_operations:
            cols = [subset[0] + '/' + subset[1]]
            data = df_a0[list(subset)].apply(_my_div, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

            # reversed column order gives the reciprocal ratio
            cols = [subset[1] + '/' + subset[0]]
            data = df_a0[[subset[1], subset[0]]].apply(_my_div, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

    # we keep itertools.combinations to make the code more uniform with the binary operations
    for subset in itertools.combinations(col_a0, 1):
        if '^2' in allowed_operations:
            cols = [subset[0] + '^2']
            data = df_a0[list(subset)].apply(_my_power_2, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if '^3' in allowed_operations:
            cols = [subset[0] + '^3']
            data = df_a0[list(subset)].apply(_my_power_3, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if 'exp' in allowed_operations:
            cols = ['exp(' + subset[0] + ')']
            data = df_a0[list(subset)].apply(_my_exp, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

    # create b1, b2, b3: absolute differences and sums of a1, a2, a3
    for col_ai, df_ai, df_bi_list in ((col_a1, df_a1, df_b1_list),
                                      (col_a2, df_a2, df_b2_list),
                                      (col_a3, df_a3, df_b3_list)):
        for subset in itertools.combinations(col_ai, 2):
            if '+' in allowed_operations:
                cols = ['(' + subset[0] + '+' + subset[1] + ')']
                data = df_ai[list(subset)].apply(_my_sum, axis=1)
                df_bi_list.append(pd.DataFrame(data, columns=cols))

            if '-' in allowed_operations:
                cols = ['(' + subset[0] + '-' + subset[1] + ')']
                data = df_ai[list(subset)].apply(_my_diff, axis=1)
                df_bi_list.append(pd.DataFrame(data, columns=cols))

            if '|+|' in allowed_operations:
                cols = ['|' + subset[0] + '+' + subset[1] + '|']
                data = df_ai[list(subset)].apply(_my_abs_sum, axis=1)
                df_bi_list.append(pd.DataFrame(data, columns=cols))

            if '|-|' in allowed_operations:
                cols = ['|' + subset[0] + '-' + subset[1] + '|']
                data = df_ai[list(subset)].apply(_my_abs_diff, axis=1)
                df_bi_list.append(pd.DataFrame(data, columns=cols))

    # create c3 in two steps:
    # 1) squares of a3 - unary operations
    # (we keep itertools.combinations to make the code more uniform with the binary operations)
    for subset in itertools.combinations(col_a3, 1):
        if '^2' in allowed_operations:
            cols = [subset[0] + '^2']
            data = df_a3[list(subset)].apply(_my_power_2, axis=1)
            df_c3_list.append(pd.DataFrame(data, columns=cols))

        if '^3' in allowed_operations:
            cols = [subset[0] + '^3']
            data = df_a3[list(subset)].apply(_my_power_3, axis=1)
            df_c3_list.append(pd.DataFrame(data, columns=cols))

    # 2) squares of b3 (only sums) --> squared sums of a3
    for subset in itertools.combinations(col_a3, 2):
        if '^2' in allowed_operations:
            cols = ['(' + subset[0] + '+' + subset[1] + ')^2']
            data = df_a3[list(subset)].apply(_my_sum_power_2, axis=1)
            df_c3_list.append(pd.DataFrame(data, columns=cols))

        if '^3' in allowed_operations:
            cols = ['(' + subset[0] + '+' + subset[1] + ')^3']
            data = df_a3[list(subset)].apply(_my_sum_power_3, axis=1)
            df_c3_list.append(pd.DataFrame(data, columns=cols))

    # create d3 in two steps:
    # 1) exponentials of a3 - unary operations
    for subset in itertools.combinations(col_a3, 1):
        if 'exp' in allowed_operations:
            cols = ['exp(' + subset[0] + ')']
            # find the e_0 or d_0 scaling factor and multiply each column by it
            scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
            df_subset = df_a3[list(subset)] * scaling_factors
            data = df_subset.apply(_my_exp, axis=1)
            df_d3_list.append(pd.DataFrame(data, columns=cols))

    # 2) exponentials of b3 (only sums) --> exponentials of sums of a3
    for subset in itertools.combinations(col_a3, 2):
        if 'exp' in allowed_operations:
            cols = ['exp(' + subset[0] + '+' + subset[1] + ')']
            scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
            df_subset = df_a3[list(subset)] * scaling_factors
            data = df_subset.apply(_my_sum_exp, axis=1)
            df_d3_list.append(pd.DataFrame(data, columns=cols))

    # create e3 in two steps:
    # 1) exponentials of squared a3 - unary operations
    for subset in itertools.combinations(col_a3, 1):
        if {'exp', '^2'} <= set(allowed_operations):
            cols = ['exp(' + subset[0] + '^2)']
            scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
            df_subset = df_a3[list(subset)] * scaling_factors
            data = df_subset.apply(_my_exp_power_2, axis=1)
            df_e3_list.append(pd.DataFrame(data, columns=cols))

        if {'exp', '^3'} <= set(allowed_operations):
            try:
                cols = ['exp(' + subset[0] + '^3)']
                scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
                df_subset = df_a3[list(subset)] * scaling_factors
                data = df_subset.apply(_my_exp_power_3, axis=1)
                df_e3_list.append(pd.DataFrame(data, columns=cols))
            except OverflowError:
                logger.warning('Dropping feature combination that caused under/overflow.\n')

    # 2) exponentials of squared b3 (only sums) --> exponentials of squared sums of a3
    for subset in itertools.combinations(col_a3, 2):
        if {'exp', '^2'} <= set(allowed_operations):
            cols = ['exp((' + subset[0] + '+' + subset[1] + ')^2)']
            scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
            df_subset = df_a3[list(subset)] * scaling_factors
            data = df_subset.apply(_my_sum_exp_power_2, axis=1)
            df_e3_list.append(pd.DataFrame(data, columns=cols))

        if {'exp', '^3'} <= set(allowed_operations):
            try:
                cols = ['exp((' + subset[0] + '+' + subset[1] + ')^3)']
                scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
                df_subset = df_a3[list(subset)] * scaling_factors
                data = df_subset.apply(_my_sum_exp_power_3, axis=1)
                df_e3_list.append(pd.DataFrame(data, columns=cols))
            except OverflowError:
                logger.warning('Dropping feature combination that caused under/overflow.\n')

    # make dataframes from the lists and check that they are not empty;
    # they are made here because they are used to further combine the features
    if not df_a0.empty:
        df_list.append(df_a0)

    if not df_a1.empty:
        df_x1_list.append(df_a1)
        df_list.append(df_a1)

    if not df_a2.empty:
        df_x1_list.append(df_a2)
        df_list.append(df_a2)

    if not df_a3.empty:
        df_x1_list.append(df_a3)
        df_list.append(df_a3)

    if not df_a4.empty:
        df_list.append(df_a4)

    if df_b0_list:
        df_b0 = pd.concat(df_b0_list, axis=1)
        col_b0 = df_b0.columns.tolist()
        df_b0.to_csv('./df_b0.csv', index=True)
        df_list.append(df_b0)

    if df_b1_list:
        df_b1 = pd.concat(df_b1_list, axis=1)
        col_b1 = df_b1.columns.tolist()
        df_x1_list.append(df_b1)
        df_list.append(df_b1)

    if df_b2_list:
        df_b2 = pd.concat(df_b2_list, axis=1)
        col_b2 = df_b2.columns.tolist()
        df_x1_list.append(df_b2)
        df_list.append(df_b2)

    if df_b3_list:
        df_b3 = pd.concat(df_b3_list, axis=1)
        col_b3 = df_b3.columns.tolist()
        df_x1_list.append(df_b3)
        df_list.append(df_b3)

    if df_c3_list:
        df_c3 = pd.concat(df_c3_list, axis=1)
        col_c3 = df_c3.columns.tolist()
        df_x2_list.append(df_c3)
        df_list.append(df_c3)

    if df_d3_list:
        df_d3 = pd.concat(df_d3_list, axis=1)
        col_d3 = df_d3.columns.tolist()
        df_x2_list.append(df_d3)
        df_list.append(df_d3)

    if df_e3_list:
        df_e3 = pd.concat(df_e3_list, axis=1)
        col_e3 = df_e3.columns.tolist()
        df_x2_list.append(df_e3)
        df_list.append(df_e3)

    if df_x1_list:
        df_x1 = pd.concat(df_x1_list, axis=1)
        col_x1 = df_x1.columns.tolist()

    if df_x2_list:
        df_x2 = pd.concat(df_x2_list, axis=1)
        col_x2 = df_x2.columns.tolist()

    # create f1: absolute differences and sums of b1 without repetitions
    # .. todo:: calculate f1

    # create x: ratios of any of {a_i, b_i}, i=1,2,3, with any of {c3, d3, e3}
    # (typo in the PRL: no a3 there)
    # total = (4+4+6+12+12+30)*(21+21+21) = 68*63 = 4284
    if df_x1_list and df_x2_list:
        for el_x1 in col_x1:
            for el_x2 in col_x2:
                if '/' in allowed_operations:
                    cols = [el_x1 + '/' + el_x2]
                    # here the operation is between two dataframes
                    data = df_x1[el_x1].divide(df_x2[el_x2])
                    df_x_list.append(pd.DataFrame(data, columns=cols))

    if df_f1_list:
        df_f1 = pd.concat(df_f1_list, axis=1)
        col_f1 = df_f1.columns.tolist()
        df_list.append(df_f1)

    if df_x_list:
        df_x = pd.concat(df_x_list, axis=1)
        col_x = df_x.columns.tolist()
        df_list.append(df_x)

    logger.debug('\n l1-l0 feature creation')

    # log size and an example feature for each subgroup
    subgroups = [('a0', df_a0), ('a1', df_a1), ('a2', df_a2), ('a3', df_a3), ('a4', df_a4),
                 ('b0', df_b0 if df_b0_list else pd.DataFrame()),
                 ('b1', df_b1 if df_b1_list else pd.DataFrame()),
                 ('b2', df_b2 if df_b2_list else pd.DataFrame()),
                 ('b3', df_b3 if df_b3_list else pd.DataFrame()),
                 ('c3', df_c3 if df_c3_list else pd.DataFrame()),
                 ('d3', df_d3 if df_d3_list else pd.DataFrame()),
                 ('e3', df_e3 if df_e3_list else pd.DataFrame()),
                 ('f1', df_f1 if df_f1_list else pd.DataFrame()),
                 ('x', df_x if df_x_list else pd.DataFrame())]

    for name, df_sub in subgroups:
        if not df_sub.empty:
            logger.debug('Number of features in subgroup {0}: {1}'.format(name, df_sub.shape[1]))
            logger.debug('Example of feature in subgroup {0}: {1}'.format(
                name, df_sub.columns.tolist()[random.randint(0, df_sub.shape[1] - 1)]))
        else:
            logger.debug('No features in subgroup {0}.'.format(name))

    logger.debug('Please see Phys. Rev. Lett. 114, 105503 (2015), Supplementary Information, '
                 'for more details.\n')

    if df_list:
        df_combined_features = pd.concat(df_list, axis=1)
    else:
        logger.error('No features selected. Please select at least two primary features.')
        sys.exit(1)

    logger.info('Number of total features generated: {0}'.format(df_combined_features.shape[1]))

    return df_combined_features
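# Illustrative call (assumptions: `df_primary` holds primary-feature columns
# named as in `dict_features` above, e.g. 'atomic_homo(A)', 'atomic_rs_max(B)',
# and `metadata_info` maps those names to units as sketched after
# _get_scaling_factors):
#
# >>> df_comb = combine_features(df=df_primary,
# ...                            energy_unit='hartree', length_unit='angstrom',
# ...                            metadata_info=metadata_info,
# ...                            allowed_operations=['+', '-', '|+|', '|-|', '/', '^2', '^3', 'exp'],
# ...                            derived_features=['r_sigma', 'r_pi'])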
def l1_l0_minimization(y_true, D, features, energy_unit=None, print_lasso=False, lambda_grid=None,
                       lassonumber=25, max_dim=3, lambda_grid_points=100, lambda_max_factor=1.0,
                       lambda_min_factor=0.001):
    """Select an optimal descriptor using a combined l1-l0 procedure.

    1. step (l1): Solve the LASSO minimization problem

    .. math:: argmin_c \\{||P - Dc||^2 + \\lambda ||c||_1\\}

    (P is the target vector `y_true`, D the data matrix) for different lambdas, starting
    from a 'high' lambda. Collect all indices (features) i appearing with nonzero
    coefficients c_i while decreasing lambda, until the size of the collection equals
    `lassonumber`.

    2. step (l0): Check the least-squares errors for all single features/pairs/triples/...
    of the collection from the first step. Choose the single/pair/triple/... with the
    lowest mean squared error (MSE) to be the best 1D/2D/3D-descriptor.

    Parameters:

    y_true : array, [n_samples]
        Array with the target property (ground truth).

    D : array, [n_samples, n_features]
        Matrix with the data.

    features : list of strings
        List of feature names. Needs to be in the same order as the feature vectors in `D`.

    max_dim : int, default 3
        Maximum dimension for which the optimal descriptor is calculated; the dimension is
        the number of feature vectors used in the linear combination.

    lassonumber : int, default 25
        The number of features which will be collected in the l1-step.

    lambda_grid_points : int, default 100
        Number of lambdas between `lam_max` and `lam_min` for which the l1-problem is
        solved. Sometimes a denser grid may be needed, if the lambda-steps are too high;
        this can be checked with `print_lasso`. `lam_max` and `lam_min` are chosen as in
        Tibshirani's paper "Regularization Paths for Generalized Linear Models via
        Coordinate Descent". The values in between are generated on the log scale.

    lambda_min_factor : float, default 0.001
        Sets `lam_min` = `lambda_min_factor` * `lam_max`.

    lambda_max_factor : float, default 1.0
        Sets the calculated `lam_max` = `lam_max` * `lambda_max_factor`.

    print_lasso : bool, default False
        Print the indices of the columns of `D` with nonzero coefficients for each lambda.

    lambda_grid : array, optional
        User-supplied list/array of lambda values for the l1-problem. It should start from
        the highest value, and lambda_i > lambda_(i+1) should hold. If given,
        `lambda_grid_points` is ignored.

    Returns:

    tuple (out_df, y_pred, y_true)
        `out_df` is a list of pandas dataframes, one per dimension, holding the columns of
        the selected model D' (the last column, 'Intercept', is a vector of ones);
        `y_pred` is the list of predictions D'*c' for each dimension;
        `y_true` is the (flattened) target array.

    References:

    .. [1] L. M. Ghiringhelli, J. Vybiral, S. V. Levchenko, C. Draxl, and M. Scheffler,
       "Big Data of Materials Science: Critical Role of the Descriptor",
       Phys. Rev. Lett. 114, 105503 (2015).

    """
    dimrange = range(1, max_dim + 1)
    compounds = len(y_true)

    # standardize D
    Dstan = np.array(scipy.stats.zscore(D))
    y_true = y_true.flatten()

    logger.info('Selecting optimal descriptors.')

    if lambda_grid is None:
        # find the max lambda and build the lambda grid as in Tibshirani's paper
        # "Regularization Paths for Generalized Linear Models via Coordinate Descent";
        # lam_max can be raised with the factor lambda_max_factor
        correlations = np.asarray(abs(np.dot(y_true, Dstan)))
        lam_max = max(correlations) / compounds
        lam_min = lam_max * lambda_min_factor
        lam_max = lambda_max_factor * lam_max
        log_max, log_min = np.log10(lam_max), np.log10(lam_min)
        lambda_grid = [pow(10, i) for i in np.linspace(log_min, log_max, lambda_grid_points)]
        lambda_grid.sort(reverse=True)

    # LASSO begin: iterate over the lambda grid and collect all indices (features)
    # with nonzero coefficients until len(collection) == lassonumber
    collection = []
    if print_lasso:
        logger.debug('lambda #collected indices')

    for lam in lambda_grid:
        lasso = linear_model.Lasso(alpha=lam, copy_X=True, fit_intercept=True, max_iter=100000,
                                   normalize=False, positive=False, precompute=False,
                                   random_state=None, selection='cyclic', tol=0.0001,
                                   warm_start=False)
        lasso.fit(Dstan, y_true)
        coef = lasso.coef_

        for pos in np.nonzero(coef)[0]:
            if pos not in collection:
                collection.append(pos)

        if print_lasso:
            # print the indices of nonzero coefficients for a given lambda
            # (this is NOT the collection at that moment)
            logger.debug('%.10f %s %s' % (lam, len(collection), np.nonzero(coef)[0]))

        if len(collection) > lassonumber - 1:
            break

    collection = sorted(collection[:lassonumber])
    # LASSO end

    # collection is the list of features that have been collected
    len_collection = len(collection)
    if len_collection < lassonumber:
        logger.debug("Only %s features were collected" % len_collection)

    # make a small matrix of size (compounds, lassonumber + 1), only with the features
    # selected by LASSO; the last column is a vector of ones (intercept)
    D_collection = D[:, collection]
    D_collection = np.column_stack((D_collection, np.ones(compounds)))

    # get the descriptors of the different dimensions and save the tuple
    # (D_model, coefficients, selected_features) for each dimension in the list `out`
    out = []
    out_df = []
    y_pred = []

    for dimension in dimrange:
        # L0: for each single feature/pair/triple/... save the least-squares error
        # with its coefficients and indices in the dictionary MSEdic
        MSEdic = {}
        for permu in itertools.combinations(range(len_collection), dimension):
            D_ls = D_collection[:, permu + (-1,)]
            x = np.linalg.lstsq(D_ls, y_true, rcond=None)

            # if there are linear dependencies in D_ls, np.linalg.lstsq gives no error
            # and len(x[1]) == 0 (there could be other reasons, too, which should be checked)
            if not len(x[1]) == 0:
                MSE = x[1][0] / compounds
                MSEdic.update({MSE: [x[0], permu]})

        # check if MSEdic is empty
        if not bool(MSEdic):
            logger.error('Could not find a configuration with lowest MSE.\n Try to select '
                         'more features\n or reduce the dimension of the descriptor.')
            sys.exit(1)

        # select the model with the lowest MSE
        minimum = min(MSEdic)
        logger.info("Root Mean Squared Error (RMSE) for {0}D descriptor: {1:.6e} {2}"
                    .format(dimension, sqrt(minimum), energy_unit))

        model = MSEdic[minimum]
        coefficients, good_permu = model[0], model[1]

        # transform the D_collection indices into D indices, get the strings of the
        # selected features from the list `features`, and get D_model
        selected_features = [features[collection[gp]] for gp in good_permu]
        D_model = D_collection[:, good_permu + (-1,)]

        # save the model and the feature strings for the current dimension
        out.append((D_model, coefficients, selected_features))

        # log the fitted model
        string = '{0}D case: \n'.format(dimension)
        for i in range(dimension + 1):
            if coefficients[i] > 0:
                sign = '+'
                c = coefficients[i]
            else:
                sign = '-'
                c = abs(coefficients[i])
            if i < dimension:
                string += '%s %.6e %s\n ' % (sign, c, selected_features[i])
            else:
                string += '%s %.6e\n' % (sign, c)
        logger.info(string)

        # calculate the predicted values;
        # the RMSE should be the same as above: sqrt(mean_squared_error(y_true, y_pred))
        y_pred.append(np.dot(D_model, coefficients))

        # create a pandas dataframe with the selected model
        selected_features.append('Intercept')
        out_df.append(pd.DataFrame(D_model, columns=selected_features))

    return out_df, y_pred, y_true
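# Self-contained sketch on synthetic data (random numbers, illustration only;
# note that the Lasso call above uses the `normalize` keyword of the
# scikit-learn versions contemporary to this module):
#
# >>> rng = np.random.RandomState(0)
# >>> D = rng.rand(50, 20)                      # 50 compounds, 20 candidate features
# >>> y = 2.0 * D[:, 3] - 0.5 * D[:, 7]         # target built from two of them
# >>> feats = ['f{}'.format(i) for i in range(20)]
# >>> out_df, y_pred, y_true = l1_l0_minimization(y, D, feats, energy_unit='eV', max_dim=2)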