Source code for ai4materials.models.l1_l0

# coding=utf-8
# Copyright 2016-2018 Angelo Ziletti
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

__author__ = "Angelo Ziletti"
__copyright__ = "Copyright 2018, Angelo Ziletti"
__maintainer__ = "Angelo Ziletti"
__email__ = "ziletti@fhi-berlin.mpg.de"
__date__ = "23/09/18"

import logging
import os
import json
import sys
import pandas as pd
import numpy as np
import random
import math
import itertools
import scipy.stats
from sklearn import linear_model
from math import exp, sqrt
import ai4materials.utils.unit_conversion as uc
logger = logging.getLogger('ai4materials')


[docs]def choose_atomic_features(selected_feature_list=None,
                           atomic_data_file=None, binary_data_file=None):
    """Choose primary features for the extended lasso procedure."""
    df1 = pd.read_csv(atomic_data_file, index_col=False)
    df2 = pd.read_csv(binary_data_file, index_col=False)

    # merge two dataframes on Material
    df = pd.merge(df1, df2, on='Mat')

    # calculate r_sigma and r_pi [Phys. Rev. Lett. 33, 1095(1974)]
    radii_s_p = ['rp(A)', 'rs(A)', 'rp(B)', 'rs(B)']
    df['r_sigma'] = df[radii_s_p].apply(r_sigma, axis=1)
    df['r_pi'] = df[radii_s_p].apply(r_pi, axis=1)

    # calculate Es/sqrt(Zval) and Ep/sqrt(Zval)
    e_val_z = ['Es(A)', 'val(A)']
    df['Es(A)/sqrt(Zval(A))'] = df[e_val_z].apply(e_sqrt_z, axis=1)
    e_val_z = ['Es(B)', 'val(B)']
    df['Es(B)/sqrt(Zval(B))'] = df[e_val_z].apply(e_sqrt_z, axis=1)

    e_val_z = ['Ep(A)', 'val(A)']
    df['Ep(A)/sqrt(Zval(A))'] = df[e_val_z].apply(e_sqrt_z, axis=1)
    e_val_z = ['Ep(B)', 'val(B)']
    df['Ep(B)/sqrt(Zval(B))'] = df[e_val_z].apply(e_sqrt_z, axis=1)

    column_list = df.columns.tolist()
    feature_list = column_list

    if 'Mat' in feature_list:
        feature_list.remove('Mat')
    if 'Edim' in feature_list:
        feature_list.remove('Edim')

    logger.debug("Available features: \n {}".format(feature_list))

    df_selected = df[selected_feature_list]
    df_selected.insert(0, 'Mat', df['Mat'])

    if selected_feature_list:
        logger.info("Primary features selected: \n {}".format(selected_feature_list))
    else:
        logger.error("No selected features.")
        sys.exit(1)

    return df_selected


[docs]def classify_rs_zb(structure):
    """Classify if a structure is rocksalt of zincblend from a list of NoMaD structure.
    (one json file). Supports multiple frames (TO DO: check that). Hard-coded.

    rocksalt:
    atom_frac1 0.0 0.0 0.0
    atom_frac2 0.5 0.5 0.5

    zincblende:
    atom_frac1 0.0  0.0  0.0
    atom_frac2 0.25 0.25 0.25

    zincblende --> label=0
    rocksalt  --> label=1
    """
    energy = {}
    chemical_formula = {}
    label = {}

    # gIndexRun=0
    # gIndexDesc=1

    for (gIndexRun, gIndexDesc), atoms in structure.atoms.iteritems():
        if atoms is not None:
            energy[gIndexRun, gIndexDesc] = structure.energy_eV[(gIndexRun, gIndexDesc)]
            # energy=1.0
            chemical_formula[gIndexRun, gIndexDesc] = structure.chemical_formula[(gIndexRun, gIndexDesc)]
            # get labels, works only for RS/ZB dataset
            pos_atom_2 = np.asarray(list(structure.scaled_positions.values())).reshape(2, 3)[1, :]
            if all(i < 0.375 for i in pos_atom_2):
                # label='zincblend'
                label[gIndexRun, gIndexDesc] = 0
            else:
                # label='rocksalt'
                label[gIndexRun, gIndexDesc] = 1

            break

    return chemical_formula, energy, label


[docs]def get_energy_diff(chemical_formula_list, energy_list, label_list):
    """ Obtain difference in energy (eV) between rocksalt and zincblend structures of a given binary.

    From a list of chemical formulas, energies and labels returns a dictionary
    with {`material`: `delta_e`} where `delta_e` is the difference between the energy
    with label 1 and energy with label 0, grouped by material.
    Each element of such list corresponds to a json file.
    The `delta_e` is exactly what reported in the PRL 114, 105503(2015).


    .. todo:: Check if it works for multiple frames.


    """
    energy_ = []
    chemical_formula_ = []
    label_ = []

    # energy and chemical formula are lists even if only one frame is present
    for i, energy_i in enumerate(energy_list):
        energy_.append(energy_i.values())

    for i, chemical_formula_i in enumerate(chemical_formula_list):
        chemical_formula_.append(chemical_formula_i.values())

    for i, label_i in enumerate(label_list):
        label_.append(label_i.values())

    # flatten the lists
    energy = list(itertools.chain(*energy_))
    chemical_formula = list(itertools.chain(*chemical_formula_))
    label = list(itertools.chain(*label_))

    df = pd.DataFrame()
    df['Mat'] = chemical_formula
    df['Energy'] = energy
    df['Label'] = label

    # generate summary dataframe with lowest zincblend and rocksalt energy
    # zincblend --> label=0
    # rocksalt  --> label=1
    df_summary = df.sort_values(by='Energy').groupby(['Mat', 'Label'], as_index=False).first()

    groupby_mat = df_summary.groupby('Mat')

    dict_delta_e = {}

    for mat, df in groupby_mat:
        # calculate the delta_e (E_RS - E_ZB)
        energy_label_1 = df.loc[df['Label'] == 1].Energy.values
        energy_label_0 = df.loc[df['Label'] == 0].Energy.values

        # if energy_diff>0 --> rs
        # if energy_diff<0 --> zb
        if (energy_label_0 and energy_label_1):
            # single element numpy array --> convert to scalar
            energy_diff = (energy_label_1 - energy_label_0).item(0)
            # divide by 2 because it is the energy_diff for each atom
            energy_diff = energy_diff / 2.0
        else:
            logger.error(
                "Could not find all the energies needed to calculate required property for material '{0}'".format(mat))
            sys.exit(1)

        dict_delta_e.update({mat: (energy_diff, energy_label_0, energy_label_1)})

    return dict_delta_e


[docs]def get_lowest_energy_structures(structure, dict_delta_e):
    """Get lowest energy structure for each material and label type.

    Works only with two possible labels for a given material.

    .. todo:: Check if it works for multiple frames.

    """
    energy = {}
    chemical_formula = {}
    is_lowest_energy = {}

    for (gIndexRun, gIndexDesc), atoms in structure.atoms.items():
        if atoms is not None:
            energy[gIndexRun, gIndexDesc] = structure.energy_eV[gIndexRun, gIndexDesc]
            chemical_formula[gIndexRun, gIndexDesc] = structure.chemical_formula[gIndexRun, gIndexDesc]

            lowest_energy_label_0 = dict_delta_e.get(chemical_formula[gIndexRun, gIndexDesc])[1]
            lowest_energy_label_1 = dict_delta_e.get(chemical_formula[gIndexRun, gIndexDesc])[2]

            if lowest_energy_label_0 > lowest_energy_label_1:
                lowest_energy_label_01 = lowest_energy_label_1
            else:
                lowest_energy_label_01 = lowest_energy_label_0

            if energy[gIndexRun, gIndexDesc] == lowest_energy_label_01:
                is_lowest_energy[gIndexRun, gIndexDesc] = True
            else:
                is_lowest_energy[gIndexRun, gIndexDesc] = False

    return is_lowest_energy


[docs]def write_atomic_features(structure, selected_feature_list, df, dict_delta_e=None,
                          path=None, filename_suffix='.json', json_file=None):
    """Given the chemical composition, build the descriptor made of atomic features only.

    Includes all the frames in the same json file.

    .. todo:: Check if it works for multiple frames.
    """

    # make dictionary {primary_feature: value} for each structure
    # dictionary of a dictionary, key: Mat, value: atomic_features
    dict_features = df.set_index('chemical_formula').T.to_dict()

    # label=0: rocksalt, label=1: zincblend
    #chemical_formula_, energy_, label_ = classify_rs_zb(structure)

    #is_lowest_energy_ = get_lowest_energy_structures(structure, dict_delta_e)

    if structure.isPeriodic == True:
        for (gIndexRun, gIndexDesc), atoms in structure.atoms.items():
            if atoms is not None:
                # filename is the normalized absolute path
                filename = os.path.abspath(os.path.normpath(os.path.join(path,
                                                                         '{0}{1}'.format(structure.name, filename_suffix))))

                outF = file(filename, 'w')
                outF.write("""
        {
              "data":[""")

                cell = structure.atoms[gIndexRun, gIndexDesc].get_cell()
                cell = np.transpose(cell)
                atoms = structure.atoms[gIndexRun, gIndexDesc]
                chemical_formula = structure.chemical_formula_[gIndexRun, gIndexDesc]
                energy = structure.energy_eV[gIndexRun, gIndexDesc]
                label = label_[gIndexRun, gIndexDesc]
                #target = dict_delta_e.get(chemical_formula_[gIndexRun, gIndexDesc])[0]
                target = dict_delta_e.get(chemical_formula)

                atomic_features = dict_features[structure.chemical_formula[gIndexRun, gIndexDesc]]
                #is_lowest_energy = is_lowest_energy_[gIndexRun,gIndexDesc]

                res = {
                    "checksum": structure.name,
                    "label": label,
                    "energy": energy,
                    #"is_lowest_energy": is_lowest_energy,
                    "delta_e_rs_zb": target,
                    "chemical_formula": chemical_formula,
                    "gIndexRun": gIndexRun,
                    "gIndexDesc": gIndexDesc,
                    "cell": cell.tolist(),
                    "particle_atom_number": map(lambda x: x.number, atoms),
                    "particle_position": map(lambda x: [x.x, x.y, x.z], atoms),
                    "atomic_features": atomic_features,
                    "main_json_file_name": json_file,
                }

                json.dump(res, outF, indent=2)
                outF.write("""
        ] }""")
                outF.flush()

    return filename


[docs]def r_sigma(row):
    """Calculates r_sigma.

    John-Bloch's indicator1: |rp(A) + rs(A) - rp(B) -rs(B)| from Phys. Rev. Lett. 33, 1095 (1974).

    Input rp(A), rs(A), rp(B), rs(B)
    They need to be given in this order.

    """
    return abs(row[0] + row[1] - row[2] + row[3])


[docs]def r_pi(row):
    """Calculates r_pi.

    John-Bloch's indicator2: |rp(A) - rs(A)| +| rp(B) -rs(B)| from Phys. Rev. Lett. 33, 1095 (1974).
    Input rp(A), rs(A), rp(B), rs(B)
    They need to be given in this order.
    combine_features
    """
    return abs(row[0] - row[1]) + abs(row[2] - row[3])


[docs]def e_sqrt_z(row):
    """Calculates e/sqrt(val_Z).

    Es/sqrt(Zval) and Ep/sqrt(Zval) from Phys. Rev. B 85, 104104 (2012).
    Input Es(A) or Ep(A), val(A)  (A-->B)
    They need to be given in this order.

    """
    return row[0] / math.sqrt(row[1])


def _get_scaling_factors(columns, metadata_info, energy_unit, length_unit):
    """Calculates characteristic energy and length, given an atomic metadata"""
    scaling_factor = []
    if columns is not None:
        for col in columns:
            try:
                col_unit = metadata_info[col.split('(', 1)[0]]['units']

                # check allowed values, to avoid problem with substance - NOT IDEAD
                if col_unit == 'J':
                    scaling_factor.append(uc.convert_unit(1, energy_unit, target_unit='eV'))
                    # divide all column by e_0
                    #df.loc[:, col] *= e_0
                elif col_unit == 'm':
                    scaling_factor.append(uc.convert_unit(1, length_unit, target_unit='angstrom'))
                    # divide all column by e_0
                    #df.loc[:, col] *= d_0
                else:
                    scaling_factor.append(1.0)
                    logger.debug("Feature units are not energy nor lengths. "
                                 "No scale to characteristic length.")
            except BaseException:
                scaling_factor.append(1.0)
                logger.debug("Feature units not included in metadata")

    return scaling_factor


def _my_power_2(row):
    return pow(row[0], 2)


def _my_power_3(row):
    return pow(row[0], 3)


def _my_power_m1(row):
    return pow(row[0], -1)


def _my_power_m2(row):
    return pow(row[0], -2)


def _my_power_m3(row):
    return pow(row[0], -3)


def _my_abs_sqrt(row):
    return math.sqrtabs(abs(row[0]))


def _my_exp(row):
    return exp(row[0])


def _my_exp_power_2(row):
    return exp(pow(row[0], 2))


def _my_exp_power_3(row):
    return exp(pow(row[0], 3))


def _my_sum(row):
    return row[0] + row[1]


def _my_abs_sum(row):
    return abs(row[0] + row[1])


def _my_abs_diff(row):
    return abs(row[0] - row[1])


def _my_diff(row):
    return row[0] - row[1]


def _my_div(row):
    return row[0] / row[1]


def _my_sum_power_2(row):
    return pow((row[0] + row[1]), 2)


def _my_sum_power_3(row):
    return pow((row[0] + row[1]), 3)


def _my_sum_exp(row):
    return exp(row[0] + row[1])


def _my_sum_exp_power_2(row):
    return exp(pow(row[0] + row[1], 2))


def _my_sum_exp_power_3(row):
    return exp(pow(row[0] + row[1], 3))


[docs]def combine_features(df=None, energy_unit=None, length_unit=None,
                     metadata_info=None, allowed_operations=None, derived_features=None):
    """Generate combination of features given a dataframe and a list of allowed operations.

    For the exponentials, we introduce a characteristic energy/length
    converting the
    ..todo:: Fix under/overflow errors, and introduce handling of exceptions.

    """

    if allowed_operations:
        logger.info('Selected operations:\n {0}'.format(allowed_operations))
    else:
        logger.warning('No allowed operations selected.')

    # make derived features
    if derived_features is not None:
        if 'r_sigma' in derived_features:
            # calculate r_sigma and r_pi [Phys. Rev. Lett. 33, 1095(1974)]
            logger.info('Including rs and rp to allow r_sigma calculation')
            radii_s_p = ['atomic_rp_max(A)', 'atomic_rs_max(A)', 'atomic_rp_max(B)', 'atomic_rs_max(B)']
            df['r_sigma'] = df[radii_s_p].apply(r_sigma, axis=1)

        if 'r_pi' in derived_features:
            logger.info('Including rs and rp to allow r_pi calculation')
            radii_s_p = ['atomic_rp_max(A)', 'atomic_rs_max(A)', 'atomic_rp_max(B)', 'atomic_rs_max(B)']
            df['r_pi'] = df[radii_s_p].apply(r_pi, axis=1)

    # calculate Es/sqrt(Zval) and Ep/sqrt(Zval)
#    e_val_z = ['Es(A)', 'val(A)']
#    df['Es(A)/sqrt(Zval(A))'] = df[e_val_z].apply(e_sqrt_z, axis=1)
#    e_val_z = ['Es(B)', 'val(B)']
#    df['Es(B)/sqrt(Zval(B))'] = df[e_val_z].apply(e_sqrt_z, axis=1)
#
#    e_val_z = ['Ep(A)', 'val(A)']
#    df['Ep(A)/sqrt(Zval(A))'] = df[e_val_z].apply(e_sqrt_z, axis=1)
#    e_val_z = ['Ep(B)', 'val(B)']
#    df['Ep(B)/sqrt(Zval(B))'] = df[e_val_z].apply(e_sqrt_z, axis=1)

    columns_ = df.columns.tolist()

    # define subclasses of features (see Phys. Rev. Lett. 114, 105503(2015) Supp. info. pag.1)
    # make a dictionary {feature: subgroup}
    # features belonging to a0 will not be combined, just added at the end

#    dict_features = {
#        u'val(B)': 'a0', u'val(A)': 'a0',
#
#        u'period__el0':'a0',
#        u'period__el1':'a0',
#        u'atomic_number__el0': 'a0',
#        u'atomic_number__el1': 'a0',
#        u'group__el0': 'a0',
#        u'group__el1': 'a0',
#
#        u'atomic_ionization_potential__el0': 'a1',
#        u'atomic_ionization_potential__el1': 'a1',
#        u'atomic_electron_affinity__el0': 'a1',
#        u'atomic_electron_affinity__el1': 'a1',
#        u'atomic_homo_lumo_diff__el0': 'a1',
#        u'atomic_homo_lumo_diff__el1': 'a1',
#        u'atomic_electronic_binding_energy_el0': 'a1',
#        u'atomic_electronic_binding_energy_el1': 'a1',
#
#
#        u'HOMO(A)': 'a2', u'LUMO(A)': 'a2', u'HOMO(B)': 'a2', u'LUMO(B)': 'a2',
#        u'HL_gap_AB': 'a2',
#        u'Ebinding_AB': 'a2',
#
#        u'atomic_rs_max__el0': 'a3',
#        u'atomic_rs_max__el1': 'a3',
#        u'atomic_rp_max__el0': 'a3',
#        u'atomic_rp_max__el1': 'a3',
#        u'atomic_rd_max__el0': 'a3',
#        u'atomic_rd_max__el1': 'a3',
#        u'atomic_r_by_2_dimer__el0': 'a3',
#        u'atomic_r_by_2_dimer__el1': 'a3',
#
#        u'd_AB': 'a3',
#        u'r_sigma': 'a3', u'r_pi': 'a3',
#
#        u'Eh': 'a4', u'C': 'a4'
#        }

    dict_features = {
        u'period': 'a0',
        u'atomic_number': 'a0',
        u'group': 'a0',

        u'atomic_ionization_potential': 'a1',
        u'atomic_electron_affinity': 'a1',
        u'atomic_homo_lumo_diff': 'a1',
        u'atomic_electronic_binding_energy': 'a1',

        u'atomic_homo': 'a2', u'atomic_lumo': 'a2',


        u'atomic_rs_max': 'a3',
        u'atomic_rp_max': 'a3',
        u'atomic_rd_max': 'a3',
        u'atomic_r_by_2_dimer': 'a3',

        u'r_sigma': 'a3', u'r_pi': 'a3'

    }

    # standardize the data -
    # we cannot reproduce the PRL if we standardize the data
    #df_a0 = (df_a0 - df_a0.mean()) / (df_a0.max() - df_a0.min())
    #df_a1 = (df_a1 - df_a1.mean()) / (df_a1.max() - df_a1.min())
    #df_a2 = (df_a2 - df_a2.mean()) / (df_a2.max() - df_a2.min())
    #df_a3 = (df_a3 - df_a3.mean()) / (df_a3.max() - df_a3.min())
    #df_a4 = (df_a4 - df_a4.mean()) / (df_a4.max() - df_a4.min())


#    df_a0 = df[[col for col in columns_ if dict_features.get(col)=='a0']].astype('float32')
    df_a0 = df[[col for col in columns_ if dict_features.get(col.split('(', 1)[0]) == 'a0']].astype('float32')
    df_a1 = df[[col for col in columns_ if dict_features.get(col.split('(', 1)[0]) == 'a1']].astype('float32')
    df_a2 = df[[col for col in columns_ if dict_features.get(col.split('(', 1)[0]) == 'a2']].astype('float32')
    df_a3 = df[[col for col in columns_ if dict_features.get(col.split('(', 1)[0]) == 'a3']].astype('float32')
    df_a4 = df[[col for col in columns_ if dict_features.get(col.split('(', 1)[0]) == 'a4']].astype('float32')

    col_a0 = df_a0.columns.tolist()
    col_a1 = df_a1.columns.tolist()
    col_a2 = df_a2.columns.tolist()
    col_a3 = df_a3.columns.tolist()
    col_a4 = df_a4.columns.tolist()

    #  this list will at the end all the dataframes created
    df_list = []

    df_b0_list = []
    df_b1_list = []
    df_b2_list = []
    df_b3_list = []
    df_c3_list = []
    df_d3_list = []
    df_e3_list = []
    df_f1_list = []
    df_f2_list = []
    df_f3_list = []
    df_x1_list = []
    df_x2_list = []
    df_x_list = []

    # create b0: absolute differences and sums of a0
    # this is not in the PRL.
    for subset in itertools.combinations(col_a0, 2):
        if '+' in allowed_operations:
            cols = ['(' + subset[0] + '+' + subset[1] + ')']
            data = df_a0[list(subset)].apply(_my_sum, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if '-' in allowed_operations:
            cols = ['(' + subset[0] + '-' + subset[1] + ')']
            data = df_a0[list(subset)].apply(_my_diff, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

            cols = ['(' + subset[1] + '-' + subset[0] + ')']
            data = df_a0[list(subset)].apply(_my_diff, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if '|+|' in allowed_operations:
            cols = ['|' + subset[0] + '+' + subset[1] + '|']
            data = df_a0[list(subset)].apply(_my_abs_sum, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if '|-|' in allowed_operations:
            cols = ['|' + subset[0] + '-' + subset[1] + '|']
            data = df_a0[list(subset)].apply(_my_abs_diff, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if '/' in allowed_operations:
            cols = [subset[0] + '/' + subset[1]]
            data = df_a0[list(subset)].apply(_my_div, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

            cols = [subset[1] + '/' + subset[0]]
            data = df_a0[list(subset)].apply(_my_div, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

    # we kept itertools.combinations to make the code more uniform with the binary operations
    for subset in itertools.combinations(col_a0, 1):
        if '^2' in allowed_operations:
            cols = [subset[0] + '^2']
            data = df_a0[list(subset)].apply(_my_power_2, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if '^3' in allowed_operations:
            cols = [subset[0] + '^3']
            data = df_a0[list(subset)].apply(_my_power_3, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

        if 'exp' in allowed_operations:
            cols = ['exp(' + subset[0] + ')']
            data = df_a0[list(subset)].apply(_my_exp, axis=1)
            df_b0_list.append(pd.DataFrame(data, columns=cols))

    # create b1: absolute differences and sums of a1
    for subset in itertools.combinations(col_a1, 2):
        if '+' in allowed_operations:
            cols = ['(' + subset[0] + '+' + subset[1] + ')']
            data = df_a1[list(subset)].apply(_my_sum, axis=1)
            df_b1_list.append(pd.DataFrame(data, columns=cols))

        if '-' in allowed_operations:
            cols = ['(' + subset[0] + '-' + subset[1] + ')']
            data = df_a1[list(subset)].apply(_my_diff, axis=1)
            df_b1_list.append(pd.DataFrame(data, columns=cols))

        if '|+|' in allowed_operations:
            cols = ['|' + subset[0] + '+' + subset[1] + '|']
            data = df_a1[list(subset)].apply(_my_abs_sum, axis=1)
            df_b1_list.append(pd.DataFrame(data, columns=cols))

        if '|-|' in allowed_operations:
            cols = ['|' + subset[0] + '-' + subset[1] + '|']
            data = df_a1[list(subset)].apply(_my_abs_diff, axis=1)
            df_b1_list.append(pd.DataFrame(data, columns=cols))

    # create b2: absolute differences and sums of a2
    for subset in itertools.combinations(col_a2, 2):
        if '+' in allowed_operations:
            cols = ['(' + subset[0] + '+' + subset[1] + ')']
            data = df_a2[list(subset)].apply(_my_sum, axis=1)
            df_b2_list.append(pd.DataFrame(data, columns=cols))

        if '-' in allowed_operations:
            cols = ['(' + subset[0] + '-' + subset[1] + ')']
            data = df_a2[list(subset)].apply(_my_diff, axis=1)
            df_b2_list.append(pd.DataFrame(data, columns=cols))

        if '|+|' in allowed_operations:
            cols = ['|' + subset[0] + '+' + subset[1] + '|']
            data = df_a2[list(subset)].apply(_my_abs_sum, axis=1)
            df_b2_list.append(pd.DataFrame(data, columns=cols))

        if '|-|' in allowed_operations:
            cols = ['|' + subset[0] + '-' + subset[1] + '|']
            data = df_a2[list(subset)].apply(_my_abs_diff, axis=1)
            df_b2_list.append(pd.DataFrame(data, columns=cols))

    # create b3: absolute differences and sums of a3
    for subset in itertools.combinations(col_a3, 2):
        if '+' in allowed_operations:
            cols = ['(' + subset[0] + '+' + subset[1] + ')']
            data = df_a3[list(subset)].apply(_my_sum, axis=1)
            df_b3_list.append(pd.DataFrame(data, columns=cols))

        if '-' in allowed_operations:
            cols = ['(' + subset[0] + '-' + subset[1] + ')']
            data = df_a3[list(subset)].apply(_my_diff, axis=1)
            df_b3_list.append(pd.DataFrame(data, columns=cols))

        if '|+|' in allowed_operations:
            cols = ['|' + subset[0] + '+' + subset[1] + '|']
            data = df_a3[list(subset)].apply(_my_abs_sum, axis=1)
            df_b3_list.append(pd.DataFrame(data, columns=cols))

        if '|-|' in allowed_operations:
            cols = ['|' + subset[0] + '-' + subset[1] + '|']
            data = df_a3[list(subset)].apply(_my_abs_diff, axis=1)
            df_b3_list.append(pd.DataFrame(data, columns=cols))

    # create c3: two steps:
    # 1) squares of a3 - unary operations
    # we kept itertools.combinations to make the code more uniform with the binary operations
    for subset in itertools.combinations(col_a3, 1):
        if '^2' in allowed_operations:
            cols = [subset[0] + '^2']
            data = df_a3[list(subset)].apply(_my_power_2, axis=1)
            df_c3_list.append(pd.DataFrame(data, columns=cols))
        if '^3' in allowed_operations:
            cols = [subset[0] + '^3']
            data = df_a3[list(subset)].apply(_my_power_3, axis=1)
            df_c3_list.append(pd.DataFrame(data, columns=cols))

    # 2) squares of b3 (only sums) --> sum squared of a3
    for subset in itertools.combinations(col_a3, 2):
        if '^2' in allowed_operations:
            cols = ['(' + subset[0] + '+' + subset[1] + ')^2']
            data = df_a3[list(subset)].apply(_my_sum_power_2, axis=1)
            df_c3_list.append(pd.DataFrame(data, columns=cols))

        if '^3' in allowed_operations:
            cols = ['(' + subset[0] + '+' + subset[1] + ')^3']
            data = df_a3[list(subset)].apply(_my_sum_power_3, axis=1)
            df_c3_list.append(pd.DataFrame(data, columns=cols))

    # create d3: two steps:
    # 1) exponentials of a3 - unary operations
    # we kept itertools.combinations to make the code more uniform with the binary operations
    for subset in itertools.combinations(col_a3, 1):
        if 'exp' in allowed_operations:
            cols = ['exp(' + subset[0] + ')']
            # find scaling factor for e_0 or d_0 for scaling
            # and multiply each column by the scaling factor
            scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
            df_subset = df_a3[list(subset)] * scaling_factors
            data = df_subset.apply(_my_exp, axis=1)
            df_d3_list.append(pd.DataFrame(data, columns=cols))

    # 2) exponentials of b3 (only sums) --> exponential of sum of a3
    for subset in itertools.combinations(col_a3, 2):
        if 'exp' in allowed_operations:
            cols = ['exp(' + subset[0] + '+' + subset[1] + ')']
            # find scaling factor for e_0 or d_0 for scaling
            # and multiply each column by the scaling factor
            scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
            df_subset = df_a3[list(subset)] * scaling_factors
            data = df_subset.apply(_my_sum_exp, axis=1)
            df_d3_list.append(pd.DataFrame(data, columns=cols))

    # create e3: two steps:
    # 1) exponentials of squared a3 - unary operations
    # we kept itertools.combinations to make the code more uniform with the binary operations
    for subset in itertools.combinations(col_a3, 1):
        operations = {'exp', '^2'}
        if operations <= set(allowed_operations):
            cols = ['exp(' + subset[0] + '^2)']
            # find scaling factor for e_0 or d_0 for scaling
            # and multiply each column by the scaling factor
            scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
            df_subset = df_a3[list(subset)] * scaling_factors
            data = df_subset.apply(_my_exp_power_2, axis=1)
            df_e3_list.append(pd.DataFrame(data, columns=cols))

        operations = {'exp', '^3'}
        if operations <= set(allowed_operations):
            try:
                cols = ['exp(' + subset[0] + '^3)']
                # find scaling factor for e_0 or d_0 for scaling
                # and multiply each column by the scaling factor
                scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
                df_subset = df_a3[list(subset)] * scaling_factors
                data = df_subset.apply(_my_exp_power_3, axis=1)
                df_e3_list.append(pd.DataFrame(data, columns=cols))
            except OverflowError as e:
                logger.warning('Dropping feature combination that caused under/overflow.\n')

    # 2) exponentials of b3 (only sums) --> exponential of sum of a3
    for subset in itertools.combinations(col_a3, 2):
        operations = {'exp', '^2'}
        if operations <= set(allowed_operations):
            cols = ['exp((' + subset[0] + '+' + subset[1] + ')^2)']
            # find scaling factor for e_0 or d_0 for scaling
            # and multiply each column by the scaling factor
            scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
            df_subset = df_a3[list(subset)] * scaling_factors
            data = df_subset.apply(_my_sum_exp_power_2, axis=1)
            df_e3_list.append(pd.DataFrame(data, columns=cols))

        operations = {'exp', '^3'}
        if operations <= set(allowed_operations):
            try:
                cols = ['exp((' + subset[0] + '+' + subset[1] + ')^3)']
                # find scaling factor for e_0 or d_0 for scaling
                # and multiply each column by the scaling factor
                scaling_factors = _get_scaling_factors(list(subset), metadata_info, energy_unit, length_unit)
                df_subset = df_a3[list(subset)] * scaling_factors
                data = df_subset.apply(_my_sum_exp_power_3, axis=1)
                df_e3_list.append(pd.DataFrame(data, columns=cols))
            except OverflowError as e:
                logger.warning('Dropping feature combination that caused under/overflow.\n')

    # make dataframes from lists, check if they are not empty
    # we make there here because they are going to be used to further
    # combine the features
    if not df_a0.empty:
        df_list.append(df_a0)

    if not df_a1.empty:
        df_x1_list.append(df_a1)
        df_list.append(df_a1)

    if not df_a2.empty:
        df_x1_list.append(df_a2)
        df_list.append(df_a2)

    if not df_a3.empty:
        df_x1_list.append(df_a3)
        df_list.append(df_a3)

    if not df_a4.empty:
        df_list.append(df_a4)

    if df_b0_list:
        df_b0 = pd.concat(df_b0_list, axis=1)
        col_b0 = df_b0.columns.tolist()
        df_b0.to_csv('./df_b0.csv', index=True)
        df_list.append(df_b0)

    if df_b1_list:
        df_b1 = pd.concat(df_b1_list, axis=1)
        col_b1 = df_b1.columns.tolist()
        df_x1_list.append(df_b1)
        df_list.append(df_b1)

    if df_b2_list:
        df_b2 = pd.concat(df_b2_list, axis=1)
        col_b2 = df_b2.columns.tolist()
        df_x1_list.append(df_b2)
        df_list.append(df_b2)

    if df_b3_list:
        df_b3 = pd.concat(df_b3_list, axis=1)
        col_b3 = df_b3.columns.tolist()
        df_x1_list.append(df_b3)
        df_list.append(df_b3)

    if df_c3_list:
        df_c3 = pd.concat(df_c3_list, axis=1)
        col_c3 = df_c3.columns.tolist()
        df_x2_list.append(df_c3)
        df_list.append(df_c3)

    if df_d3_list:
        df_d3 = pd.concat(df_d3_list, axis=1)
        col_d3 = df_d3.columns.tolist()
        df_x2_list.append(df_d3)
        df_list.append(df_d3)

    if df_e3_list:
        df_e3 = pd.concat(df_e3_list, axis=1)
        col_e3 = df_e3.columns.tolist()
        df_x2_list.append(df_e3)
        df_list.append(df_e3)

    if df_x1_list:
        df_x1 = pd.concat(df_x1_list, axis=1)
        col_x1 = df_x1.columns.tolist()

    if df_x2_list:
        df_x2 = pd.concat(df_x2_list, axis=1)
        col_x2 = df_x2.columns.tolist()

    # create f1 - abs differences and sums of b1 without repetitions
    # TO DO: calculate f1

    # create x - ratios of any of {a_i, b_i} i=1,2,3
    # with any of {c3, d3, e3} - typo in the PRL - no a3
    # total = (4+4+6+12+12+30)*(21+21+21) = 68*63 = 4284
    # for subset in itertools.combinations(col_a3, 1):
    #    if 'exp' in allowed_operations:
    #        cols = ['exp('+subset[0]+'^2)']
    #        data = df_a3[list(subset)].apply(_my_exp_power_2, axis=1)
    #        df_e3_list.append(pd.DataFrame(data, columns=cols))

    if df_x1_list and df_x2_list:
        for el_x1 in col_x1:
            for el_x2 in col_x2:
                if '/' in allowed_operations:
                    cols = [el_x1 + '/' + el_x2]
                    # now the operation is between two dataframes
                    data = df_x1[el_x1].divide(df_x2[el_x2])
                    df_x_list.append(pd.DataFrame(data, columns=cols))

    if df_f1_list:
        df_f1 = pd.concat(df_f1_list, axis=1)
        col_f1 = df_f1.columns.tolist()
        df_list.append(df_f1)

    if df_x_list:
        df_x = pd.concat(df_x_list, axis=1)
        col_x = df_x.columns.tolist()
        df_list.append(df_x)

    logger.debug('\n l1-l0 feature creation')

    if not df_a0.empty:
        logger.debug('Number of features in subgroup a0: {0}'.format(df_a0.shape[1]))
        logger.debug('Example of feature in subgroup a0: {0}'
                     .format(df_a0.columns.tolist()[random.randint(0, df_a0.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup a0.')

    if not df_a1.empty:
        logger.debug('Number of features in subgroup a1: {0}'.format(df_a1.shape[1]))
        logger.debug('Example of feature in subgroup a1: {0}'
                     .format(df_a1.columns.tolist()[random.randint(0, df_a1.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup a1.')

    if not df_a2.empty:
        logger.debug('Number of features in subgroup a2: {0}'.format(df_a2.shape[1]))
        logger.debug('Example of feature in subgroup a2: {0}'
                     .format(df_a2.columns.tolist()[random.randint(0, df_a2.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup a2.')

    if not df_a3.empty:
        logger.debug('Number of features in subgroup a3: {0}'.format(df_a3.shape[1]))
        logger.debug('Example of feature in subgroup a3: {0}'
                     .format(df_a3.columns.tolist()[random.randint(0, df_a3.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup a3.')

    if not df_a4.empty:
        logger.debug('Number of features in subgroup a4: {0}'.format(df_a4.shape[1]))
        logger.debug('Example of feature in subgroup a4: {0}'
                     .format(df_a3.columns.tolist()[random.randint(0, df_a4.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup a4.')

    if df_b0_list:
        logger.debug('Number of features in subgroup b0: {0}'.format(df_b0.shape[1]))
        logger.debug('Example of feature in subgroup b0: {0}'
                     .format(df_b0.columns.tolist()[random.randint(0, df_b0.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup b0.')

    if df_b1_list:
        logger.debug('Number of features in subgroup b1: {0}'.format(df_b1.shape[1]))
        logger.debug('Example of feature in subgroup b1: {0}'
                     .format(df_b1.columns.tolist()[random.randint(0, df_b1.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup b1.')

    if df_b2_list:
        logger.debug('Number of features in subgroup b2: {0}'.format(df_b2.shape[1]))
        logger.debug('Example of feature in subgroup b2: {0}'
                     .format(df_b2.columns.tolist()[random.randint(0, df_b2.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup b2.')

    if df_b3_list:
        logger.debug('Number of features in subgroup b3: {0}'.format(df_b3.shape[1]))
        logger.debug('Example of feature in subgroup b3: {0}'
                     .format(df_b3.columns.tolist()[random.randint(0, df_b3.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup b3.')

    if df_c3_list:
        logger.debug('Number of features in subgroup c3: {0}'.format(df_c3.shape[1]))
        logger.debug('Example of feature in subgroup c3: {0}'
                     .format(df_c3.columns.tolist()[random.randint(0, df_c3.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup c3.')

    if df_d3_list:
        logger.debug('Number of features in subgroup d3: {0}'.format(df_d3.shape[1]))
        logger.debug('Example of feature in subgroup d3: {0}'
                     .format(df_d3.columns.tolist()[random.randint(0, df_d3.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup d3.')

    if df_e3_list:
        logger.debug('Number of features in subgroup e3: {0}'.format(df_e3.shape[1]))
        logger.debug('Example of feature in subgroup e3: {0}'
                     .format(df_e3.columns.tolist()[random.randint(0, df_e3.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup e3.')

    if df_f1_list:
        logger.debug('Number of features in subgroup f1: {0}'.format(df_f1.shape[1]))
        logger.debug('Example of feature in subgroup f1: {0}'
                     .format(df_f1.columns.tolist()[random.randint(0, df_f1.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup f1.')

    if df_x_list:
        logger.debug('Number of features in subgroup x: {0}'.format(df_x.shape[1]))
        logger.debug('Example of feature in subgroup x: {0}'
                     .format(df_x.columns.tolist()[random.randint(0, df_x.shape[1] - 1)]))
    else:
        logger.debug('No features in subgroup x.')

    logger.debug('Please see Phys. Rev. Lett. 114, 105503(2015) Supplementary Information \n for more details.\n')

    if df_list:
        df_combined_features = pd.concat(df_list, axis=1)
    else:
        logger.error('No features selected. Please select at least two primary features.')
        sys.exit(1)

    logger.info('Number of total features generated: {0}'.format(df_combined_features.shape[1]))

    return df_combined_features


[docs]def l1_l0_minimization(y_true, D, features,
                       energy_unit=None,
                       print_lasso=False, lambda_grid=None, lassonumber=25,
                       max_dim=3, lambda_grid_points=100, lambda_max_factor=1.0, lambda_min_factor=0.001):
    """ Select an optimal descriptor using a combined l1-l0 procedure.

    1. step (l 1): Solve the LASSO minimization problem

    .. math::
        argmin_c {||P-Dc||^2 + \lambda |c|_1}

    for different lambdas, starting from a 'high' lambda.
    Collect all indices(Features) i appearing with nonzero coefficients c_i,
    while decreasing lambda, until size of collection equals `lassonumber`.

    2. step (l 0): Check the least-squares errors for all single features/pairs/triples/... of
                   collection from 1. step. Choose the single/pair/triple/... with the lowest
                   mean squared error (MSE) to be the best 1D/2D/3D-descriptor.


    Parameters:

    y_true : array, [n_samples]
        Array with the target property (ground truth)

    D : array, [n_samples, n_features]
        Matrix with the data.

    features : list of strings
        List of feature names. Needs to be in the same order as the feature vectors in D

    dimrange : list of int
        Specify for which dimensions the optimal descriptor is calculated.
        It is the number of feature vectors used in the linear combination

    lassonumber : int, default 25
        The number of features, which will be collected in ther l1-step

    lamdba_grid_points : int, default 100
        Number of lamdbas between lamdba_max and lambdba_min for which the l1-problem shall be solved.
        Sometimes a denser grid could be needed, if the lamda-steps are too high.
        This can be checked with 'print_lasso'. `lamdba_max` and `lamdba_min` are chosen as in
        Tibshirani's paper "Regularization Paths for Generalized Linear Models via Coordinate Descent".
        The values in between are generated on the log scale.

    lambda_min_factor : float, default 0.001
        Sets `lam_min` = `lambda_min_factor` * `lam_max`.

    lambda_max_factor : float, default 1.0
        Sets calculated `lam_max` = `lam_max` * `lambda_max_factor`.

    print_lasso: bool, default `True`
        Prints the indices of coulumns of `D` with nonzero coefficients for each lambda.

    lambda_grid: array
        The list/array of lambda values for the l1-problem can be chosen by the user.
        The list/array should start from the highest number and lambda_i > lamda_i+1 should hold.
        (?) `lambda_grid_point` is then ignored. (?)

    Returns:

    list of panda dataframes
    (D', c', selected_features) :
        A list of tuples (D',c',selected_features) for each dimension.
        `selected_features` is a list of strings. D'*c' is the selected linear model/fit where the last column
        of `D` is a vector with ones.


    References:

    .. [1] Luca M. Ghiringhelli, Jan Vybiral, Sergey V. Levchenko, Claudia Draxl, and Matthias Scheffler,
        "Big Data of Materials Science: Critical Role of the Descriptor"
        Phys. Rev. Lett. 114, 105503 (2015)


    """

    dimrange = range(1, max_dim + 1)
    compounds = len(y_true)

    # standardize D
    Dstan = np.array(scipy.stats.zscore(D))

    y_true = y_true.flatten()

    #lambda_grid=[pow(1.7,i) for i in np.arange(-40.,-1,1.)]
    # lambda_grid.sort(reverse=True)

    logger.info('Selecting optimal descriptors.')

    if lambda_grid is None:
            # find max lambda, and build lambda grid as in
            # Tibshirani's paper "Regularization Paths for Generalized Linear
            # Models via Coordinate Descent". Here lam_max can be set to a higher
            # with a factor lambda_max_factor
        correlations = abs(np.dot(y_true, Dstan))
        correlations = np.asarray(correlations)

        lam_max = max(correlations) / (compounds)
        lam_min = lam_max * lambda_min_factor
        lam_max = lambda_max_factor * lam_max
        log_max, log_min = np.log10(lam_max), np.log10(lam_min)

        lambda_grid = [pow(10, i) for i in np.linspace(log_min, log_max, lambda_grid_points)]
        lambda_grid.sort(reverse=True)

    # LASSO begin, iter over lamda grid, and collect all indices(Features)
    # with nonzero coefficient until len(collection)=lassonumber
    collection = []
    if print_lasso:
        logger.debug('lambda      #collected   Indices')
    for l, lam in enumerate(lambda_grid):
        lasso = linear_model.Lasso(alpha=lam, copy_X=True, fit_intercept=True,
                                   max_iter=100000, normalize=False, positive=False, precompute=False,
                                   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
        lasso.fit(Dstan, y_true)
        coef = lasso.coef_
        for pos in np.nonzero(coef)[0]:
            if not pos in collection:
                collection.append(pos)
        if print_lasso:  # print the indices of nonzero coefficients for a given lambda.
                            # (It is NOT the collection at that moment)
            logger.debug('%.10f   %s   %s' % (lam, len(collection), np.nonzero(coef)[0]))
        if len(collection) > lassonumber - 1:
            break
    collection = sorted(collection[:lassonumber])
    # LASSO end

    # collection is the list with the features that have been collected
    len_collection = len(collection)
    if len_collection < lassonumber:
        logger.debug("Only %s features are collected" % len_collection)
    # make small matrix with size of (compounds,lassonumber), only with  selected features from LASSO
    D_collection = D[:, collection]
    D_collection = np.column_stack((D_collection, np.ones(compounds)))

    # get the different dimensional descriptor and save the
    # tuple (D_model, coefficients, selected_features) for each dimension in the list out
    out = []
    out_df = []
    y_pred = []

    for dimension in dimrange:

        # L0: save for each single Feature/ pair, triple/... the Least-Squares-Error
        # with its coefficient and index in Dictionary MSEdic
        MSEdic = {}
        for permu in itertools.combinations(range(len_collection), dimension):
            D_ls = D_collection[:, permu + (-1,)]
            x = np.linalg.lstsq(D_ls, y_true, rcond=None)
            # if there are linear dependencies in D_ls np.linalg.lstsq gives no error and len(x[1])==0.
            if not len(x[1]) == 0:
                                   # (There could be other reasons, too...which should/could be checked)
                MSE = x[1][0] / compounds
                MSEdic.update({MSE: [x[0], permu]})

        # check if MSEdic is empty
        if not bool(MSEdic):
            logger.error('Could not find configuration with lowest MSE.\n Try to select ' +
                         'more features\n or reduce the number of the descriptor dimension. ')
            sys.exit(1)

        # select the model with the lowest MSE
        minimum = min(MSEdic)

        logger.info("Root Mean Squared Error (RMSE) for {0}D descriptor: {1:.6e} {2}"
                    .format(dimension, sqrt(minimum), energy_unit))
        model = MSEdic[minimum]
        coefficients, good_permu = model[0], model[1]

        # transform the D_collection-indices into D-indices, and get strings for selected features
        # from the list 'features' , and get D_model
        selected_features = [features[collection[gp]] for gp in good_permu]
        D_model = D_collection[:, good_permu + (-1,)]

        # save the model for the actual dimension and strings of features
        out.append((D_model, coefficients, selected_features))

        # print in terminal
        string = '{0}D case: \n'.format(dimension)
        for i in range(dimension + 1):
            if coefficients[i] > 0:
                sign = '+'
                c = coefficients[i]
            else:
                sign = '-'
                c = abs(coefficients[i])
            if i < dimension:
                string += '%s %.6e %s\n ' % (sign, c, selected_features[i])
            else:
                string += '%s %.6e\n' % (sign, c)
        logger.info(string)

        # calculate E_predict
        y_pred.append(np.dot(D_model, coefficients))

        # RMSE (it should be the same as before): sqrt(mean_squared_error(P, P_pred))

        # create panda dataframe
        selected_features.append('Intercept')
        data = D_model

        out_df.append(pd.DataFrame(data, columns=selected_features))

    return out_df, y_pred, y_true