Source code for ai4materials.models.cnn_polycrystals

# coding=utf-8
# Copyright 2016-2018 Angelo Ziletti
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

__author__ = "Angelo Ziletti"
__copyright__ = "Copyright 2018, Angelo Ziletti"
__maintainer__ = "Angelo Ziletti"
__email__ = "ziletti@fhi-berlin.mpg.de"
__date__ = "23/09/18"

from keras import backend as K
K.set_image_data_format("channels_last")

from keras.callbacks import CSVLogger
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.preprocessing.image import ImageDataGenerator
from ai4materials.utils.utils_plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from scipy import stats
from keras.optimizers import Adam
from keras.utils import np_utils
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
import logging
import numpy as np
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from collections import defaultdict

logger = logging.getLogger('ai4materials')


def train_neural_network(x_train, y_train, x_val, y_val, configs, partial_model_architecture, batch_size=32,
                         nb_epoch=5, normalize=True, checkpoint_dir=None, neural_network_name='my_neural_network',
                         training_log_file='training.log', early_stopping=False, data_augmentation=True):
    """Train a neural network to classify crystal structures represented as two-dimensional diffraction fingerprints.

    This model was introduced in [1]_.

    x_train: np.array, [batch, width, height, channels]

    .. [1] A. Ziletti, A. Leitherer, M. Scheffler, and L. M. Ghiringhelli,
        "Crystal-structure identification via Bayesian deep learning: towards superhuman performance",
        in preparation (2018)

    .. codeauthor:: Angelo Ziletti <angelo.ziletti@gmail.com>

    """
    if checkpoint_dir is None:
        checkpoint_dir = configs['io']['results_folder']

    filename_no_ext = os.path.abspath(os.path.normpath(os.path.join(checkpoint_dir, neural_network_name)))
    training_log_file_path = os.path.abspath(os.path.normpath(os.path.join(checkpoint_dir, training_log_file)))

    # images are expected to follow the TensorFlow (channels_last) convention:
    # [batch, width, height, channels]; for the Theano convention
    # [batch, channels, width, height] use reshape_images_to_theano instead
    # x_train = reshape_images_to_theano(x_train)
    # x_val = reshape_images_to_theano(x_val)

    assert x_train.shape[1] == x_val.shape[1]
    assert x_train.shape[2] == x_val.shape[2]
    assert x_train.shape[3] == x_val.shape[3]

    # channels_last convention: the channel axis is the last one
    img_width = x_train.shape[1]
    img_height = x_train.shape[2]
    img_channels = x_train.shape[3]

    logger.info('Loading datasets.')
    logger.debug('x_train shape: {0}'.format(x_train.shape))
    logger.debug('y_train shape: {0}'.format(y_train.shape))
    logger.debug('x_val shape: {0}'.format(x_val.shape))
    logger.debug('y_val shape: {0}'.format(y_val.shape))
    logger.debug('Training samples: {0}'.format(x_train.shape[0]))
    logger.debug('Validation samples: {0}'.format(x_val.shape[0]))
    logger.debug("Img width: {}".format(img_width))
    logger.debug("Img height: {}".format(img_height))
    logger.debug("Img channels: {}".format(img_channels))

    x_train = x_train.astype('float32')
    x_val = x_val.astype('float32')

    # normalize each image separately to the [0, 1] range
    if normalize:
        for idx in range(x_train.shape[0]):
            x_train[idx, :, :, :] = (x_train[idx, :, :, :] - np.amin(x_train[idx, :, :, :])) / (
                np.amax(x_train[idx, :, :, :]) - np.amin(x_train[idx, :, :, :]))
        for idx in range(x_val.shape[0]):
            x_val[idx, :, :, :] = (x_val[idx, :, :, :] - np.amin(x_val[idx, :, :, :])) / (
                np.amax(x_val[idx, :, :, :]) - np.amin(x_val[idx, :, :, :]))

    # check if the images are already normalized
    logger.info('Maximum value in x_train for the 1st image (to check normalization): {0}'.format(
        np.amax(x_train[0, :, :, :])))
    logger.info('Maximum value in x_val for the 1st image (to check normalization): {0}'.format(
        np.amax(x_val[0, :, :, :])))

    # convert class vectors to binary class matrices (one-hot encoding)
    nb_classes = len(set(y_train))
    nb_classes_val = len(set(y_val))
    if nb_classes_val != nb_classes:
        raise ValueError("Different number of unique classes in training and validation set: {} vs {}. "
                         "Training set unique classes: {}. "
                         "Validation set unique classes: {}.".format(nb_classes, nb_classes_val,
                                                                     set(y_train), set(y_val)))

    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_val = np_utils.to_categorical(y_val, nb_classes)

    logger.info('Loading and formatting of data completed.')

    # build the Keras model from the user-provided architecture
    model = partial_model_architecture(n_rows=img_width, n_columns=img_height, img_channels=img_channels,
                                       nb_classes=nb_classes)
    model.summary()

    # serialize model to JSON
    model_json = model.to_json()
    with open(filename_no_ext + ".json", "w") as json_file:
        json_file.write(model_json)

    callbacks = []
    csv_logger = CSVLogger(training_log_file_path, separator=',', append=False)
    save_model_per_epoch = ModelCheckpoint(filename_no_ext + ".h5", monitor='val_acc', verbose=1,
                                           save_best_only=True, mode='max', period=1)
    callbacks.append(csv_logger)
    callbacks.append(save_model_per_epoch)

    # use the notebook-friendly progress bar if running in a notebook
    if configs['runtime']['isBeaker']:
        callbacks.append(TQDMNotebookCallback(leave_inner=True, leave_outer=True))
    else:
        callbacks.append(TQDMCallback(leave_inner=True, leave_outer=True))

    if early_stopping:
        callbacks.append(EarlyStopping(monitor='val_loss', min_delta=0.001, patience=1, verbose=0, mode='auto'))

    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    if not data_augmentation:
        model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_data=(x_val, y_val),
                  shuffle=True, verbose=0, callbacks=callbacks)
    else:
        logger.info('Using real-time data augmentation.')

        # this will do preprocessing and real-time data augmentation
        datagen = ImageDataGenerator(featurewise_center=False,  # set input mean to 0 over the dataset
                                     samplewise_center=False,  # set each sample mean to 0
                                     featurewise_std_normalization=False,  # divide inputs by std of the dataset
                                     samplewise_std_normalization=False,  # divide each input by its std
                                     zca_whitening=False,  # apply ZCA whitening
                                     rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
                                     shear_range=0.0,  # shear intensity in radians (disabled here)
                                     zoom_range=0.1,  # randomly zoom images by up to 10%
                                     width_shift_range=4.0,  # randomly shift images horizontally
                                     height_shift_range=4.0,  # randomly shift images vertically
                                     horizontal_flip=False,  # randomly flip images horizontally
                                     vertical_flip=False)  # randomly flip images vertically

        # datagen.fit(x_train) computes the quantities required for featurewise normalization
        # (std, mean, and principal components for ZCA whitening); it is only needed if
        # featurewise_center, featurewise_std_normalization or zca_whitening is enabled
        # datagen.fit(x_train)

        # fit the model on the batches generated by datagen.flow()
        model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size),
                            samples_per_epoch=x_train.shape[0], nb_epoch=nb_epoch,
                            validation_data=(x_val, y_val), callbacks=callbacks, verbose=0)

    # serialize weights to HDF5
    # model.save(filename_no_ext + ".h5")
    # logger.info("Model saved to disk.")
    # logger.info("Filename: {0}".format(filename_no_ext))

    del model


def predict(x, y, configs, numerical_labels, text_labels, nb_classes=3, results_file=None, model=None, batch_size=32,
            conf_matrix_file=None, verbose=1, with_uncertainty=True, mc_samples=50, consider_memory=True,
            max_length=1e6):
    """Classify the input images with a trained Keras model, optionally estimating the prediction
    uncertainty via Monte Carlo dropout (see ``predict_with_uncertainty``)."""
    uncertainty = None

    if results_file is None:
        results_file = configs['io']['results_file']

    if conf_matrix_file is None:
        conf_matrix_file = configs['io']['conf_matrix_file']

    # convert class vectors to binary class matrices (one-hot encoding)
    y = np_utils.to_categorical(y, nb_classes)

    # get the input shape from the Keras model
    # the first element in the list is the batch size (not defined yet, it is a placeholder)
    # we keep all the indices after the first one to correctly reshape our array
    input_shape_from_model = model.layers[0].get_input_at(0).get_shape().as_list()[1:]

    # add -1 so that we can reshape properly for any number of images (or sequences) in the dataset
    target_shape = tuple([-1] + input_shape_from_model)
    x = reshape_images(x, target_shape=target_shape)
    x = x.astype('float32')

    logger.info('Loading test dataset for prediction.')
    logger.debug('x_test shape: {0}'.format(x.shape))
    logger.debug('Test samples: {0}'.format(x.shape[0]))
    logger.info('Loading and formatting of data completed.')

    if configs['runtime']['log_level_general'] == "DEBUG":
        model.summary()

    logger.info('Predicting...')

    if with_uncertainty:
        if consider_memory:
            logger.info("Considering memory limits: using multiple passes to obtain principled probability "
                        "and uncertainty estimates")
            uncertainty = defaultdict(list)
            prob_predictions = np.array([])
            # total number of forward passes: mc_samples stochastic passes for each of the x.shape[0] samples
            current_size = mc_samples * x.shape[0]
            if current_size >= max_length:
                split_size = 1
                # increase the number of splits until we are safe from running into memory problems
                while (current_size / split_size) >= max_length:
                    split_size += 1
                x_splitted = np.array_split(x, split_size)
                logger.info("Had to split input of shape {} into {} subarrays".format(x.shape, split_size))
                for data, split in zip(x_splitted, np.arange(len(x_splitted)) + 1):
                    logger.info("Split # {} / {}".format(split, split_size))
                    prediction_, uncertainty_ = predict_with_uncertainty(data, model=model, n_iter=mc_samples)
                    if split == 1:
                        prob_predictions = prediction_
                    else:
                        prob_predictions = np.concatenate((prob_predictions, prediction_), axis=0)
                    for key_ in uncertainty_:
                        uncertainty[key_].extend(uncertainty_[key_])
            else:
                logger.info("Using multiple passes to obtain principled probability and uncertainty estimates")
                prob_predictions, uncertainty = predict_with_uncertainty(x, model, n_iter=mc_samples)
        else:
            logger.info("Using multiple passes to obtain principled probability and uncertainty estimates")
            prob_predictions, uncertainty = predict_with_uncertainty(x, model, n_iter=mc_samples)
    else:
        logger.info("Using only a single pass. No uncertainty estimation.")
        prob_predictions = model.predict(x, batch_size=batch_size, verbose=verbose)

    # get the argmax to have the class label from the probabilities
    y_pred = prob_predictions.argmax(axis=-1)

    logger.info("Accuracy: {}%".format(accuracy_score(np.argmax(y, axis=1), y_pred) * 100.))

    conf_matrix = confusion_matrix(np.argmax(y, axis=1), y_pred)
    np.set_printoptions(precision=2)
    logger.info('Confusion matrix, without normalization: ')
    logger.info(conf_matrix)

    # predictions are an (n, m) array where n is the number of samples and m the number of classes
    target_pred_class = np.argmax(prob_predictions, axis=1).tolist()

    # create a dataframe with the results
    df_cols = ['target_pred_class']
    for idx in range(prob_predictions.shape[1]):
        df_cols.append('prob_predictions_' + str(idx))
    df_cols.append('num_labels')
    df_cols.append('class_labels')

    data = np.column_stack((target_pred_class, prob_predictions, numerical_labels, text_labels))

    # add one column per uncertainty measure and write the dataframe to file
    if uncertainty is not None:
        for key, value in uncertainty.items():
            data = np.column_stack((data, value))
            df_cols.append('uncertainty_' + str(key))

    df_results = pd.DataFrame(data=data, columns=df_cols)
    df_results.to_csv(results_file, index=False)
    logger.info("Predictions written to: {}".format(results_file))

    text_labels = text_labels.tolist()
    unique_class_labels = sorted(list(set(text_labels)))

    # plot_confusion_matrix(conf_matrix, conf_matrix_file=conf_matrix_file, classes=unique_class_labels,
    #                       normalize=False, title='Confusion matrix')
    # logger.info("Confusion matrix written to {}.".format(conf_matrix_file))

    # transform the probabilities into a list of strings to be used by the viewer
    string_probs = [str(['p' + str(i) + ':{0:.4f} '.format(item[i]) for i in range(nb_classes)])
                    for item in prob_predictions]

    # insert new line if string too long
    # for item in target_pred_probs:
    #     item = insert_newlines(item, every=10)

    results = dict(target_pred_class=target_pred_class, prob_predictions=prob_predictions,
                   confusion_matrix=conf_matrix, string_probs=string_probs, uncertainty=uncertainty)

    return results


def predict_with_uncertainty(data, model, model_type='classification', n_iter=1000):
    """Calculate the uncertainty of a neural network model using dropout.

    This follows Chap. 3 in Yarin Gal's PhD thesis: http://mlg.eng.cam.ac.uk/yarin/thesis/thesis.pdf

    We calculate the uncertainty of the neural network predictions in the three ways proposed in Gal's PhD thesis,
    as presented at pages 51-54:
    - variation_ratio: defined in Eq. 3.19
    - predictive_entropy: defined in Eq. 3.20
    - mutual_information: defined at page 53 (no Eq. number)

    .. codeauthor:: Angelo Ziletti <angelo.ziletti@gmail.com>
    .. codeauthor:: Andreas Leitherer <andreas.leitherer@gmail.com>

    """
    logger.info("Calculating classification uncertainty.")

    # faster implementation: repeat the data n_iter times and perform a single prediction call
    repeated_data = np.concatenate([data for _ in range(n_iter)])
    result = model.predict(repeated_data, verbose=0)
    results = np.array(np.array_split(result, n_iter))
    labels = results.argmax(axis=-1)

    # previous implementation: run through each forward pass iteratively
    """
    labels = []
    results = []
    for idx_iter in range(n_iter):
        if (idx_iter % (int(n_iter) / 10 + 1)) == 0:
            logger.info("Performing forward pass: {0}/{1}".format(idx_iter + 1, n_iter))

        result = model.predict(data, verbose=0)
        label = result.argmax(axis=-1)

        labels.append(label)
        results.append(result)
    """

    results = np.asarray(results)
    # less memory intensive alternative:
    # if len(results) > 25000 * 1000:
    #     dtype_ = np.float16
    # else:
    #     dtype_ = None
    # results = np.asarray(results, dtype=dtype_)

    prediction = results.mean(axis=0)

    if model_type == 'regression':
        predictive_variance = results.var(axis=0)
        uncertainty = dict(predictive_variance=predictive_variance)

    elif model_type == 'classification':
        # variation ratio
        mode, mode_count = stats.mode(np.asarray(labels), keepdims=True)
        variation_ratio = np.transpose(1. - mode_count.mean(axis=0) / float(n_iter))

        # predictive entropy
        # clip values to 1e-12 to avoid divergence in the log
        prediction = np.clip(prediction, a_min=1e-12, a_max=None, out=prediction)
        log_p_class = np.log2(prediction)
        entropy_all_iteration = - np.multiply(prediction, log_p_class)
        predictive_entropy = np.sum(entropy_all_iteration, axis=1)

        # mutual information
        # clip values to 1e-12 to avoid divergence in the log
        results = np.clip(results, a_min=1e-12, a_max=None, out=results)
        p_log_p_all = np.multiply(np.log2(results), results)
        exp_p_omega = np.sum(np.sum(p_log_p_all, axis=0), axis=1)
        mutual_information = predictive_entropy + 1. / float(n_iter) * exp_p_omega

        uncertainty = dict(variation_ratio=variation_ratio, predictive_entropy=predictive_entropy,
                           mutual_information=mutual_information)
    else:
        raise ValueError("Supported model types are 'classification' or 'regression'. "
                         "model_type={} is not accepted.".format(model_type))

    return prediction, uncertainty
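

# ---------------------------------------------------------------------------
# Small self-contained sketch, not part of the original ai4materials module:
# how the three uncertainty measures above behave, using hand-made "Monte Carlo"
# class probabilities instead of an actual dropout network. All numbers are made
# up for illustration.
def _example_uncertainty_measures():
    # 4 stochastic forward passes, 2 samples, 3 classes
    results = np.array([[[0.7, 0.2, 0.1], [0.4, 0.3, 0.3]],
                        [[0.8, 0.1, 0.1], [0.3, 0.4, 0.3]],
                        [[0.6, 0.3, 0.1], [0.2, 0.4, 0.4]],
                        [[0.7, 0.2, 0.1], [0.3, 0.3, 0.4]]])
    n_iter = results.shape[0]
    labels = results.argmax(axis=-1)

    # averaged prediction over the stochastic passes
    prediction = results.mean(axis=0)

    # variation ratio (Eq. 3.19 in Gal's thesis): fraction of passes that disagree with the modal class
    _, mode_count = stats.mode(labels, keepdims=True)
    variation_ratio = 1. - mode_count[0] / float(n_iter)

    # predictive entropy (Eq. 3.20): entropy of the averaged class probabilities
    predictive_entropy = -np.sum(prediction * np.log2(prediction), axis=1)

    # mutual information: predictive entropy minus the average per-pass entropy
    exp_p_omega = np.sum(np.sum(results * np.log2(results), axis=0), axis=1)
    mutual_information = predictive_entropy + exp_p_omega / float(n_iter)

    # the first sample is consistently assigned to class 0 (low uncertainty),
    # while the second sample flips between classes (higher uncertainty)
    return variation_ratio, predictive_entropy, mutual_information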


def reshape_images(images, target_shape):
    """Reshape images according to the target shape."""
    images = np.reshape(images, target_shape)
    logger.debug("Reshaping to {}".format(target_shape))
    return images


def normalize_images(images):
    """Normalize each image in the batch separately to the [0, 1] range."""
    if images.ndim == 4:
        # two-dimensional images: [batch, width, height, channels]
        for idx in range(images.shape[0]):
            images[idx, :, :, :] = (images[idx, :, :, :] - np.amin(images[idx, :, :, :])) / (
                np.amax(images[idx, :, :, :]) - np.amin(images[idx, :, :, :]))
    else:
        # images with an additional spatial dimension: [batch, dim_1, dim_2, dim_3, channels]
        for idx in range(images.shape[0]):
            images[idx, :, :, :, :] = (images[idx, :, :, :, :] - np.amin(images[idx, :, :, :, :])) / (
                np.amax(images[idx, :, :, :, :]) - np.amin(images[idx, :, :, :, :]))

    return images