# coding=utf-8
# Copyright 2016-2018 Angelo Ziletti
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
__author__ = "Angelo Ziletti"
__copyright__ = "Copyright 2018, Angelo Ziletti"
__maintainer__ = "Angelo Ziletti"
__email__ = "ziletti@fhi-berlin.mpg.de"
__date__ = "23/09/18"
from keras import backend as K
K.set_image_data_format("channels_last")
from keras.callbacks import CSVLogger
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.preprocessing.image import ImageDataGenerator
from ai4materials.utils.utils_plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from scipy import stats
from keras.optimizers import Adam
from keras.utils import np_utils
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
import logging
import numpy as np
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from collections import defaultdict
logger = logging.getLogger('ai4materials')
def train_neural_network(x_train, y_train, x_val, y_val, configs, partial_model_architecture, batch_size=32, nb_epoch=5,
                         normalize=True, checkpoint_dir=None, neural_network_name='my_neural_network',
                         training_log_file='training.log', early_stopping=False, data_augmentation=True):
    """Train a neural network to classify crystal structures represented as two-dimensional diffraction fingerprints.

    This model was introduced in [1]_.

    Parameters:

    x_train: np.array, [batch, width, height, channels]
        Training images.

    y_train: np.array
        Integer class labels for ``x_train``.

    x_val, y_val: np.array
        Validation images and labels; per-image shape must match the training set.

    configs: dict
        ai4materials configuration dictionary. ``configs['io']['results_folder']`` is the
        default checkpoint directory; ``configs['runtime']['isBeaker']`` selects the
        notebook vs console progress-bar callback.

    partial_model_architecture: callable
        Builds the Keras model given ``(n_rows, n_columns, img_channels, nb_classes)``.

    batch_size: int, optional (default 32)

    nb_epoch: int, optional (default 5)

    normalize: bool, optional (default True)
        Min/max-normalize every image independently to [0, 1].

    checkpoint_dir: str, optional
        Where the model JSON/weights and the training log are written.

    early_stopping: bool, optional (default False)
        Stop training when the validation loss stops improving.

    data_augmentation: bool, optional (default True)
        Train with real-time augmented batches from ``ImageDataGenerator``.

    .. [1] A. Ziletti, A. Leitherer, M. Scheffler, and L. M. Ghiringhelli,
        "Crystal-structure identification via Bayesian deep learning: towards superhuman performance",
        in preparation (2018)

    .. codeauthor:: Angelo Ziletti <angelo.ziletti@gmail.com>
    """
    if checkpoint_dir is None:
        checkpoint_dir = configs['io']['results_folder']

    filename_no_ext = os.path.abspath(os.path.normpath(os.path.join(checkpoint_dir, neural_network_name)))
    training_log_file_path = os.path.abspath(os.path.normpath(os.path.join(checkpoint_dir, training_log_file)))

    # All per-image axes must agree between training and validation sets.
    assert x_train.shape[1] == x_val.shape[1]
    assert x_train.shape[2] == x_val.shape[2]
    assert x_train.shape[3] == x_val.shape[3]

    # NOTE(review): the backend is set to 'channels_last' at module import, so with the
    # documented [batch, width, height, channels] layout axis 1 is the width, not the
    # channels. The three values are passed through to partial_model_architecture
    # consistently, so training still works, but the variable names (and the debug
    # messages below) may be misleading -- confirm against the model builders.
    img_channels = x_train.shape[1]
    img_width = x_train.shape[2]
    img_height = x_train.shape[3]

    logger.info('Loading datasets.')
    logger.debug('x_train shape: {0}'.format(x_train.shape))
    logger.debug('y_train shape: {0}'.format(y_train.shape))
    logger.debug('x_val shape: {0}'.format(x_val.shape))
    logger.debug('y_val shape: {0}'.format(y_val.shape))
    logger.debug('Training samples: {0}'.format(x_train.shape[0]))
    logger.debug('Validation samples: {0}'.format(x_val.shape[0]))
    logger.debug("Img channels: {}".format(x_train.shape[1]))
    logger.debug("Img width: {}".format(x_train.shape[2]))
    logger.debug("Img height: {}".format(x_train.shape[3]))

    x_train = x_train.astype('float32')
    x_val = x_val.astype('float32')

    # min/max-normalize each image separately so each image spans [0, 1]
    if normalize:
        for idx in range(x_train.shape[0]):
            x_train[idx, :, :, :] = (x_train[idx, :, :, :] - np.amin(x_train[idx, :, :, :])) / (
                np.amax(x_train[idx, :, :, :]) - np.amin(x_train[idx, :, :, :]))
        for idx in range(x_val.shape[0]):
            x_val[idx, :, :, :] = (x_val[idx, :, :, :] - np.amin(x_val[idx, :, :, :])) / (
                np.amax(x_val[idx, :, :, :]) - np.amin(x_val[idx, :, :, :]))

    # sanity check: after normalization the max of each image should be 1.0
    logger.info(
        'Maximum value in x_train for the 1st image (to check normalization): {0}'.format(np.amax(x_train[0, :, :, :])))
    logger.info(
        'Maximum value in x_val for the 1st image (to check normalization): {0}'.format(np.amax(x_val[0, :, :, :])))

    # convert class vectors to binary class matrices (one-hot encoding);
    # training and validation sets must contain the same number of classes
    nb_classes = len(set(y_train))
    nb_classes_val = len(set(y_val))
    if nb_classes_val != nb_classes:
        raise ValueError("Different number of unique classes in training and validation set: {} vs {}."
                         "Training set unique classes: {}"
                         "Validation set unique classes: {}".format(nb_classes, nb_classes_val, set(y_train),
                                                                    set(y_val)))
    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_val = np_utils.to_categorical(y_val, nb_classes)
    logger.info('Loading and formatting of data completed.')

    # build the Keras model from the user-supplied architecture factory
    model = partial_model_architecture(n_rows=img_width, n_columns=img_height, img_channels=img_channels,
                                       nb_classes=nb_classes)
    model.summary()

    # serialize the model architecture to JSON next to the checkpoints
    model_json = model.to_json()
    with open(filename_no_ext + ".json", "w") as json_file:
        json_file.write(model_json)

    callbacks = []
    csv_logger = CSVLogger(training_log_file_path, separator=',', append=False)
    # keep only the best model (by validation accuracy) on disk
    save_model_per_epoch = ModelCheckpoint(filename_no_ext + ".h5", monitor='val_acc', verbose=1, save_best_only=True,
                                           mode='max', period=1)
    callbacks.append(csv_logger)
    callbacks.append(save_model_per_epoch)

    # progress bar: notebook-flavoured when running in Beaker, console otherwise
    if configs['runtime']['isBeaker']:
        callbacks.append(TQDMNotebookCallback(leave_inner=True, leave_outer=True))
    else:
        callbacks.append(TQDMCallback(leave_inner=True, leave_outer=True))

    if early_stopping:
        # BUGFIX: the EarlyStopping callback was previously created but never added to
        # `callbacks`, so early_stopping=True silently had no effect.
        callbacks.append(
            EarlyStopping(monitor='val_loss', min_delta=0.001, patience=1, verbose=0, mode='auto'))

    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    if not data_augmentation:
        model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_data=(x_val, y_val),
                  shuffle=True, verbose=0, callbacks=callbacks)
    else:
        logger.info('Using real-time data augmentation.')
        # real-time data augmentation: only small zooms and shifts are enabled;
        # featurewise/ZCA options are all off, so datagen.fit(x_train) is not required
        datagen = ImageDataGenerator(featurewise_center=False,  # set input mean to 0 over the dataset
                                     samplewise_center=False,  # set each sample mean to 0
                                     featurewise_std_normalization=False,  # divide inputs by std of the dataset
                                     samplewise_std_normalization=False,  # divide each input by its std
                                     zca_whitening=False,  # apply ZCA whitening
                                     rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
                                     shear_range=0.0,  # value in radians
                                     zoom_range=0.1,  # random zoom
                                     width_shift_range=4.0,  # randomly shift images horizontally
                                     height_shift_range=4.0,  # randomly shift images vertically
                                     horizontal_flip=False,  # randomly flip images
                                     vertical_flip=False)  # randomly flip images
        # fit on batches generated in real time by datagen.flow()
        model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size),
                            samples_per_epoch=x_train.shape[0], nb_epoch=nb_epoch,
                            validation_data=(x_val, y_val), callbacks=callbacks, verbose=0)

    # the best model (by val_acc) has already been written to disk by ModelCheckpoint
    del model
def predict(x, y, configs, numerical_labels, text_labels, nb_classes=3, results_file=None, model=None, batch_size=32,
            conf_matrix_file=None, verbose=1, with_uncertainty=True, mc_samples=50,
            consider_memory=True, max_length=1e6):
    """Predict class labels (and optionally Bayesian uncertainties) for a dataset.

    Parameters:

    x: np.array
        Input samples; reshaped to match the model's expected input shape.

    y: np.array
        Integer ground-truth labels (used to report accuracy and the confusion matrix).

    configs: dict
        ai4materials configuration; supplies default output paths and the log level.

    numerical_labels, text_labels:
        Per-sample numeric and textual labels, written to the results CSV.

    nb_classes: int, optional (default 3)
        Number of classes used for one-hot encoding `y` and for the probability strings.

    model: Keras model
        Trained model used for prediction. Required.

    with_uncertainty: bool, optional (default True)
        Use `mc_samples` stochastic forward passes (MC dropout) to estimate uncertainty.

    consider_memory: bool, optional (default True)
        Split the input into chunks so that ``mc_samples * n_samples`` stays below
        `max_length` during the repeated forward passes.

    Returns:

    dict
        Keys: ``target_pred_class``, ``prob_predictions``, ``confusion_matrix``,
        ``string_probs``, ``uncertainty`` (None when `with_uncertainty` is False).
    """
    if model is None:
        # fail early with a clear message instead of an AttributeError below
        raise ValueError("A trained Keras model must be passed via the 'model' argument.")

    uncertainty = None
    if results_file is None:
        results_file = configs['io']['results_file']
    if conf_matrix_file is None:
        conf_matrix_file = configs['io']['conf_matrix_file']

    # convert class vectors to binary class matrices - one-hot encoding
    y = np_utils.to_categorical(y, nb_classes)

    # get the input shape from the Keras model; the first element is the (undefined)
    # batch-size placeholder, so keep everything after it and prepend -1 so any
    # number of samples can be reshaped
    input_shape_from_model = model.layers[0].get_input_at(0).get_shape().as_list()[1:]
    target_shape = tuple([-1] + input_shape_from_model)
    x = reshape_images(x, target_shape=target_shape)
    x = x.astype('float32')

    logger.info('Loading test dataset for prediction.')
    logger.debug('x_test shape: {0}'.format(x.shape))
    logger.debug('Test samples: {0}'.format(x.shape[0]))
    logger.info('Loading and formatting of data completed.')

    if configs['runtime']['log_level_general'] == "DEBUG":
        model.summary()

    logger.info('Predicting...')
    if with_uncertainty:
        if consider_memory:
            logger.info("While considering memory limits: Using multiple passes to have principles probability and uncertainty estimates")
            current_size = mc_samples * x.shape[0]
            if current_size >= max_length:
                # split the input so that each chunk, repeated mc_samples times,
                # stays below max_length elements
                n_splits = 1
                while current_size / n_splits >= max_length:
                    n_splits += 1
                x_splitted = np.array_split(x, n_splits)
                logger.info("Had to split input of shape {} into {} subarrays".format(x.shape, n_splits))
                uncertainty = defaultdict(list)
                prob_predictions = np.array([])
                for split, chunk in enumerate(x_splitted, start=1):
                    logger.info("Split # {} / {}".format(split, n_splits))
                    prediction_, uncertainty_ = predict_with_uncertainty(chunk, model=model, n_iter=mc_samples)
                    if split == 1:
                        prob_predictions = prediction_
                    else:
                        prob_predictions = np.concatenate((prob_predictions, prediction_), axis=0)
                    for key_ in uncertainty_:
                        uncertainty[key_].extend(uncertainty_[key_])
            else:
                # everything fits in memory: a single multi-pass run
                logger.info("Using multiple passes to have principles probability and uncertainty estimates")
                prob_predictions, uncertainty = predict_with_uncertainty(x, model, n_iter=mc_samples)
        else:
            logger.info("Using multiple passes to have principles probability and uncertainty estimates")
            prob_predictions, uncertainty = predict_with_uncertainty(x, model, n_iter=mc_samples)
    else:
        logger.info("Using only a single pass. No uncertainty estimation.")
        prob_predictions = model.predict(x, batch_size=batch_size, verbose=verbose)

    # argmax over the class axis gives the predicted class label
    y_pred = prob_predictions.argmax(axis=-1)
    logger.info("Accuracy: {}%".format(accuracy_score(np.argmax(y, axis=1), y_pred) * 100.))

    conf_matrix = confusion_matrix(np.argmax(y, axis=1), y_pred)
    np.set_printoptions(precision=2)
    logger.info('Confusion matrix, without normalization: ')
    logger.info(conf_matrix)

    target_pred_class = np.argmax(prob_predictions, axis=1).tolist()

    # prob_predictions is (n_samples, n_classes); build the CSV columns:
    # predicted class, one probability column per class, then the labels
    df_cols = ['target_pred_class']
    for idx in range(prob_predictions.shape[1]):
        df_cols.append('prob_predictions_' + str(idx))
    df_cols.append('num_labels')
    df_cols.append('class_labels')

    data = np.column_stack((target_pred_class, prob_predictions, numerical_labels, text_labels))

    # append one column per uncertainty measure, when available
    if uncertainty is not None:
        for key, value in uncertainty.items():
            data = np.column_stack((data, value))
            df_cols.append('uncertainty_' + str(key))

    df_results = pd.DataFrame(data=data, columns=df_cols)
    df_results.to_csv(results_file, index=False)
    logger.info("Predictions written to: {}".format(results_file))

    # NOTE(review): plotting of the confusion matrix to conf_matrix_file was disabled
    # in the original code (plot_confusion_matrix call commented out).

    # human-readable per-class probability strings, to be used by the viewer
    string_probs = [str(['p' + str(i) + ':{0:.4f} '.format(item[i]) for i in range(nb_classes)]) for item in
                    prob_predictions]

    results = dict(target_pred_class=target_pred_class, prob_predictions=prob_predictions, confusion_matrix=conf_matrix,
                   string_probs=string_probs, uncertainty=uncertainty)
    return results
def predict_with_uncertainty(data, model, model_type='classification', n_iter=1000):
    """Calculate predictions and their uncertainty using Monte-Carlo dropout.

    This follows Chap. 3 in Yarin Gal's PhD thesis:
    http://mlg.eng.cam.ac.uk/yarin/thesis/thesis.pdf

    For classification, the uncertainty is quantified in the three ways proposed
    at pag. 51-54 of the thesis:
    - variation_ratio: defined in Eq. 3.19
    - predictive_entropy: defined in Eq. 3.20
    - mutual_information: defined at pag. 53 (no Eq. number)

    For regression, the predictive variance over the stochastic passes is returned.

    Parameters:

    data: np.array
        Input samples (first axis indexes samples).

    model: Keras model
        Model to evaluate; dropout must be active at prediction time for the
        uncertainty estimates to be meaningful.

    model_type: str, optional (default 'classification')
        Either 'classification' or 'regression'.

    n_iter: int, optional (default 1000)
        Number of stochastic forward passes.

    Returns:

    (prediction, uncertainty): (np.array, dict)
        Mean prediction over the passes and a dict of uncertainty measures.

    Raises:

    ValueError
        If `model_type` is neither 'classification' nor 'regression'.

    .. codeauthors:: Angelo Ziletti <angelo.ziletti@gmail.com>, Andreas Leitherer <andreas.leitherer@gmail.com>
    """
    logger.info("Calculating classification uncertainty.")

    # single big forward pass over the data tiled n_iter times: faster than
    # calling model.predict n_iter times, at the cost of n_iter-fold memory
    repeated_data = np.concatenate([data for _ in range(n_iter)])
    result = model.predict(repeated_data, verbose=0)
    # results: (n_iter, n_samples, n_outputs); labels: argmax class per pass
    results = np.asarray(np.array_split(result, n_iter))
    labels = results.argmax(axis=-1)

    # mean prediction over the stochastic passes
    prediction = results.mean(axis=0)

    if model_type == 'regression':
        predictive_variance = results.var(axis=0)
        uncertainty = dict(predictive_variance=predictive_variance)
    elif model_type == 'classification':
        # variation ratio (Eq. 3.19): 1 - fraction of passes agreeing with the mode
        mode, mode_count = stats.mode(np.asarray(labels), keepdims=True)
        variation_ratio = np.transpose(1. - mode_count.mean(axis=0) / float(n_iter))

        # predictive entropy (Eq. 3.20); clip to 1e-12 to avoid divergence in the log
        prediction = np.clip(prediction, a_min=1e-12, a_max=None, out=prediction)
        log_p_class = np.log2(prediction)
        entropy_all_iteration = - np.multiply(prediction, log_p_class)
        predictive_entropy = np.sum(entropy_all_iteration, axis=1)

        # mutual information (pag. 53); clip to 1e-12 to avoid divergence in the log
        results = np.clip(results, a_min=1e-12, a_max=None, out=results)
        p_log_p_all = np.multiply(np.log2(results), results)
        exp_p_omega = np.sum(np.sum(p_log_p_all, axis=0), axis=1)
        mutual_information = predictive_entropy + 1. / float(n_iter) * exp_p_omega

        uncertainty = dict(variation_ratio=variation_ratio, predictive_entropy=predictive_entropy,
                           mutual_information=mutual_information)
    else:
        raise ValueError("Supported model types are 'classification' or 'regression'."
                         "model_type={} is not accepted.".format(model_type))
    return prediction, uncertainty
def reshape_images(images, target_shape):
    """Reshape `images` to `target_shape`.

    Parameters:

    images: np.array
        Array to reshape.

    target_shape: tuple
        Target shape; may contain -1 in one dimension (inferred by NumPy).

    Returns:

    np.array
        The reshaped array (a view of the input when possible).
    """
    images = np.reshape(images, target_shape)
    logger.debug("Reshaping to {}".format(target_shape))
    return images
def normalize_images(images):
    """Min/max-normalize each sample of a batch in place so it spans [0, 1].

    Works for any batch array whose first axis indexes samples (e.g. 4D image
    batches or 5D image-sequence batches): each sample is shifted by its own
    minimum and scaled by its own range.

    Parameters:

    images: np.array
        Batch array; modified in place and also returned. A constant-valued
        sample leads to a division by zero (unchanged from the original code).

    Returns:

    np.array
        The normalized batch (same object as the input).
    """
    # BUGFIX: the original tested `len(input_dims) == 2` where input_dims was always
    # a 2-tuple, so the second branch was unreachable. Both branches computed the
    # same per-sample normalization (partial indexing images[idx, :, :, :] selects
    # the whole sample for 4D and 5D arrays alike), so one loop preserves behavior.
    for idx in range(images.shape[0]):
        sample_min = np.amin(images[idx])
        sample_max = np.amax(images[idx])
        images[idx] = (images[idx] - sample_min) / (sample_max - sample_min)
    return images